# Evaluations

This notebook shows how to pull traces from a running Phoenix instance and evaluate them using the `arize-phoenix-evals` library.

In [None]:
!pip install "arize-phoenix[evals]" openai nest_asyncio

In [None]:
# The evaluations further down run asynchronously; apply nest_asyncio so they
# can be run from inside the notebook.
# NOTE(review): presumably required because the notebook kernel already has a
# running event loop — confirm against the nest_asyncio documentation.
import nest_asyncio

nest_asyncio.apply()

In [None]:
import phoenix as px

# Address of the running Phoenix instance the traces are pulled from;
# change this if your instance is not served locally on port 6006.
PHOENIX_ENDPOINT = "http://localhost:6006"

client = px.Client(endpoint=PHOENIX_ENDPOINT)

In [None]:
# NOTE(review): datetime/timedelta are not used in any cell visible here —
# confirm whether they are still needed.
from datetime import datetime, timedelta
from phoenix.trace.dsl.helpers import get_qa_with_reference, get_retrieved_documents

# Fetch the two dataframes consumed by the evaluation cells below, using the
# Phoenix client created above:
#   qa_df        -> input to the QA and hallucination evaluators
#   documents_df -> input to the relevance evaluator
qa_df = get_qa_with_reference(client)
documents_df = get_retrieved_documents(client)

In [None]:
# Preview the first rows of the dataframe returned by get_qa_with_reference.
qa_df.head()

In [None]:
# Preview the first rows of the dataframe returned by get_retrieved_documents.
documents_df.head()

In [None]:
## Evaluate Retrieval

from phoenix.evals import OpenAIModel, RelevanceEvaluator, run_evals

# Build the relevance evaluator on top of an OpenAI model.
relevance_model = OpenAIModel(model="gpt-4-turbo-preview")
relevance_evaluator = RelevanceEvaluator(relevance_model)

# Run the evaluator over the retrieved-documents dataframe, asking for an
# explanation alongside each result.
retrieval_results = run_evals(
    dataframe=documents_df,
    evaluators=[relevance_evaluator],
    provide_explanation=True,
    concurrency=20,
)

# Only one evaluator was passed in, so keep the first (and only expected)
# entry of the result.
relevance_evals = retrieval_results[0]

In [None]:
# Preview the first rows of the relevance evaluation results.
relevance_evals.head()

In [None]:
## Evaluate Responses

from phoenix.evals import HallucinationEvaluator, OpenAIModel, QAEvaluator, run_evals

# Each evaluator gets its own OpenAI model instance.
qa_model = OpenAIModel(model="gpt-4-turbo-preview")
qa_evaluator = QAEvaluator(qa_model)

hallucination_model = OpenAIModel(model="gpt-4-turbo-preview")
hallucination_evaluator = HallucinationEvaluator(hallucination_model)

# The results are unpacked positionally below, so the order of this list
# determines which result is bound to which name.
response_evaluators = [qa_evaluator, hallucination_evaluator]

qa_evals, hallucination_evals = run_evals(
    dataframe=qa_df,
    evaluators=response_evaluators,
    provide_explanation=True,
    concurrency=20,
)

In [None]:
from phoenix.trace import DocumentEvaluations, SpanEvaluations

# Log the evaluations back to Phoenix through the client. The relevance
# results are wrapped as DocumentEvaluations; the QA and hallucination
# results are wrapped as SpanEvaluations.
relevance_log = DocumentEvaluations(dataframe=relevance_evals, eval_name="relevance")
qa_log = SpanEvaluations(dataframe=qa_evals, eval_name="qa")
hallucination_log = SpanEvaluations(dataframe=hallucination_evals, eval_name="hallucination")

client.log_evaluations(relevance_log, qa_log, hallucination_log)