## Evaluators

This notebook tests the ergonomics of `LLMEvaluator` and `run_evals`.

In [None]:
import nest_asyncio
import pandas as pd
from phoenix.evals import HallucinationEvaluator, OpenAIModel, RelevanceEvaluator, run_evals

# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a loop
# and the evals run async code under the hood — confirm against phoenix.evals.
nest_asyncio.apply()

In [None]:
import os
from getpass import getpass

import openai

# Prefer a key already present in the environment; prompt interactively (without
# echoing the input) only when the variable is unset or empty.
openai_api_key = os.getenv("OPENAI_API_KEY") or getpass("🔑 Enter your OpenAI API key: ")

# Publish the key both through the environment and on the openai module so that
# code reading either location picks it up.
os.environ["OPENAI_API_KEY"] = openai_api_key
openai.api_key = openai_api_key

In [None]:
# A single GPT-4 model instance is shared by both evaluators below.
model = OpenAIModel(model="gpt-4")
# One evaluator per metric; both are handed to run_evals together in later cells.
relevance_evaluator = RelevanceEvaluator(model=model)
hallucination_evaluator = HallucinationEvaluator(model=model)

In [None]:
# Two toy records that ask the same question. In the first, the reference text and
# the output are about California; in the second, both are about Nevada instead.
# Built column-by-column; row i is the i-th entry of each list.
dataframe = pd.DataFrame(
    {
        "input": [
            "What is the capital of California?",
            "What is the capital of California?",
        ],
        "reference": [
            "Sacramento is the capital of California.",
            "Carson City is the Capital of Nevada.",
        ],
        "output": [
            "Sacramento",
            "Carson City",
        ],
    }
)

With explanations, with function calling.

In [None]:
# Run both evaluators over the example rows, requesting explanations and allowing
# function calling when available. The result is indexed per evaluator in the
# cells that follow.
eval_dfs = run_evals(
    dataframe,
    [relevance_evaluator, hallucination_evaluator],
    provide_explanation=True,
    use_function_calling_if_available=True,
)

In [None]:
# First result of the most recent run_evals call — presumably the relevance
# evaluator's output, assuming results follow the evaluator list order (TODO confirm).
eval_dfs[0]

In [None]:
# Second result of the most recent run_evals call — presumably the hallucination
# evaluator's output, assuming results follow the evaluator list order (TODO confirm).
eval_dfs[1]

With explanations, without function calling.

In [None]:
# Same evaluators and data, still requesting explanations, but with function
# calling turned off. Rebinds eval_dfs, replacing any earlier run's results.
eval_dfs = run_evals(
    dataframe,
    [relevance_evaluator, hallucination_evaluator],
    provide_explanation=True,
    use_function_calling_if_available=False,
)

In [None]:
# First result of the most recent run_evals call — presumably the relevance
# evaluator's output, assuming results follow the evaluator list order (TODO confirm).
eval_dfs[0]

In [None]:
# Second result of the most recent run_evals call — presumably the hallucination
# evaluator's output, assuming results follow the evaluator list order (TODO confirm).
eval_dfs[1]

Without explanations, with function calling.

In [None]:
# Same evaluators and data, this time without explanations but allowing function
# calling when available. Rebinds eval_dfs, replacing any earlier run's results.
eval_dfs = run_evals(
    dataframe,
    [relevance_evaluator, hallucination_evaluator],
    provide_explanation=False,
    use_function_calling_if_available=True,
)

In [None]:
# First result of the most recent run_evals call — presumably the relevance
# evaluator's output, assuming results follow the evaluator list order (TODO confirm).
eval_dfs[0]

In [None]:
# Second result of the most recent run_evals call — presumably the hallucination
# evaluator's output, assuming results follow the evaluator list order (TODO confirm).
eval_dfs[1]

Without explanations, without function calling.

In [None]:
# Same evaluators and data with both options off: no explanations and no function
# calling. Rebinds eval_dfs, replacing any earlier run's results.
eval_dfs = run_evals(
    dataframe,
    [relevance_evaluator, hallucination_evaluator],
    provide_explanation=False,
    use_function_calling_if_available=False,
)

In [None]:
# First result of the most recent run_evals call — presumably the relevance
# evaluator's output, assuming results follow the evaluator list order (TODO confirm).
eval_dfs[0]

In [None]:
# Second result of the most recent run_evals call — presumably the hallucination
# evaluator's output, assuming results follow the evaluator list order (TODO confirm).
eval_dfs[1]