<center>
    <p style="text-align:center">
        <img alt="phoenix logo" src="https://storage.googleapis.com/arize-phoenix-assets/assets/phoenix-logo-light.svg" width="200"/>
        <br>
        <a href="https://arize.com/docs/phoenix/">Docs</a>
        |
        <a href="https://github.com/Arize-ai/phoenix">GitHub</a>
        |
        <a href="https://arize-ai.slack.com/join/shared_invite/zt-2w57bhem8-hq24MB6u7yE_ZF_ilOYSBw#/shared-invite/email">Community</a>
    </p>
</center>
<h1 align="center">LLM as a Judge 102: Meta-Evaluation </h1>


This notebook is part of the Evals Best Practices Series, Episode 6: LLM as a Judge 102: Meta-Evaluation.
It walks through meta-evaluation: the process of evaluating your evaluator.

>
> ##### Note: This notebook was last updated on Dec 10, 2025. 
>

##### Install Dependencies and Import Libraries

In [None]:
%pip install -q arize-phoenix openai getpass

##### Initiate the Tracer Provider to Auto Instrument our Application

In [None]:
from phoenix.otel import register 
import os 
from openai import AsyncOpenAI
from getpass import getpass 

# Register the Phoenix tracer provider. auto_instrument=True is passed so that
# instrumentation is wired up without explicit instrumentor calls in this notebook.
tracer_provider = register(auto_instrument=True)

# Prefer a key that is already in the environment; otherwise prompt for it
# (getpass does not echo what is typed). The prompt text previously contained
# mis-decoded bytes in place of the key emoji.
if not (openai_api_key := os.getenv("OPENAI_API_KEY")):
    openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = openai_api_key

# Constructed without arguments, after OPENAI_API_KEY has been exported above
# (presumably the client reads the key from the environment).
openai_client = AsyncOpenAI()

## Step 1: Prepare your dataset

#### Import CSV data in

In [None]:
import pandas as pd 

# Read the TruthfulQA CSV (path is relative to the working directory) and
# report its dimensions and column names before previewing the first row.
full_df = pd.read_csv('TruthfulQA.csv')
full_shape, full_columns = full_df.shape, full_df.columns.tolist()
print(f"Dataset shape: {full_shape}")
print(f"\nColumns: {full_columns}")
full_df.head(1)

#### Take a random sample of 250 rows

In [None]:
# Fixed seed so the same 250 rows are drawn on every Restart & Run All;
# without it each run evaluates a different subset and metrics are not comparable.
SAMPLE_SEED = 42
df = full_df.sample(n=250, random_state=SAMPLE_SEED)
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head(1)

#### Create an 80/20 split for our Dev & Test sets

In [None]:
# The sampled rows are already in random order, so a positional cut gives a
# random partition: the first 80% is the dev set, the remainder is the test set.
split = int(.8 * len(df))
dev_df = df.iloc[:split]
test_df = df.iloc[split:]
# Label each shape so the two printed lines can be told apart.
print(f"Dev set shape: {dev_df.shape}")
print(f"Test set shape: {test_df.shape}")


#### Label the data with a 75/25 Correct/Incorrect split

In [None]:
import random

def correct_incorrect_split(df, correct_fraction=0.75, seed=None):
    """Return a copy of ``df`` with a randomly assigned 'Ground Truth' column.

    ``int(correct_fraction * len(df))`` rows are labelled 'Correct' and the
    rest 'Incorrect'; the labels are shuffled before being attached, so which
    rows receive which label is random. The input frame is not modified.

    Args:
        df: DataFrame to label.
        correct_fraction: Share of rows labelled 'Correct' (between 0 and 1).
            Defaults to 0.75.
        seed: Optional seed for a reproducible shuffle. When None, the
            module-level ``random`` state is used, so a prior
            ``random.seed(...)`` call still controls the result.

    Returns:
        A new DataFrame with the added 'Ground Truth' column.

    Raises:
        ValueError: If ``correct_fraction`` is outside [0, 1].
    """
    if not 0 <= correct_fraction <= 1:
        raise ValueError(f"correct_fraction must be between 0 and 1, got {correct_fraction}")

    df_copy = df.copy()
    n_total = len(df_copy)
    n_correct = int(correct_fraction * n_total)
    labels = ['Correct'] * n_correct + ['Incorrect'] * (n_total - n_correct)

    # A dedicated Random instance keeps a seeded call from touching global RNG state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(labels)

    df_copy['Ground Truth'] = labels
    return df_copy

def add_answer(df):
    """Label ``df`` and attach the answer text that matches each label.

    The frame is first labelled via ``correct_incorrect_split``. Each row's
    'Answer' is then taken from 'Correct Answers' when its 'Ground Truth' is
    'Correct', from 'Incorrect Answers' when it is 'Incorrect', and is None
    for any other label. The result is restricted to a fixed column order.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    labelled = correct_incorrect_split(df)

    def _pick_answer(row):
        # Choose the source column according to the row's assigned label.
        label = row['Ground Truth']
        if label == 'Correct':
            return row['Correct Answers']
        if label == 'Incorrect':
            return row['Incorrect Answers']
        return None

    labelled['Answer'] = labelled.apply(_pick_answer, axis=1)

    column_order = [
        'Type', 'Category', 'Question', 'Answer', 'Ground Truth',
        'Source', 'Correct Answers', 'Incorrect Answers', 'Best Answer',
    ]
    return labelled[column_order]

# The labels are shuffled with the module-level `random` state, so seed it
# here; otherwise the Correct/Incorrect assignment changes on every run.
random.seed(42)
test = add_answer(test_df)
dev = add_answer(dev_df)
test.head(1)

#### Send your Datasets to Phoenix

In [None]:
from phoenix.client import AsyncClient

client = AsyncClient()

test_dataset = await client.datasets.create_dataset(
    dataframe=test,
    name="test-dataset",
    input_keys=["Question", "Answer"],
    output_keys=["Ground Truth"]
)

dev_dataset = await client.datasets.create_dataset(
    dataframe=dev,
    name="dev-dataset",
    input_keys=["Question", "Answer"],
    output_keys=["Ground Truth"]
)

#### Create our experiment to run
##### Define your Task & evaluators 

In [None]:
from phoenix.evals import create_classifier
from phoenix.evals.llm import LLM

async def base_task(example) -> dict:
    """Judge one dataset example with the minimal baseline prompt.

    A classifier is built from the baseline prompt with the choices
    'Correct' (score 1) and 'Incorrect' (score 0), then run on ``example``.

    Args:
        example: The dataset example handed in by the experiment runner and
            passed unchanged to the classifier (presumably it supplies the
            ``input`` value used by the prompt's ``{input}`` placeholder).

    Returns:
        A dict with the 'label', 'score' and 'explanation' of the first
        score the classifier returns.
    """
    base_qa_prompt = """
        You are given a question and an answer.
        {input}
        Return only the label "Correct" or "Incorrect"
    """

    base_qa_eval = create_classifier(
        name="base_qa_eval",
        prompt_template=base_qa_prompt,
        llm=LLM(provider="openai", model="gpt-4"),
        choices={"Correct": 1, "Incorrect": 0},
    )

    eval_result = await base_qa_eval.async_evaluate(example)
    # Only the first returned score is used.
    score = eval_result[0]

    return {"label": score.label, "score": score.score, "explanation": score.explanation}

async def exact_match(example, output) -> float:
    """Score 1.0 when the predicted label equals the example's ground truth.

    Args:
        example: Dataset example whose ``output`` holds the 'Ground Truth' label.
        output: Task result dict containing the predicted 'label'.

    Returns:
        1.0 on an exact label match, otherwise 0.0.
    """
    # Compare with ==, not `in`: "Correct" is a substring of "Incorrect", so a
    # containment test would score a wrong "Correct" prediction as a match.
    return 1.0 if output["label"] == example.output["Ground Truth"] else 0.0

evaluators = [exact_match]

#### Run your Experiments

In [None]:
from phoenix.client.experiments import async_run_experiment
dev_base_experiment = await async_run_experiment(
    dataset=dev_dataset,
    task=base_task,
    evaluators=evaluators,
    experiment_name="base task",
    client=client,
    repetitions=1,
)

test_base_experiment = await async_run_experiment(
    dataset=test_dataset,
    task=base_task,
    evaluators=evaluators,
    experiment_name="new base task",
    client=client,
    repetitions=1,
)

## Step 2: Calculate Metrics

#### Compare your human and LLM judgements


In [None]:
def get_experiment_results(experiment, dataset):
    """Join each task run's judge output onto its dataset example.

    Args:
        experiment: Mapping with a 'task_runs' iterable; each run provides
            'dataset_example_id' and 'output'.
        dataset: Anything ``pd.DataFrame`` accepts that yields an 'id' column
            for the examples.

    Returns:
        A DataFrame with one row per dataset example (right join), holding the
        run's raw 'output_dict' plus its 'label', 'score' and 'explanation'.
        Those three fields are empty when a run's output is not a dict or
        when an example has no run.
    """
    result_fields = ['label', 'score', 'explanation']

    task_runs = (
        pd.DataFrame(list(experiment['task_runs']))[['dataset_example_id', 'output']]
        .rename(columns={'output': 'output_dict'})
    )

    # Look fields up by name so a failed run (non-dict output) cannot widen or
    # misalign the columns; mixing good and failed runs used to raise here.
    parsed_rows = [
        {field: out.get(field) for field in result_fields}
        if isinstance(out, dict)
        else dict.fromkeys(result_fields)
        for out in task_runs['output_dict']
    ]
    parsed = pd.DataFrame(parsed_rows, columns=result_fields, index=task_runs.index)
    task_runs = pd.concat([task_runs, parsed], axis=1)

    return task_runs.merge(pd.DataFrame(dataset), left_on='dataset_example_id', right_on='id', how='right')

#### Calculate classification metrics & Plot a confusion matrix


In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

base_dev_df = get_experiment_results(dev_base_experiment, dev_dataset)

# Examples without a prediction (e.g. a failed task run) have no score and
# scikit-learn rejects NaN, so exclude them explicitly and say how many.
base_scored_df = base_dev_df.dropna(subset=['score'])
base_unscored = len(base_dev_df) - len(base_scored_df)
if base_unscored:
    print(f"Excluded {base_unscored} example(s) without a prediction from the metrics")

# Binary ground truth: 1 when the label is 'Correct', 0 otherwise.
y_true = base_scored_df['output'].apply(lambda x: 1 if x['Ground Truth'] == 'Correct' else 0)
y_pred = base_scored_df['score'].astype(int)

# Pin the class order so the matrix is always 2x2 and the report names line up,
# even if one class never appears in this run.
class_ids = [0, 1]
class_names = ['Incorrect', 'Correct']

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, 
            yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

## Step 3: Inspect Results
##### See the examples where the eval did not match the ground truth. Looking at the explanations can help provide insight into changes to make to the evaluator prompt. You can do this either in code or in the Phoenix UI. 

![ui](https://storage.googleapis.com/arize-phoenix-assets/assets/images/phoenix-docs-images/experiment_explanatiosn.png)

In [None]:
# Expected label for every dev example, pulled out of the dataset's output dict.
ground_truth = base_dev_df['output'].apply(lambda expected: expected['Ground Truth'])

# Keep only the rows where the judge's label differs from the expected label.
label_disagrees = base_dev_df['label'] != ground_truth
mismatched_df = base_dev_df[label_disagrees]

# Columns worth reading when reviewing a disagreement.
review_columns = ['input', 'output', 'label', 'explanation']
info = mismatched_df[review_columns]
info.head()

## Step 4: Iterate and Improve
##### Time for Improvements - Tweak your prompt, model, or criteria based on the results

In [None]:
async def human_task(example) -> dict:
    """Judge one dataset example with the hand-refined prompt.

    Same flow as the baseline task, but the prompt spells out how to judge:
    assess the answer on its own content, accept any one of several valid
    answers, and take the question at face value.

    Args:
        example: The dataset example handed in by the experiment runner and
            passed unchanged to the classifier (presumably it supplies the
            ``input`` value used by the prompt's ``{input}`` placeholder).

    Returns:
        A dict with the 'label', 'score' and 'explanation' of the first
        score the classifier returns.
    """
    human_qa_prompt = """
        You are given a question and an answer. Determine whether the answer correctly and fully answers 
        the question based on the information contained in the answer itself. If the question can have multiple answers, 
        and the answer provided is one of them, classify it as correct. Take the question at face value, 
        and then look at the answer to determine if the answer is correct or incorrect.
        {input}
        Return only the label "Correct" or "Incorrect"
    """

    human_qa_eval = create_classifier(
        name="human_qa_eval",
        prompt_template=human_qa_prompt,
        llm=LLM(provider="openai", model="gpt-4"),
        choices={"Correct": 1, "Incorrect": 0},
    )

    eval_result = await human_qa_eval.async_evaluate(example)
    # Only the first returned score is used.
    score = eval_result[0]

    return {"label": score.label, "score": score.score, "explanation": score.explanation}

#### Run the improved prompt against the dev and test sets to compare with the baseline


In [None]:
dev_human_experiment = await async_run_experiment(
    dataset=dev_dataset,
    task=human_task,
    evaluators=evaluators,
    experiment_name="human task",
    client=client,
    repetitions=1,
)

test_human_experiment = await async_run_experiment(
    dataset=test_dataset,
    task=human_task,
    evaluators=evaluators,
    experiment_name="human task",
    client=client,
    repetitions=1,
)

#### Calculate classification metrics & Plot a confusion matrix

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

human_dev_df = get_experiment_results(dev_human_experiment, dev_dataset)

# Examples without a prediction (e.g. a failed task run) have no score and
# scikit-learn rejects NaN, so exclude them explicitly and say how many.
human_scored_df = human_dev_df.dropna(subset=['score'])
human_unscored = len(human_dev_df) - len(human_scored_df)
if human_unscored:
    print(f"Excluded {human_unscored} example(s) without a prediction from the metrics")

# Binary ground truth: 1 when the label is 'Correct', 0 otherwise.
y_true = human_scored_df['output'].apply(lambda x: 1 if x['Ground Truth'] == 'Correct' else 0)
y_pred = human_scored_df['score'].astype(int)

# Pin the class order so the matrix is always 2x2 and the report names line up,
# even if one class never appears in this run.
class_ids = [0, 1]
class_names = ['Incorrect', 'Correct']

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, 
            yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

### Bonus!

##### Let's test out a Meta Eval - pass your eval through an LLM and ask it to output an improved version. 
##### First, Define your prompt for this improvement

In [None]:
# Baseline prompt text shown to the optimizer model.
base_qa_prompt = """
    You are given a question and an answer.
    {input}
    Return only the label "Correct" or "Incorrect"
"""

# Serialize every mismatched example in full. Interpolating the DataFrame
# directly would insert its display repr, which truncates long cell text and
# can elide rows and columns, hiding most of each explanation from the model.
mismatch_records = info.to_json(orient='records', indent=2)

meta_eval_prompt = f"""
You are an expert in prompt optimization. You will be given the original baseline prompt and the following associated metadata 
(such as model inputs, outputs, evaluation labels and explanations). Your task is to generate a revised version of the original 
prompt that would likely improve results with respect to the evaluation labels.
Your goal is to align the prompt with the feedback and evaluation criteria. Look at every example in the data. 
You may want to evaluate the explanation for clarity, correctness, completeness, and alignment with the intent of the original 
prompt. Identify weaknesses such as ambiguity, missing steps, unnecessary text, or faulty reasoning. 
Produce a much more detailed & improved version of the baseline prompt. 

The baseline prompt is: {base_qa_prompt}

The data to take into account is: {mismatch_records}

Your Goal: Iterate on the original prompt (above) with a new, better prompt that will improve the results, based on the examples and feedback above.
Your Output: Return only the new prompt.
"""

In [None]:
resp = await openai_client.chat.completions.create(model="gpt-4o-mini", messages=[{"role": "user", "content": meta_eval_prompt}])
content = resp.choices[0].message.content.strip()
new_meta_prompt = content
new_meta_prompt

##### Copy your updated prompt into the function below as `meta_qa_prompt`

In [None]:
async def meta_task(example) -> dict:
    """Judge one dataset example with the LLM-generated (meta-eval) prompt.

    Same flow as the other task functions; only the prompt differs. The
    prompt was pasted from the optimizer model's response, and the stray
    trailing quote left over from that paste has been removed.

    Args:
        example: The dataset example handed in by the experiment runner and
            passed unchanged to the classifier (presumably it supplies the
            ``input`` value used by the prompt's ``{input}`` placeholder).

    Returns:
        A dict with the 'label', 'score' and 'explanation' of the first
        score the classifier returns.
    """
    meta_qa_prompt = """
        You are tasked with evaluating whether the provided answer accurately addresses the question posed. 
        For each input, which includes a 'Question' and an 'Answer', please consider the following guidelines 
        to determine if the answer is correct:

        1. Assess the question and the answer in relation to known facts or common understanding on the subject matter. 
        2. Identify if the answer explicitly addresses the question or misses critical aspects.
        3. Consider whether multiple interpretations of the question or answer exist and evaluate their correctness based on context.

        Your response should strictly consist of the label "Correct" if the answer is accurate and 
        properly addresses the question, or "Incorrect" if it is not.

        Please analyze the following input: \n{input}
    """

    meta_qa_eval = create_classifier(
        name="meta_qa_eval",
        prompt_template=meta_qa_prompt,
        llm=LLM(provider="openai", model="gpt-4"),
        choices={"Correct": 1, "Incorrect": 0},
    )

    eval_result = await meta_qa_eval.async_evaluate(example)
    # Only the first returned score is used.
    score = eval_result[0]

    return {"label": score.label, "score": score.score, "explanation": score.explanation}

##### Run your Experiment

In [None]:
dev_meta_experiment = await async_run_experiment(
    dataset=dev_dataset,
    task=meta_task,
    evaluators=evaluators,
    experiment_name="meta task",
    client=client,
    repetitions=1,
)

test_meta_experiment = await async_run_experiment(
    dataset=test_dataset,
    task=meta_task,
    evaluators=evaluators,
    experiment_name="meta task",
    client=client,
    repetitions=1,
)

#### Calculate classification metrics & Plot a confusion matrix

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

meta_dev_df = get_experiment_results(dev_meta_experiment, dev_dataset)

# Examples without a prediction (e.g. a failed task run) have no score and
# scikit-learn rejects NaN, so exclude them explicitly and say how many.
meta_scored_df = meta_dev_df.dropna(subset=['score'])
meta_unscored = len(meta_dev_df) - len(meta_scored_df)
if meta_unscored:
    print(f"Excluded {meta_unscored} example(s) without a prediction from the metrics")

# Binary ground truth: 1 when the label is 'Correct', 0 otherwise.
y_true = meta_scored_df['output'].apply(lambda x: 1 if x['Ground Truth'] == 'Correct' else 0)
y_pred = meta_scored_df['score'].astype(int)

# Pin the class order so the matrix is always 2x2 and the report names line up,
# even if one class never appears in this run.
class_ids = [0, 1]
class_names = ['Incorrect', 'Correct']

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, 
            yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

### Look at your Improvements
![](https://storage.googleapis.com/arize-phoenix-assets/assets/images/phoenix-docs-images/meta_eval_102.png)