In [None]:
from pathlib import Path
from secrets import token_hex

import groq
import nest_asyncio
import openai
import pandas as pd
from dotenv import load_dotenv
from openinference.instrumentation.groq import GroqInstrumentor
from openinference.instrumentation.openai import OpenAIInstrumentor
from sklearn.metrics import accuracy_score

import phoenix as px
from phoenix.client.utils import to_chat_messages_and_kwargs
from phoenix.experiments import run_experiment
from phoenix.otel import register

# Patch asyncio so an event loop can be re-entered from inside the notebook's
# already-running loop.
nest_asyncio.apply()

# Load variables from ~/.env when that file exists; otherwise do nothing.
env_file = Path.home() / ".env"
if env_file.exists():
    load_dotenv(env_file)

In [None]:
# Register a tracer provider, then attach both SDK instrumentors to it
# (OpenAI first, then Groq).
tracer_provider = register()
for instrumentor_cls in (OpenAIInstrumentor, GroqInstrumentor):
    instrumentor_cls().instrument(tracer_provider=tracer_provider)

In [None]:
# HaluEval QA data, read as JSON Lines (one record per line).
url = "https://raw.githubusercontent.com/RUCAIBox/HaluEval/refs/heads/main/data/qa_data.json"
qa = pd.read_json(url, lines=True)

# Preview five randomly chosen rows with the column order reversed.
qa_preview = qa.sample(5)
qa_preview.iloc[:, ::-1]

In [None]:
# The first two columns are shared by both variants of every example.
k = qa.iloc[:, :2]

# Column 2 supplies the answers labelled "factual", column 3 the answers
# labelled "hallucinated"; each is renamed to "answer" and tagged.
labeled_frames = [
    pd.concat([k, qa.iloc[:, position].rename("answer")], axis=1).assign(true_label=label)
    for position, label in ((2, "factual"), (3, "hallucinated"))
]

# Stack both variants, draw a fixed 10-row sample, renumber the rows and
# reverse the column order.
df = (
    pd.concat(labeled_frames)
    .sample(10, random_state=42)
    .reset_index(drop=True)
    .iloc[:, ::-1]
)
df

# Get Prompt

Reference template in the Phoenix evals package: https://github.com/Arize-ai/phoenix/blob/390cfaa42c5b2c28d3f9f83fbf7c694b8c2beeff/packages/phoenix-evals/src/phoenix/evals/default_templates.py#L56

In [None]:
# Fetch the prompt stored under identifier "1".
# NOTE(review): assumes a prompt with this identifier already exists on the
# Phoenix server this client points at — confirm before running.
phoenix_client = px.Client()
prompt = phoenix_client.prompts.get(prompt_identifier="1")

# GPT 4o Mini

In [None]:
def openai_eval(input):
    """Label one example by sending the formatted ``prompt`` to OpenAI.

    The chat messages and the remaining request keyword arguments are whatever
    ``to_chat_messages_and_kwargs`` derives from the notebook-level ``prompt``.

    Args:
        input: Mapping-like record (a DataFrame row or a dataset example's
            input). Its items are passed as the prompt's template variables.

    Returns:
        dict: ``{"label": text}`` where ``text`` is the first choice's message
        content with surrounding whitespace removed, or ``""`` when the
        message carries no text content.
    """
    messages, kwargs = to_chat_messages_and_kwargs(prompt, variables=dict(input))
    response = openai.OpenAI().chat.completions.create(messages=messages, **kwargs)
    content = response.choices[0].message.content
    # The label is compared to `true_label` by exact string match, so normalise
    # it: `content` may be None, and stray whitespace or a trailing newline
    # would otherwise count a correct verdict as wrong.
    return {"label": (content or "").strip()}

In [None]:
# Build the request from `prompt` without supplying template variables and send
# it once. This makes a live OpenAI call; `kwargs` is displayed in a later
# cell, while `response` is not referenced again in the cells shown here.
messages, kwargs = to_chat_messages_and_kwargs(prompt)
response = openai.OpenAI().chat.completions.create(messages=messages, **kwargs)

In [None]:
# Display the fetched prompt object.
prompt

In [None]:
# Display the request keyword arguments derived from the prompt.
kwargs

### DataFrame Apply

In [None]:
# Evaluate every row with `openai_eval`, then expand the returned dicts into
# columns.
gpt_outputs = pd.json_normalize(df.apply(openai_eval, axis=1))

# Put each predicted label next to its ground-truth label.
gpt_result = pd.concat([gpt_outputs, df.true_label], axis=1)

gpt_accuracy = accuracy_score(gpt_result.true_label, gpt_result.label)
print(f"Accuracy: {gpt_accuracy * 100:.0f}%")
gpt_result

# Upload Dataset

In [None]:
# A random hex suffix gives each upload its own dataset name.
dataset_name = f"hallu-eval-{token_hex()}"

# Upload the sampled frame: question/knowledge/answer are the example inputs,
# true_label is the expected output.
ds = px.Client().upload_dataset(
    dataset_name=dataset_name,
    dataframe=df,
    output_keys=["true_label"],
    input_keys=["question", "knowledge", "answer"],
)

# Run Experiment

In [None]:
# Run `openai_eval` as the task over the uploaded dataset `ds`; the returned
# object is shown as the cell output and not kept in a variable.
run_experiment(ds, openai_eval)

# DeepSeek via Groq

In [None]:
async def groq_eval(input, model="deepseek-r1-distill-llama-70b"):
    """Label one example by sending the formatted ``prompt`` to Groq.

    Only the chat messages derived from ``prompt`` are used; the keyword
    arguments derived alongside them are discarded and ``model`` is passed
    explicitly instead.

    Args:
        input: Mapping-like record whose items are passed as the prompt's
            template variables.
        model: Name of the Groq-hosted model to query.

    Returns:
        dict: ``{"label": text}`` where ``text`` is the unmodified message
        content of the first choice.
    """
    template_variables = dict(input)
    messages, _unused_kwargs = to_chat_messages_and_kwargs(prompt, variables=template_variables)
    client = groq.AsyncGroq()
    completion = await client.chat.completions.create(messages=messages, model=model)
    first_choice = completion.choices[0]
    return {"label": first_choice.message.content}

### Run Experiment

In [None]:
# Run `groq_eval` as the task over the same dataset and keep the result so its
# outputs can be post-processed in the following cells.
exp = run_experiment(ds, groq_eval)

### Extract the Last Line to Calculate Accuracy

In [None]:
# Expand the experiment's output dicts and take the raw "label" text of each run.
raw_labels = pd.json_normalize(exp.as_dataframe().output).label

# Keep only the final line of each reply (presumably the verdict, after any
# preceding text the model emitted). Strip first so a reply that ends with a
# newline does not produce an empty last line, and strip again so padding
# around the verdict cannot break the exact-match comparison below.
labels = raw_labels.str.strip().str.split("\n").str[-1].str.strip()

# NOTE(review): predictions and ground truth are paired by row position, which
# assumes the experiment runs come back in the same order as `df` — confirm.
result = pd.concat([labels, df.true_label], axis=1)
print(f"Accuracy: {accuracy_score(result.true_label, result.label) * 100:.0f}%")
result

In [None]:
# Side-by-side view: GPT label, DeepSeek label, and the ground truth.
gpt_column = gpt_result.label.rename("gpt")
deepseek_frame = result.rename(columns={"label": "deepseek"})
pd.concat([gpt_column, deepseek_frame], axis=1)