In [None]:
%pip install -Uqqq arize-phoenix-client arize-phoenix-otel arize-phoenix anthropic openai google-generativeai groq openinference-instrumentation-anthropic openinference-instrumentation-openai openinference-instrumentation-groq openinference-instrumentation-vertexai

In [None]:
import os
from getpass import getpass
from secrets import token_hex

import anthropic
import google.generativeai as genai
import groq
import nest_asyncio
import openai
import pandas as pd
from google.generativeai.generative_models import GenerativeModel
from openinference.instrumentation.anthropic import AnthropicInstrumentor
from openinference.instrumentation.groq import GroqInstrumentor
from openinference.instrumentation.openai import OpenAIInstrumentor
from openinference.instrumentation.vertexai import VertexAIInstrumentor
from sklearn.metrics import accuracy_score

import phoenix as px
from phoenix.client import Client
from phoenix.client.types import PromptVersion
from phoenix.experiments import run_experiment
from phoenix.otel import register

# Patch asyncio via nest_asyncio; presumably needed so async tasks can run
# inside the notebook's already-running event loop — TODO confirm.
nest_asyncio.apply()

# Launch Phoenix

In [None]:
# Launch the Phoenix app from this notebook session.
# NOTE(review): presumably this is the server that receives the traces and
# datasets created below — confirm.
px.launch_app()

# LLM Vendor API Keys

Enter a blank value if not available.

In [None]:
# Ask only for the vendor keys that are missing (or empty) in the environment.
# Each entry maps the environment variable to the prompt shown to the user.
_key_prompts = {
    "OPENAI_API_KEY": "OpenAI API key: ",
    "ANTHROPIC_API_KEY": "Anthropic API key: ",
    "GOOGLE_API_KEY": "Google API key: ",
    "GROQ_API_KEY": "Groq API key: ",
}
for _env_var, _prompt_text in _key_prompts.items():
    if os.getenv(_env_var):
        continue
    os.environ[_env_var] = getpass(_prompt_text)

# Instrumentation

In [None]:
# Register one tracer provider and attach every vendor instrumentor to it.
tracer_provider = register()
for _instrumentor_cls in (
    OpenAIInstrumentor,
    AnthropicInstrumentor,
    GroqInstrumentor,
    VertexAIInstrumentor,
):
    _instrumentor_cls().instrument(tracer_provider=tracer_provider)

In [None]:
# HaluEval QA data, read as line-delimited JSON straight from GitHub.
url = "https://raw.githubusercontent.com/RUCAIBox/HaluEval/refs/heads/main/data/qa_data.json"
qa = pd.read_json(url, lines=True)

# Preview five random rows with the column order reversed.
qa_preview = qa.sample(5)
qa_preview.iloc[:, ::-1]

In [None]:
SAMPLE_SIZE = 10

# The first two columns of `qa` are shared by both halves of the dataset.
k = qa.iloc[:, :2]

# Column 2 supplies the answers labelled "factual" and column 3 the answers
# labelled "hallucinated"; each becomes an "answer" column next to `k`.
labeled_frames = [
    pd.concat([k, qa.iloc[:, answer_col].rename("answer")], axis=1).assign(true_label=label)
    for answer_col, label in ((2, "factual"), (3, "hallucinated"))
]
df = pd.concat(labeled_frames)

# Fixed-seed sample, fresh 0..n-1 index, and reversed column order.
df = df.sample(SAMPLE_SIZE, random_state=42).reset_index(drop=True).iloc[:, ::-1]
df

# Upload Dataset

In [None]:
# A random suffix is appended to the dataset name for demo purposes.
dataset_name = f"hallu-eval-{token_hex(4)}"

# Upload the sampled frame: the three text columns are example inputs and
# the ground-truth label is the expected output.
phoenix_client = px.Client()
ds = phoenix_client.upload_dataset(
    dataset_name=dataset_name,
    dataframe=df,
    input_keys=["question", "knowledge", "answer"],
    output_keys=["true_label"],
)

# Create Prompt


In [None]:
# Name under which the prompt is stored in Phoenix.
prompt_name = f"hallu-eval-{token_hex(4)}"  # adding a random suffix for demo purposes

Send this [prompt](https://github.com/Arize-ai/phoenix/blob/390cfaa42c5b2c28d3f9f83fbf7c694b8c2beeff/packages/phoenix-evals/src/phoenix/evals/default_templates.py#L56) to Phoenix.

In [None]:
# Hallucination-classification template. The {{ ... }} placeholders use the
# same names as the dataset input keys (question, knowledge, answer).
content = """\
In this task, you will be presented with a query, a reference text and an answer. The answer is
generated to the question based on the reference text. The answer may contain false information. You
must use the reference text to determine if the answer to the question contains false information,
if the answer is a hallucination of facts. Your objective is to determine whether the answer text
contains factual information and is not a hallucination. A 'hallucination' refers to
an answer that is not based on the reference text or assumes information that is not available in
the reference text. Your response should be a single word: either "factual" or "hallucinated", and
it should not include any other text or characters. "hallucinated" indicates that the answer
provides factually inaccurate information to the query based on the reference text. "factual"
indicates that the answer to the question is correct relative to the reference text, and does not
contain made up information. Please read the query and reference text carefully before determining
your response.

[BEGIN DATA]
************
[Query]: {{ question }}
************
[Reference text]: {{ knowledge }}
************
[Answer]: {{ answer }}
************
[END DATA]

Is the answer above factual or hallucinated based on the query and reference text?
"""

# The prompt version is a single user message carrying the template, with
# gpt-4o-mini recorded as its model name.
prompt_version = PromptVersion(
    [{"role": "user", "content": content}],
    model_name="gpt-4o-mini",
)

# Store the prompt in Phoenix under `prompt_name`; the return value is unused.
_ = Client().prompts.create(
    name=prompt_name,
    prompt_description="Determining if an answer is factual or hallucinated based on a query and reference text",
    version=prompt_version,
)

# Get Prompt

In [None]:
# Fetch the stored prompt back from Phoenix by name; `prompt` is what every
# vendor section below formats and sends.
prompt = Client().prompts.get(prompt_identifier=prompt_name)

# OpenAI

In [None]:
# Single trial call: format the prompt with the first dataset example's
# inputs and send it to OpenAI once.
first_example = ds.as_dataframe().input.iloc[0]
variables = dict(first_example)
formatted_prompt = prompt.format(variables=variables)
openai_client = openai.OpenAI()
response = openai_client.chat.completions.create(**formatted_prompt)
print(response.choices[0].message.content)

### Run Experiment

In [None]:
def openai_eval(input):
    """Classify one example with OpenAI using the stored Phoenix prompt.

    Args:
        input: Template variables for one example; converted with
            ``dict(input)`` before formatting. Presumably supplied by
            ``run_experiment`` from the dataset example — confirm.

    Returns:
        dict: ``{"label": <content of the first completion choice>}``.
    """
    request_kwargs = prompt.format(variables=dict(input))
    client = openai.OpenAI()
    completion = client.chat.completions.create(**request_kwargs)
    return {"label": completion.choices[0].message.content}

In [None]:
# Run the experiment with the uploaded dataset and `openai_eval` as the task;
# the result is read back via `exp_openai.as_dataframe()` below.
exp_openai = run_experiment(ds, openai_eval)

### Calculate Accuracy

In [None]:
# Flatten the experiment outputs ({"label": ...}) into columns and put them
# next to the ground-truth labels.
# NOTE(review): the concat aligns on the positional index, so this assumes
# the experiment outputs come back in the same order as the rows of `df` — confirm.
openai_outputs = pd.json_normalize(exp_openai.as_dataframe().output)
result_openai = pd.concat([df.true_label, openai_outputs], axis=1)
openai_accuracy = accuracy_score(result_openai.true_label, result_openai.label)
print(f"Accuracy: {openai_accuracy * 100:.0f}%")

# Anthropic

In [None]:
# Model name used for Anthropic calls; it replaces the "model" entry of the
# formatted prompt when the request is built.
anthropic_model = "claude-3-5-haiku-latest"  # @param {type: "string"}

In [None]:
# Single trial call: format the prompt for the Anthropic SDK with the first
# dataset example, override the model, and send it once.
first_example = ds.as_dataframe().input.iloc[0]
variables = dict(first_example)
formatted_prompt = prompt.format(variables=variables, sdk="anthropic")
anthropic_kwargs = {**formatted_prompt, "model": anthropic_model}
response = anthropic.Anthropic().messages.create(**anthropic_kwargs)
print(response.content[0].text)

### Run Experiment

In [None]:
def anthropic_eval(input):
    """Classify one example with Anthropic using the stored Phoenix prompt.

    Args:
        input: Template variables for one example; converted with
            ``dict(input)`` before formatting. Presumably supplied by
            ``run_experiment`` from the dataset example — confirm.

    Returns:
        dict: ``{"label": <text of the first content block of the reply>}``.
    """
    request_kwargs = prompt.format(variables=dict(input), sdk="anthropic")
    # The notebook-level model choice takes precedence over the prompt's own.
    request_kwargs = {**request_kwargs, "model": anthropic_model}
    message = anthropic.Anthropic().messages.create(**request_kwargs)
    return {"label": message.content[0].text}

In [None]:
# Run the experiment with the uploaded dataset and `anthropic_eval` as the task.
exp_anthropic = run_experiment(ds, anthropic_eval)

### Calculate Accuracy

In [None]:
# Flatten the experiment outputs ({"label": ...}) into columns and put them
# next to the ground-truth labels.
# NOTE(review): the concat aligns on the positional index, so this assumes
# the experiment outputs come back in the same order as the rows of `df` — confirm.
anthropic_outputs = pd.json_normalize(exp_anthropic.as_dataframe().output)
result_anthropic = pd.concat([df.true_label, anthropic_outputs], axis=1)
anthropic_accuracy = accuracy_score(result_anthropic.true_label, result_anthropic.label)
print(f"Accuracy: {anthropic_accuracy * 100:.0f}%")

# Gemini

In [None]:
# Configure the google.generativeai SDK with the key read from GOOGLE_API_KEY.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

In [None]:
# Model name passed to GenerativeModel as `model_name` for the Gemini calls.
gemini_model = "gemini-1.5-flash"

In [None]:
# Single trial call: format the prompt for the google_generativeai SDK with
# the first dataset example and send it once.
first_example = ds.as_dataframe().input.iloc[0]
variables = dict(first_example)
formatted_prompt = prompt.format(variables=variables, sdk="google_generativeai")

# Build the model from the prompt's kwargs, overriding the model name.
model_kwargs = {**formatted_prompt.kwargs, "model_name": gemini_model}
gemini_client = GenerativeModel(**model_kwargs)

# All messages but the last seed the chat history; the last one is sent.
chat = gemini_client.start_chat(history=formatted_prompt.messages[:-1])
response = chat.send_message(formatted_prompt.messages[-1])
print(response.candidates[0].content.parts[0].text)

### Run Experiment

In [None]:
def gemini_eval(input):
    """Classify one example with Gemini using the stored Phoenix prompt.

    Args:
        input: Template variables for the example being evaluated; converted
            with ``dict(input)`` before formatting the prompt.

    Returns:
        dict: ``{"label": <text of the first part of the first candidate>}``.
    """
    # Format with this example's own input. The notebook-level `variables`
    # only holds the first dataset example, so using it here would score the
    # same example for every row.
    formatted_prompt = prompt.format(variables=dict(input), sdk="google_generativeai")
    response = (
        # The notebook-level model name overrides the prompt's own kwargs.
        GenerativeModel(**{**formatted_prompt.kwargs, "model_name": gemini_model})
        # All messages but the last seed the chat history; the last is sent.
        .start_chat(history=formatted_prompt.messages[:-1])
        .send_message(formatted_prompt.messages[-1])
    )
    return {"label": response.candidates[0].content.parts[0].text}

In [None]:
# Run the experiment with the uploaded dataset and `gemini_eval` as the task.
exp_gemini = run_experiment(ds, gemini_eval)

### Calculate Accuracy

In [None]:
# Score the Gemini experiment. This reads `exp_gemini` and stores the frame
# as `result_gemini`, so the Anthropic results are neither re-reported nor
# overwritten.
# NOTE(review): the concat aligns on the positional index, so this assumes
# the experiment outputs come back in the same order as the rows of `df` — confirm.
gemini_outputs = pd.json_normalize(exp_gemini.as_dataframe().output)
result_gemini = pd.concat([df.true_label, gemini_outputs], axis=1)
print(f"Accuracy: {accuracy_score(result_gemini.true_label, result_gemini.label) * 100:.0f}%")

# Groq

In [None]:
# Model name used for Groq calls; it replaces the "model" entry of the
# formatted prompt when the request is built.
groq_model = "deepseek-r1-distill-llama-70b"

In [None]:
variables = dict(ds.as_dataframe().input.iloc[0])
formatted_prompt = prompt.format(variables=variables)
response = await groq.AsyncGroq().chat.completions.create(
    **{**formatted_prompt, "model": groq_model}
)
print(response.choices[0].message.content)

### Run Experiment

In [None]:
async def groq_eval(input):
    """Classify one example with Groq using the stored Phoenix prompt.

    Args:
        input: Template variables for one example; converted with
            ``dict(input)`` before formatting. Presumably supplied by
            ``run_experiment`` from the dataset example — confirm.

    Returns:
        dict: ``{"label": <content of the first completion choice>}``.
    """
    request_kwargs = prompt.format(variables=dict(input))
    # The notebook-level model choice takes precedence over the prompt's own.
    request_kwargs = {**request_kwargs, "model": groq_model}
    completion = await groq.AsyncGroq().chat.completions.create(**request_kwargs)
    return {"label": completion.choices[0].message.content}

In [None]:
# Run the experiment with the uploaded dataset and the async `groq_eval` task.
exp_groq = run_experiment(ds, groq_eval)

### Extract the Last Line to Calculate Accuracy

In [None]:
# Keep only the final line of each response as the label (presumably the
# model writes reasoning text before its verdict — confirm).
# Surrounding whitespace is stripped first so that a trailing newline does
# not leave an empty last line, and again afterwards so a padded verdict
# such as "factual " still matches the ground-truth label.
groq_outputs = pd.json_normalize(exp_groq.as_dataframe().output)
labels = groq_outputs.label.str.strip().str.split("\n").str[-1].str.strip()

# NOTE(review): the concat aligns on the positional index, so this assumes
# the experiment outputs come back in the same order as the rows of `df` — confirm.
result = pd.concat([labels, df.true_label], axis=1)
print(f"Accuracy: {accuracy_score(result.true_label, result.label) * 100:.0f}%")
result

Compare answers between GPT and DeepSeek

In [None]:
# Side-by-side view: GPT label, DeepSeek label, and the ground truth.
gpt_labels = result_openai.label.rename("gpt")
deepseek_results = result.rename(columns={"label": "deepseek"})
pd.concat([gpt_labels, deepseek_results], axis=1)