In [None]:
from phoenix.client import Client
from phoenix.client.types.spans import SpanQuery
import pandas as pd
pd.set_option('display.max_columns', 500)


# Phoenix project whose traces are evaluated throughout this notebook.
project_identifier = "calculus-code-agent"

# Pull the project's spans into a DataFrame for a first look at the columns.
# No span filter is applied here; the next cells query specific span names.
spansdf = Client().spans.get_spans_dataframe(
    project_identifier=project_identifier,
)

spansdf.head()

In [None]:
# Assemble one evaluation row per `write_calculus_code` span: its question and
# generated code, plus the execution result and final answer from the same trace.

# Extract the question and generated code from the spans
query = SpanQuery().where("name == 'write_calculus_code'")
spansdf = Client().spans.get_spans_dataframe(query=query, project_identifier=project_identifier)

# drop any traces not from our dataset, then expose the span input/output under
# the column names the eval templates reference ({question}, {code}).
# Chaining (instead of inplace=True) yields a fresh frame on every run.
spansdf = spansdf.dropna(subset=["attributes.metadata"]).assign(
    question=lambda df: df["attributes.input.value"],
    code=lambda df: df["attributes.output.value"],
)

# Now get the code execution result spans
query = SpanQuery().where("name == 'execute_python_code'")
code_res_df = Client().spans.get_spans_dataframe(
    query=query, project_identifier=project_identifier
)
code_res_df["result"] = code_res_df["attributes.output.value"]

# Now get the answer generation spans
query = SpanQuery().where("name == 'generate_answer'")
gen_df = Client().spans.get_spans_dataframe(
    query=query, project_identifier=project_identifier
)
gen_df["answer"] = gen_df["attributes.output.value"]

# A column-on-column merge ignores the left frame's index and returns a fresh
# 0..n-1 RangeIndex. The evaluation frames produced from `spans_df` inherit its
# index and are later logged as span annotations, so each row must stay keyed
# by its span id. Carry the id through the merges as a column and restore it
# as the index afterwards.
# NOTE(review): assumes the span id is available either as a `context.span_id`
# column or as the index of the frame returned by get_spans_dataframe — confirm.
span_ids = (
    spansdf["context.span_id"] if "context.span_id" in spansdf.columns else spansdf.index
)

# Combine the spans together.
# NOTE(review): a trace with several execute/answer spans yields several rows
# for the same code span (left join is one-to-many) — confirm that is intended.
spans_df = (
    spansdf.assign(span_id=list(span_ids))  # list() avoids index alignment
    .merge(code_res_df[["context.trace_id", "result"]], on="context.trace_id", how="left")
    .merge(gen_df[["context.trace_id", "answer"]], on="context.trace_id", how="left")
    .set_index("span_id")
)

spans_df.head(2)

In [None]:
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAILS_MAP,
    GoogleGenAIModel,
    download_benchmark_dataset,
    llm_classify,
)
import nest_asyncio

# Patch asyncio before any evals run.
# NOTE(review): presumably required so llm_classify can drive its async calls
# inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

# Judge model shared by every evaluation below.
# API key will be read from environment
model = GoogleGenAIModel(
    model="gemini-2.0-flash-001",
    default_concurrency=1,
    initial_rate_limit=0.1,
)

In [None]:
# Judge prompt for the hallucination eval. The judge is shown the question, the
# generated code, the execution result and the final answer, and is asked to
# reply with exactly one word: "factual" or "hallucinated".
# The {question}, {code}, {result} and {answer} placeholders match column names
# of `spans_df` (presumably filled per row by llm_classify — confirm).
HALLUCINATION_PROMPT_TEMPLATE = """
In this task, you will be given a query, code generated to address the query, 
the execution result of that code, and an answer produced from them. 
Your job is to check if the answer is faithful to the provided code and result.

- "factual" means the answer accurately reflects the information in the code and result.  
- "hallucinated" means the answer introduces details not supported by the code and result, 
  or contradicts them.  

Output exactly one word: either "factual" or "hallucinated". Do not output anything else.

[BEGIN DATA]
************
[Question]: {question}
************
[Generated code]: {code}
************
[Generated result]: {result}
************
[Answer]: {answer}
************
[END DATA]
"""


In [None]:
# Labels the judge output is restricted to, taken from Phoenix's built-in map.
# NOTE(review): presumably the same "factual"/"hallucinated" pair the template
# asks for — confirm.
rails = [*HALLUCINATION_PROMPT_RAILS_MAP.values()]

# Run the hallucination judge over every row of `spans_df`.
# NOTE(review): the template demands a single-word reply while
# provide_explanation=True also requests an explanation — confirm labels parse.
hallucination_classifications = llm_classify(
    data=spans_df,
    model=model,
    template=HALLUCINATION_PROMPT_TEMPLATE,
    rails=rails,
    # optional to generate explanations for the value produced by the eval LLM
    provide_explanation=True,
)

In [None]:
# Spot-check the first few hallucination results before logging them.
hallucination_classifications.head(n=5)

## Checking for Code Readability

In [None]:
from phoenix.evals import CODE_READABILITY_PROMPT_RAILS_MAP, TOOL_CALLING_PROMPT_RAILS_MAP

In [None]:
# Judge prompt for the code-readability eval. The judge is shown the question
# and the generated code, and is asked to reply with exactly one word:
# "readable" or "unreadable".
# The {question} and {code} placeholders match column names of `spans_df`
# (presumably filled per row by llm_classify — confirm).
CODE_READABILITY_PROMPT_TEMPLATE = """
    You will be given a question and a piece of code. 
    Your task is to judge whether the code is easy to read and understand.

    - "readable" means the code is clear, structured, and understandable.  
    - "unreadable" means the code is messy, confusing, or difficult to follow.  

    Output exactly one word: either "readable" or "unreadable". Do not output anything else.

    [BEGIN DATA]
    ************
    [Question]: {question}
    ************
    [Code]: {code}
    ************
    [END DATA]
"""


In [None]:
# Labels the judge output is restricted to, taken from Phoenix's built-in map.
# NOTE(review): presumably the same "readable"/"unreadable" pair the template
# asks for — confirm.
code_rails = [*CODE_READABILITY_PROMPT_RAILS_MAP.values()]

# Run the readability judge over every row of `spans_df`.
# NOTE(review): the template demands a single-word reply while
# provide_explanation=True also requests an explanation — confirm labels parse.
readability_classifications = llm_classify(
    data=spans_df,
    model=model,
    template=CODE_READABILITY_PROMPT_TEMPLATE,
    rails=code_rails,
    # optional to generate explanations for the value produced by the eval LLM
    provide_explanation=True,
)

In [None]:
# Spot-check the first few readability results before logging them.
readability_classifications.head(n=5)

## Tool Calling

In [None]:
# Judge prompt for the tool-calling eval. The judge is shown the question, the
# code that was executed and its result, and is asked to reply with exactly one
# word: "correct" or "incorrect".
# The {question}, {code} and {result} placeholders match column names of
# `spans_df` (presumably filled per row by llm_classify — confirm).
TOOL_CALLING_PROMPT_TEMPLATE = """
You will be given a question, code executed by the tool, and the result of that execution. 
Your task is to decide whether the tool call (code + result) correctly answers the question.

- "correct" means the code was appropriate for the question, the execution result is consistent with it, 
  and no extra information outside the question was introduced.  
- "incorrect" means the code does not match the question, produces a result unrelated to the question, 
  or introduces information not present in the question.  

Output exactly one word: either "correct" or "incorrect". Do not output anything else.

[BEGIN DATA]
************
[Question]: {question}
************
[Code Executed]: {code}
************
[Result]: {result}
************
[END DATA]
"""


In [None]:
# Labels the judge output is restricted to, taken from Phoenix's built-in map.
# NOTE(review): presumably the same "correct"/"incorrect" pair the template
# asks for — confirm.
tool_rails = [*TOOL_CALLING_PROMPT_RAILS_MAP.values()]

# Run the tool-call judge over every row of `spans_df`.
# NOTE(review): the template demands a single-word reply while
# provide_explanation=True also requests an explanation — confirm labels parse.
tool_call_evaluations = llm_classify(
    data=spans_df,
    model=model,
    template=TOOL_CALLING_PROMPT_TEMPLATE,
    rails=tool_rails,
    # optional to generate explanations for the value produced by the eval LLM
    provide_explanation=True,
)

In [None]:
# Spot-check the first few tool-call results before logging them.
tool_call_evaluations.head(n=5)


## Logging the Evaluations

In [None]:
# Send each evaluation frame to Phoenix as LLM-produced annotations.
# All three evaluations were computed with one row per span from `spans_df`,
# so all three are logged through the span-annotation endpoint (Code
# Readability is not a per-document evaluation).
# NOTE(review): the frames must identify the span for each row (span-id index
# or `span_id` column) — confirm before running.
evaluations_by_annotation_name = {
    "Hallucination": hallucination_classifications,
    "Code Readability": readability_classifications,
    "Tool Call Correctness": tool_call_evaluations,
}

for annotation_name, evaluations in evaluations_by_annotation_name.items():
    Client().spans.log_span_annotations_dataframe(
        dataframe=evaluations,
        annotation_name=annotation_name,
        annotator_kind="LLM",
    )