<center>
    <p style="text-align:center">
        <img alt="phoenix logo" src="https://storage.googleapis.com/arize-assets/phoenix/assets/phoenix-logo-light.svg" width="200"/>
        <br>
        <a href="https://docs.arize.com/phoenix/">Docs</a>
        |
        <a href="https://github.com/Arize-ai/phoenix">GitHub</a>
        |
        <a href="https://join.slack.com/t/arize-ai/shared_invite/zt-1px8dcmlf-fmThhDFD_V_48oU7ALan4Q">Community</a>
    </p>
</center>
<h1 align="center">Evaluating an Agent</h1>



## Install Dependencies, Import Libraries, Set API Keys

In [1]:
!pip install -q openai arize-phoenix openinference-instrumentation-openai python-dotenv duckdb "openinference-instrumentation>=0.1.21"

In [1]:
import dotenv

dotenv.load_dotenv()

from openai import OpenAI
from phoenix.otel import register
from opentelemetry.trace import StatusCode
from openinference.instrumentation.openai import OpenAIInstrumentor
import duckdb
import json
import os
from getpass import getpass
import pandas as pd
from pydantic import BaseModel, Field
from IPython.display import Markdown
from tqdm import tqdm

import json
from typing import Any, Dict

from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.trace import Status, StatusCode, set_tracer_provider

from openinference.instrumentation import (
    TracerProvider,
    suppress_tracing,
)
from openinference.semconv.resource import ResourceAttributes
import phoenix as px


# Enable Phoenix Tracing

In [2]:
if os.getenv("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

client = OpenAI()
model = "gpt-4o-mini"
project_name = "github-1-15-2025-live"

In [3]:

# This will be replaced with the Phoenix register() call
endpoint = "http://127.0.0.1:6006/v1/traces"
resource = Resource(attributes={ResourceAttributes.PROJECT_NAME: project_name})
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint)))
tracer = tracer_provider.get_tracer(__name__)

session = px.launch_app()

# Initialize OpenAI instrumentation
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


## Prepare dataset

In [4]:
store_sales_df = pd.read_parquet("https://storage.googleapis.com/arize-phoenix-assets/datasets/unstructured/llm/llama-index/Store_Sales_Price_Elasticity_Promotions_Data.parquet")
store_sales_df.head()

Unnamed: 0,Store_Number,SKU_Coded,Product_Class_Code,Sold_Date,Qty_Sold,Total_Sale_Value,On_Promo
0,1320,6172800,22875,2021-11-02,3,56.849998,0
1,2310,6172800,22875,2021-11-03,1,18.950001,0
2,3080,6172800,22875,2021-11-03,1,18.950001,0
3,2310,6172800,22875,2021-11-06,1,18.950001,0
4,4840,6172800,22875,2021-11-07,1,18.950001,0


## Define the tools

### Tool 1: Database Lookup

In [8]:
SQL_GENERATION_PROMPT = """
Generate an SQL query based on a prompt. Do not reply with anything besides the SQL query.
The prompt is: {prompt}

The available columns are: {columns}
The table name is: {table_name}
"""

def generate_sql_query(prompt: str, columns: list, table_name: str) -> str:
    """Generate an SQL query based on a prompt"""
    formatted_prompt = SQL_GENERATION_PROMPT.format(prompt=prompt, columns=columns, table_name=table_name)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": formatted_prompt}],
    )
    
    return response.choices[0].message.content

@tracer.tool()
def lookup_sales_data(prompt: str) -> str:
    """Implementation of sales data lookup from parquet file using SQL"""
    try:
        table_name = "sales"
        # Read the parquet file into a DuckDB table
        duckdb.sql(f"CREATE TABLE IF NOT EXISTS {table_name} AS SELECT * FROM store_sales_df")
        
        print(store_sales_df.columns)
        print(table_name)
        sql_query = generate_sql_query(prompt, store_sales_df.columns, table_name)
        sql_query = sql_query.strip()
        sql_query = sql_query.replace("```sql", "").replace("```", "")
        
        with tracer.start_as_current_span("execute_sql_query", openinference_span_kind="chain") as span:
            span.set_input(value=sql_query)
            
            # Execute the SQL query
            result = duckdb.sql(sql_query).df()
            span.set_output(value=str(result))
            span.set_status(StatusCode.OK)
        return result.to_string()
    except Exception as e:
        return f"Error accessing data: {str(e)}"

In [9]:
example_data = lookup_sales_data("Show me all the sales for store 1320 on November 1st, 2021")
example_data

Index(['Store_Number', 'SKU_Coded', 'Product_Class_Code', 'Sold_Date',
       'Qty_Sold', 'Total_Sale_Value', 'On_Promo'],
      dtype='object')
sales


'    Store_Number  SKU_Coded  Product_Class_Code  Sold_Date  Qty_Sold  Total_Sale_Value  On_Promo\n0           1320    6173050               22875 2021-11-01         1          4.990000         0\n1           1320    6174250               22875 2021-11-01         1          0.890000         0\n2           1320    6176200               22975 2021-11-01         2         99.980003         0\n3           1320    6176800               22800 2021-11-01         1         14.970000         0\n4           1320    6177250               22975 2021-11-01         1          6.890000         0\n5           1320    6177300               22800 2021-11-01         1          9.990000         0\n6           1320    6177350               22800 2021-11-01         2         16.980000         0\n7           1320    6177700               22875 2021-11-01         1          3.190000         0\n8           1320    6178000               22875 2021-11-01         2          6.380000         0\n9           1320   

### Tool 2: Data Visualization

In [8]:
class VisualizationConfig(BaseModel):
    chart_type: str = Field(..., description="Type of chart to generate")
    x_axis: str = Field(..., description="Name of the x-axis column")
    y_axis: str = Field(..., description="Name of the y-axis column")
    title: str = Field(..., description="Title of the chart")

@tracer.chain()
def extract_chart_config(data: str, visualization_goal: str) -> dict:
    """Generate chart visualization configuration
    
    Args:
        data: String containing the data to visualize
        visualization_goal: Description of what the visualization should show
        
    Returns:
        Dictionary containing line chart configuration
    """
    prompt = f"""Generate a chart configuration based on this data: {data}
    The goal is to show: {visualization_goal}"""
    
    response = client.beta.chat.completions.parse(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format=VisualizationConfig,
    )
    
    try:
        # Extract axis and title info from response
        content = response.choices[0].message.content
        
        # Return structured chart config
        return {
            "chart_type": content.chart_type,
            "x_axis": content.x_axis,
            "y_axis": content.y_axis,
            "title": content.title,
            "data": data
        }
    except Exception:
        return {
            "chart_type": "line", 
            "x_axis": "date",
            "y_axis": "value",
            "title": visualization_goal,
            "data": data
        }
        
@tracer.chain()
def create_chart(config: VisualizationConfig) -> str:
    """Create a chart based on the configuration"""
    prompt = f"""Write python code to create a chart based on the following configuration.
    Only return the code, no other text.
    config: {config}"""
    
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    
    code = response.choices[0].message.content
    code = code.replace("```python", "").replace("```", "")
    code = code.strip()
    
    return code

@tracer.tool()
def generate_visualization(data: str, visualization_goal: str) -> str:
    """Generate a visualization based on the data and goal"""
    config = extract_chart_config(data, visualization_goal)
    code = create_chart(config)
    return code

In [9]:
# code = generate_visualization(example_data, "A line chart of sales over each day in november.")

In [10]:
@tracer.tool()
def run_python_code(code: str) -> str:
    """Execute Python code in a restricted environment"""
    # Create restricted globals/locals dictionaries with plotting libraries
    restricted_globals = {
        '__builtins__': {
            'print': print,
            'len': len,
            'range': range,
            'sum': sum,
            'min': min,
            'max': max,
            'int': int, 
            'float': float,
            'str': str,
            'list': list,
            'dict': dict,
            'tuple': tuple,
            'set': set,
            'round': round,
            '__import__': __import__,
            'json': __import__('json')
        },
        'plt': __import__('matplotlib.pyplot'),
        'pd': __import__('pandas'),
        'np': __import__('numpy'),
        'sns': __import__('seaborn')
    }
    
    try:
        # Execute code in restricted environment
        exec_locals = {}
        exec(code, restricted_globals, exec_locals)
        
        # Capture any printed output or return the plot
        output = exec_locals.get('__builtins__', {}).get('_', '')
        if 'plt' in exec_locals:
            return exec_locals['plt']
        
        # Try to parse output as JSON before returning
        return "Code executed successfully"
        
    except Exception as e:
        return f"Error executing code: {str(e)}"

### Tool 3: Data Analysis

In [11]:
@tracer.tool()
def analyze_sales_data(prompt: str, data: str) -> str:
    """Implementation of AI-powered sales data analysis"""
    # Construct prompt based on analysis type and data subset
    prompt = f"""Analyze the following data: {data}
    Your job is to answer the following question: {prompt}"""
    
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    
    analysis = response.choices[0].message.content
    return analysis if analysis else "No analysis could be generated"

In [12]:
# analysis = analyze_sales_data("What is the most popular product SKU?", example_data)
# analysis


### Tool Schema:

In [71]:
# Define tools/functions that can be called by the model
tools = [
    {
        "type": "function",
        "function": {
            "name": "lookup_sales_data",
            "description": "Look up data from Store Sales Price Elasticity Promotions dataset",
            "parameters": {
                "type": "object",
                "properties": {
                    "prompt": {"type": "string", "description": "The unchanged prompt that the user provided."}
                },
                "required": ["prompt"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "analyze_sales_data", 
            "description": "Analyze sales data to extract insights",
            "parameters": {
                "type": "object",
                "properties": {
                    "data": {"type": "string", "description": "The lookup_sales_data tool's output."},
                    "prompt": {"type": "string", "description": "The unchanged prompt that the user provided."}
                },
                "required": ["data", "prompt"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_visualization",
            "description": "Generate Python code to create data visualizations",
            "parameters": {
                "type": "object", 
                "properties": {
                    "data": {"type": "string", "description": "The lookup_sales_data tool's output."},
                    "visualization_goal": {"type": "string", "description": "The goal of the visualization."}
                },
                "required": ["data", "visualization_goal"]
            }
        }
    },
    # {
    #     "type": "function",
    #     "function": {
    #         "name": "run_python_code",
    #         "description": "Run Python code in a restricted environment",
    #         "parameters": {
    #             "type": "object",
    #             "properties": {
    #                 "code": {"type": "string", "description": "The Python code to run."}
    #             },
    #             "required": ["code"]
    #         }
    #     }
    # }
]

# Dictionary mapping function names to their implementations
tool_implementations = {
    "lookup_sales_data": lookup_sales_data,
    "analyze_sales_data": analyze_sales_data, 
    "generate_visualization": generate_visualization,
    # "run_python_code": run_python_code
}


## Agent logic

In [14]:
@tracer.chain()
def handle_tool_calls(tool_calls, messages):
    for tool_call in tool_calls:
        function = tool_implementations[tool_call.function.name]
        function_args = json.loads(tool_call.function.arguments)
        result = function(**function_args)
                    
        messages.append({"role": "tool", "content": result, "tool_call_id": tool_call.id})
    return messages


In [15]:
def start_main_span(messages):
    print("Starting main span with messages:", messages)
    
    
    with tracer.start_as_current_span("AgentRun", openinference_span_kind="agent") as span:
        span.set_input(value=messages)
        ret = run_agent(messages)
        print("Main span completed with return value:", ret)
        span.set_output(value=ret)
        span.set_status(StatusCode.OK)
        return ret

def run_agent(messages):
    print("Running agent with messages:", messages)
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]
        print("Converted string message to list format")
    
    # Check and add system prompt if needed
    if not any(
            isinstance(message, dict) and message.get("role") == "system" for message in messages
        ):
            system_prompt = {"role": "system", "content": "You are a helpful assistant that can answer questions about the Store Sales Price Elasticity Promotions dataset."}
            messages.append(system_prompt)
            print("Added system prompt to messages")

    while True:
        # Router call span
        print("Starting router call span")
        with tracer.start_as_current_span(
            "router_call",
            openinference_span_kind="chain",
        ) as span:
            span.set_input(value=messages)
            
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                tools=tools,
            )

            messages.append(response.choices[0].message.model_dump())
            tool_calls = response.choices[0].message.tool_calls
            print("Received response with tool calls:", bool(tool_calls))
            span.set_status(StatusCode.OK)
            
            if tool_calls:
                # Tool calls span
                print("Processing tool calls")
                messages = handle_tool_calls(tool_calls, messages)
                span.set_output(value=tool_calls)
            else:
                print("No tool calls, returning final response")
                span.set_output(value=response.choices[0].message.content)

                return response.choices[0].message.content


## Run the agent

In [84]:
ret = start_main_span([{"role": "user", "content": "Create a line chart showing sales in 2021"}])
print(Markdown(ret))

Starting main span with messages: [{'role': 'user', 'content': 'Create a line chart showing sales in 2021'}]
Running agent with messages: [{'role': 'user', 'content': 'Create a line chart showing sales in 2021'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span
Received response with tool calls: False
No tool calls, returning final response
Main span completed with return value: I have created the line chart showing the total sales value in 2021 based on the available data for November and December. The chart visualizes the sales performance during these two months. If you need any modifications or additional insights, let me know!
<IPython.core.display.Markdown object>


In [27]:
agent_questions = [
    "What was the most popular product SKU?",
    "What was the total revenue across all stores?",
    "Which store had the highest sales volume?",
    "Create a bar chart showing total sales by store",
    "What percentage of items were sold on promotion?",
    "Plot daily sales volume over time", 
    "What was the average transaction value?",
    "Create a box plot of transaction values",
    "Which products were frequently purchased together?",
    "Plot a line graph showing the sales trend over time with a 7-day moving average"
]

for question in tqdm(agent_questions, desc="Processing questions"):
    try:
        ret = start_main_span([{"role": "user", "content": question}])
    except Exception as e:
        print(f"Error processing question: {question}")
        print(e)
        continue

I0000 00:00:1737590793.902275 17274972 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
Processing questions:   0%|          | 0/10 [00:00<?, ?it/s]

Starting main span with messages: [{'role': 'user', 'content': 'What was the most popular product SKU?'}]
Running agent with messages: [{'role': 'user', 'content': 'What was the most popular product SKU?'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions:  10%|█         | 1/10 [00:02<00:25,  2.83s/it]

Received response with tool calls: False
No tool calls, returning final response
Main span completed with return value: The most popular product SKU was 6200700, with a total quantity sold of 52,262 units.
Starting main span with messages: [{'role': 'user', 'content': 'What was the total revenue across all stores?'}]
Running agent with messages: [{'role': 'user', 'content': 'What was the total revenue across all stores?'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions:  20%|██        | 2/10 [00:05<00:19,  2.45s/it]

Received response with tool calls: False
No tool calls, returning final response
Main span completed with return value: The total revenue across all stores was approximately $13,272,640.
Starting main span with messages: [{'role': 'user', 'content': 'Which store had the highest sales volume?'}]
Running agent with messages: [{'role': 'user', 'content': 'Which store had the highest sales volume?'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions:  30%|███       | 3/10 [00:14<00:39,  5.58s/it]

Received response with tool calls: False
No tool calls, returning final response
Main span completed with return value: The store with the highest sales volume is Store Number 2970, with a total sales volume of 59,322.0.
Starting main span with messages: [{'role': 'user', 'content': 'Create a bar chart showing total sales by store'}]
Running agent with messages: [{'role': 'user', 'content': 'Create a bar chart showing total sales by store'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions:  40%|████      | 4/10 [00:43<01:28, 14.79s/it]

Received response with tool calls: True
Processing tool calls
Error processing question: Create a bar chart showing total sales by store
'run_python_code'
Starting main span with messages: [{'role': 'user', 'content': 'What percentage of items were sold on promotion?'}]
Running agent with messages: [{'role': 'user', 'content': 'What percentage of items were sold on promotion?'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions:  50%|█████     | 5/10 [00:45<00:51, 10.32s/it]

Received response with tool calls: False
No tool calls, returning final response
Main span completed with return value: Approximately 62.56% of items were sold on promotion.
Starting main span with messages: [{'role': 'user', 'content': 'Plot daily sales volume over time'}]
Running agent with messages: [{'role': 'user', 'content': 'Plot daily sales volume over time'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions:  60%|██████    | 6/10 [09:15<12:00, 180.12s/it]

Received response with tool calls: True
Processing tool calls
Error processing question: Plot daily sales volume over time
'run_python_code'
Starting main span with messages: [{'role': 'user', 'content': 'What was the average transaction value?'}]
Running agent with messages: [{'role': 'user', 'content': 'What was the average transaction value?'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions:  70%|███████   | 7/10 [09:17<06:06, 122.02s/it]

Received response with tool calls: False
No tool calls, returning final response
Main span completed with return value: The average transaction value was approximately **$19.02**.
Starting main span with messages: [{'role': 'user', 'content': 'Create a box plot of transaction values'}]
Running agent with messages: [{'role': 'user', 'content': 'Create a box plot of transaction values'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions:  80%|████████  | 8/10 [09:28<02:53, 86.63s/it] 

Error processing question: Create a box plot of transaction values
Error code: 400 - {'error': {'message': "Invalid 'messages[3].content': string too long. Expected a string with maximum length 1048576, but got a string with length 17447374 instead.", 'type': 'invalid_request_error', 'param': 'messages[3].content', 'code': 'string_above_max_length'}}
Starting main span with messages: [{'role': 'user', 'content': 'Which products were frequently purchased together?'}]
Running agent with messages: [{'role': 'user', 'content': 'Which products were frequently purchased together?'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions:  90%|█████████ | 9/10 [09:36<01:01, 61.99s/it]

Error processing question: Which products were frequently purchased together?
Error code: 400 - {'error': {'message': "Invalid 'messages[3].content': string too long. Expected a string with maximum length 1048576, but got a string with length 11902599 instead.", 'type': 'invalid_request_error', 'param': 'messages[3].content', 'code': 'string_above_max_length'}}
Starting main span with messages: [{'role': 'user', 'content': 'Plot a line graph showing the sales trend over time with a 7-day moving average'}]
Running agent with messages: [{'role': 'user', 'content': 'Plot a line graph showing the sales trend over time with a 7-day moving average'}]
Added system prompt to messages
Starting router call span
Received response with tool calls: True
Processing tool calls
Starting router call span


Processing questions: 100%|██████████| 10/10 [10:08<00:00, 60.90s/it]

Error processing question: Plot a line graph showing the sales trend over time with a 7-day moving average
Error code: 400 - {'error': {'message': "Invalid 'messages[3].content': string too long. Expected a string with maximum length 1048576, but got a string with length 70487394 instead.", 'type': 'invalid_request_error', 'param': 'messages[3].content', 'code': 'string_above_max_length'}}





# Evaluating the agent

In [17]:
OpenAIInstrumentor().uninstrument()

In [16]:
import phoenix as px
from phoenix.evals import (
    TOOL_CALLING_PROMPT_TEMPLATE, 
    llm_classify,
    OpenAIModel
)
from phoenix.trace import SpanEvaluations
from phoenix.experiments import run_experiment, evaluate_experiment
from phoenix.trace.dsl import SpanQuery
from phoenix.experiments.types import Example
from phoenix.experiments.evaluators import create_evaluator

import nest_asyncio
nest_asyncio.apply()

In [17]:
px_client = px.Client()
eval_model = OpenAIModel(model="gpt-4o-mini")

## Function Calling Evals using LLM as a Judge

In [22]:
query = SpanQuery().where(
    # Filter for the `RETRIEVER` span kind.
    # The filter condition is a string of valid Python boolean expression.
    "span_kind == 'LLM'",
).select(
    # Extract the span attribute `input.value` which contains the query for the
    # retriever. Rename it as the `input` column in the output dataframe.
    question="input.value",
    output_messages="llm.tools"
    
)

# The Phoenix Client can take this query and return the dataframe.
tool_calls_df = px.Client().query_spans(query, project_name=project_name, timeout=None)
tool_calls_df = tool_calls_df.dropna(subset=["output_messages"])
tool_calls_df["tool_call"] = tool_calls_df["output_messages"]

tool_calls_df.head()

Unnamed: 0_level_0,question,output_messages,tool_call
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d8d048cfadc1b8fb,"{""messages"": [{""role"": ""user"", ""content"": ""Whi...","[{'tool': {'json_schema': '{""type"": ""function""...","[{'tool': {'json_schema': '{""type"": ""function""..."
5eeb77eb2c180656,"{""messages"": [{""role"": ""user"", ""content"": ""Whi...","[{'tool': {'json_schema': '{""type"": ""function""...","[{'tool': {'json_schema': '{""type"": ""function""..."
243ffb56898e032c,"{""messages"": [{""role"": ""user"", ""content"": ""Wha...","[{'tool': {'json_schema': '{""type"": ""function""...","[{'tool': {'json_schema': '{""type"": ""function""..."


In [23]:
tool_call_eval = llm_classify(
    dataframe = tool_calls_df,
    template = TOOL_CALLING_PROMPT_TEMPLATE.template.replace("{tool_definitions}", "generate_visualization, lookup_sales_data, analyze_sales_data, run_python_code"),
    rails = ['correct', 'incorrect'],
    model=eval_model,
    provide_explanation=True
)

tool_call_eval['score'] = tool_call_eval.apply(lambda x: 1 if x['label']=='correct' else 0, axis=1)

tool_call_eval.head()

llm_classify |          | 0/3 (0.0%) | ⏳ 00:00<? | ?it/s

Unnamed: 0_level_0,label,explanation,exceptions,execution_status,execution_seconds,score
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
d8d048cfadc1b8fb,correct,The tools called include 'lookup_sales_data' t...,[],COMPLETED,1.931483,1
5eeb77eb2c180656,correct,The tool call to 'lookup_sales_data' is approp...,[],COMPLETED,2.468532,1
243ffb56898e032c,incorrect,The chosen tools include 'lookup_sales_data' t...,[],COMPLETED,2.350198,0


In [24]:
px.Client().log_evaluations(
    SpanEvaluations(eval_name="Tool Calling Eval", dataframe=tool_call_eval),
)

### Function Calling Evals using Ground Truth

In [25]:
agent_tool_responses = {
    "What was the most popular product SKU?": "lookup_sales_data, analyze_sales_data",
    "What was the total revenue across all stores?": "lookup_sales_data, analyze_sales_data",
    "Which store had the highest sales volume?": "lookup_sales_data, analyze_sales_data",
    "Create a bar chart showing total sales by store": "generate_visualization, lookup_sales_data, run_python_code",
    "What percentage of items were sold on promotion?": "lookup_sales_data, analyze_sales_data",
    "Plot daily sales volume over time": "generate_visualization, lookup_sales_data, run_python_code", 
    "What was the average transaction value?": "lookup_sales_data, analyze_sales_data",
    "Create a box plot of transaction values": "generate_visualization, lookup_sales_data, run_python_code",
    "Which products were frequently purchased together?": "lookup_sales_data, analyze_sales_data",
    "Plot a line graph showing the sales trend over time with a 7-day moving average": "generate_visualization, lookup_sales_data, run_python_code"
}


tool_calling_df = pd.DataFrame(agent_tool_responses.items(), columns=["question", "tool_calls"])
dataset = px_client.upload_dataset(dataframe=tool_calling_df, dataset_name="tool_calling_ground_truth", input_keys=["question"], output_keys=["tool_calls"])

📤 Uploading dataset...
💾 Examples uploaded: http://localhost:6006/datasets/RGF0YXNldDox/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246MQ==


In [26]:
def run_router_step(example: Example) -> str:
    messages = [{"role": "system", "content": "You are a helpful assistant that can answer questions about the Store Sales Price Elasticity Promotions dataset."}]
    messages.append({"role": "user", "content": example.input.get("question")})
    
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        tools=tools,
    )
    tool_calls = []
    for tool_call in response.choices[0].message.tool_calls:
        tool_calls.append(tool_call.function.name)
    return tool_calls


In [27]:
def tools_match(expected: str, output: str) -> bool:
    expected_tools = expected.get("tool_calls").split(", ")
    return expected_tools == output


In [28]:
experiment = run_experiment(dataset,
                            run_router_step,
                            evaluators=[tools_match],
                            experiment_name="Tool Calling Eval",
                            experiment_description="Evaluating the tool calling step of the agent")


🧪 Experiment started.
📺 View dataset experiments: http://localhost:6006/datasets/RGF0YXNldDox/experiments
🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDox/compare?experimentId=RXhwZXJpbWVudDox


running tasks |          | 0/10 (0.0%) | ⏳ 00:00<? | ?it/s

✅ Task runs completed.
🧠 Evaluation started.


running experiment evaluations |          | 0/10 (0.0%) | ⏳ 00:00<? | ?it/s


🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDox/compare?experimentId=RXhwZXJpbWVudDox

Experiment Summary (01/16/25 06:00 PM -0800)
--------------------------------------------
| evaluator   |   n |   n_scores |   avg_score |   n_labels | top_2_labels   |
|:------------|----:|-----------:|------------:|-----------:|:---------------|
| tools_match |  10 |         10 |           0 |         10 | {'False': 10}  |

Tasks Summary (01/16/25 06:00 PM -0800)
---------------------------------------
|   n_examples |   n_runs |   n_errors |
|-------------:|---------:|-----------:|
|           10 |       10 |          0 |


## Tool Evals

### Evaluating our SQL generation tool

In [31]:
# This step will be replaced by a human annotated set of ground truth data, instead of generated examples

db_lookup_questions = [
    "What was the most popular product SKU?",
    "Which store had the highest total sales value?", 
    "How many items were sold on promotion?",
    "What was the average quantity sold per transaction?",
    "Which product class code generated the most revenue?",
    "What day of the week had the highest sales volume?",
    "How many unique stores made sales?",
    "What was the highest single transaction value?",
    "Which products were frequently sold together?",
    "What's the trend in sales over time?"
]

expected_results = []

for question in tqdm(db_lookup_questions, desc="Processing SQL lookup questions"):
    try:
        with suppress_tracing():
            expected_results.append(lookup_sales_data(question))
    except Exception as e:
        print(f"Error processing question: {question}")
        print(e)
        db_lookup_questions.remove(question)

# Create a DataFrame with the questions
questions_df = pd.DataFrame({
    'question': db_lookup_questions,
    'expected_result': expected_results
})

display(questions_df)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing SQL lookup questions: 100%|██████████| 10/10 [00:12<00:00,  1.28s/it]


Unnamed: 0,question,expected_result
0,What was the most popular product SKU?,SKU_Coded Total_Quantity_Sold\n0 620070...
1,Which store had the highest total sales value?,Store_Number Total_Sales_Value\n0 ...
2,How many items were sold on promotion?,Total_Items_Sold_On_Promotion\n0 ...
3,What was the average quantity sold per transac...,Average_Quantity_Sold_Per_Transaction\n0 ...
4,Which product class code generated the most re...,Product_Class_Code Total_Revenue\n0 ...
5,What day of the week had the highest sales vol...,Day_Of_Week Total_Sales_Volume\n0 ...
6,How many unique stores made sales?,unique_stores\n0 35
7,What was the highest single transaction value?,Highest_Single_Transaction_Value\n0 ...
8,Which products were frequently sold together?,Product_A Product_B Frequency\n0 ...
9,What's the trend in sales over time?,Sales_Month Total_Quantity_Sold Total_Sal...


In [32]:
dataset = px_client.upload_dataset(dataframe=questions_df, dataset_name="sales_db_lookup_questions", input_keys=["question"], output_keys=["expected_result"])

📤 Uploading dataset...
💾 Examples uploaded: http://localhost:6006/datasets/RGF0YXNldDoy/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246Mg==


In [33]:
def run_sql_query(example: Example) -> str:
    with suppress_tracing():
        return lookup_sales_data(example.input.get("question"))

In [34]:
def evaluate_sql_result(output: str, expected: str) -> bool:    
    # Extract just the numbers from both strings
    result_nums = ''.join(filter(str.isdigit, output))
    expected_nums = ''.join(filter(str.isdigit, expected.get("expected_result")))
    return result_nums == expected_nums

In [35]:
experiment = run_experiment(dataset,
                            run_sql_query,
                            evaluators=[evaluate_sql_result],
                            experiment_name="SQL Query Eval",
                            experiment_description="Evaluating the SQL query generation step of the agent")

🧪 Experiment started.
📺 View dataset experiments: http://localhost:6006/datasets/RGF0YXNldDoy/experiments
🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDoy/compare?experimentId=RXhwZXJpbWVudDoy


running tasks |          | 0/10 (0.0%) | ⏳ 00:00<? | ?it/s

✅ Task runs completed.
🧠 Evaluation started.


running experiment evaluations |          | 0/10 (0.0%) | ⏳ 00:00<? | ?it/s


🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDoy/compare?experimentId=RXhwZXJpbWVudDoy

Experiment Summary (01/16/25 06:02 PM -0800)
--------------------------------------------
| evaluator           |   n |   n_scores |   avg_score |   n_labels | top_2_labels            |
|:--------------------|----:|-----------:|------------:|-----------:|:------------------------|
| evaluate_sql_result |  10 |         10 |         0.8 |         10 | {'True': 8, 'False': 2} |

Tasks Summary (01/16/25 06:02 PM -0800)
---------------------------------------
|   n_examples |   n_runs |   n_errors |
|-------------:|---------:|-----------:|
|           10 |       10 |          0 |


### Evaluating our Python code generation tool

In [36]:
# Replace this with a human annotated set of ground truth data, instead of generated examples

code_generation_questions = [
    "Create a bar chart showing total sales by store",
    "Plot daily sales volume over time", 
    "Plot a line graph showing the sales trend over time with a 7-day moving average",
    "Create a histogram of quantities sold per transaction",
    "Generate a pie chart showing sales distribution across product classes",
    "Create a stacked bar chart showing promotional vs non-promotional sales by store",
    "Generate a heatmap of sales by day of week and store number",
    "Plot a line chart comparing sales trends between top 5 stores"
]

example_data = []
chart_configs = []
for question in tqdm(code_generation_questions[:], desc="Processing code generation questions"):
    try:
        with suppress_tracing():
            example_data.append(lookup_sales_data(question))
            chart_configs.append(json.dumps(extract_chart_config(example_data[-1], question)))
    except Exception as e:
        print(f"Error processing question: {question}")
        print(e)
        code_generation_questions.remove(question)

code_generation_df = pd.DataFrame({
    'question': code_generation_questions,
    'example_data': example_data,
    'chart_configs': chart_configs
})

dataset = px_client.upload_dataset(dataframe=code_generation_df, dataset_name="code_generation_questions", input_keys=["question", "example_data", "chart_configs"])




[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


Processing code generation questions: 100%|██████████| 8/8 [00:20<00:00,  2.51s/it]

📤 Uploading dataset...
💾 Examples uploaded: http://localhost:6006/datasets/RGF0YXNldDoz/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246Mw==





In [46]:
def run_code_generation(example: Example) -> str:
    with suppress_tracing():
        chart_config = extract_chart_config(data=example.input.get("example_data"), visualization_goal=example.input.get("question"))
        code = generate_visualization(visualization_goal=example.input.get("question"), 
                                    data=example.input.get("example_data"))
    
    return {"code": code, "chart_config": chart_config}

In [38]:
def code_is_runnable(output: str) -> bool:
    """Check if the code is runnable"""
    output = output.get("code")
    output = output.strip()
    output = output.replace("```python", "").replace("```", "")
    try:
        exec(output)
        return True
    except Exception as e:
        return False


In [39]:
def evaluate_chart_config(output: str, expected: str) -> bool:
    return output.get("chart_config") == expected.get("chart_config")

In [47]:
experiment = run_experiment(dataset,
                            run_code_generation,
                            evaluators=[code_is_runnable, evaluate_chart_config],
                            experiment_name="Code Generation Eval",
                            experiment_description="Evaluating the code generation step of the agent")

# Evaluating the agent path and convergence

In [28]:
# Replace this with a human annotated set of ground truth data, instead of generated examples

convergence_questions = [
    "What was the average quantity sold per transaction?",
    "What is the mean number of items per sale?", 
    "Calculate the typical quantity per transaction",
    "Show me the average number of units sold in each transaction",
    "What's the mean transaction size in terms of quantity?",
    "On average, how many items were purchased per transaction?",
    "What is the average basket size per sale?",
    "Calculate the mean number of products per purchase",
    "What's the typical number of units per order?",
    "Find the average quantity of items in each transaction",
    "What is the average number of products bought per purchase?",
    "Tell me the mean quantity of items in a typical transaction",
    "How many items does a customer buy on average per transaction?",
    "What's the usual number of units in each sale?",
    "Calculate the average basket quantity per order",
    "What is the typical amount of products per transaction?",
    "Show the mean number of items customers purchase per visit",
    "What's the average quantity of units per shopping trip?",
    "How many products do customers typically buy in one transaction?",
    "What is the standard basket size in terms of quantity?"
]

convergence_df = pd.DataFrame({
    'question': convergence_questions
})

dataset = px_client.upload_dataset(dataframe=convergence_df, dataset_name="convergence_questions", input_keys=["question"])

📤 Uploading dataset...
💾 Examples uploaded: http://localhost:6006/datasets/RGF0YXNldDoy/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246Mg==


In [29]:
def format_message_steps(messages):
    """
    Convert a list of message objects into a readable format that shows the steps taken.

    Args:
        messages (list): A list of message objects containing role, content, tool calls, etc.

    Returns:
        str: A readable string showing the steps taken.
    """
    steps = []
    for message in messages:
        role = message.get("role")
        if role == "user":
            steps.append(f"User: {message.get('content')}")
        elif role == "system":
            steps.append("System: Provided context")
        elif role == "assistant":
            if message.get("tool_calls"):
                for tool_call in message["tool_calls"]:
                    tool_name = tool_call["function"]["name"]
                    steps.append(f"Assistant: Called tool '{tool_name}'")
            else:
                steps.append(f"Assistant: {message.get('content')}")
        elif role == "tool":
            steps.append(f"Tool response: {message.get('content')}")
    
    return "\n".join(steps)

In [30]:
def run_agent_and_track_path(example: Example) -> str:
    print("Starting main span with messages:", example.input.get("question"))
    messages = [{"role": "user", "content": example.input.get("question")}]
    ret = run_agent_messages(messages)
    return {"path_length": len(ret), "messages": format_message_steps(ret)}

def run_agent_messages(messages):
    print("Running agent with messages:", messages)
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]
        print("Converted string message to list format")
    
    # Check and add system prompt if needed
    if not any(
            isinstance(message, dict) and message.get("role") == "system" for message in messages
        ):
            system_prompt = {"role": "system", "content": "You are a helpful assistant that can answer questions about the Store Sales Price Elasticity Promotions dataset."}
            messages.append(system_prompt)
            print("Added system prompt to messages")

    while True:
        # Router call span
        print("Starting router")
            
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tools,
        )

        messages.append(response.choices[0].message.model_dump())
        tool_calls = response.choices[0].message.tool_calls
        print("Received response with tool calls:", bool(tool_calls))
        
        if tool_calls:
            # Tool calls span
            print("Processing tool calls")
            tool_calls = response.choices[0].message.tool_calls
            messages = handle_tool_calls(tool_calls, messages)
        else:
            print("No tool calls, returning final response")
            return messages


In [31]:
experiment = run_experiment(dataset,
                            run_agent_and_track_path,
                            experiment_name="Convergence Eval",
                            experiment_description="Evaluating the convergence of the agent")

🧪 Experiment started.
📺 View dataset experiments: http://localhost:6006/datasets/RGF0YXNldDoy/experiments
🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDoy/compare?experimentId=RXhwZXJpbWVudDox


I0000 00:00:1737727433.065744  608053 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


running tasks |          | 0/20 (0.0%) | ⏳ 00:00<? | ?it/s

Starting main span with messages: What was the average quantity sold per transaction?
Running agent with messages: [{'role': 'user', 'content': 'What was the average quantity sold per transaction?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Starting main span with messages: What is the mean number of items per sale?
Running agent with messages: [{'role': 'user', 'content': 'What is the mean number of items per sale?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Starting main span with messages: Calculate the typical quantity per transaction
Running agent with messages: [{'role': 'user', 'content': 'Calculate the typical quantity per transaction'}]
Added system p

In [24]:
experiment.as_dataframe()

Unnamed: 0_level_0,error,output,input,example_id
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RXhwZXJpbWVudFJ1bjox,,"{'path_length': 5, 'messages': 'User: What was...",{'question': 'What was the average quantity so...,RGF0YXNldEV4YW1wbGU6MQ==
RXhwZXJpbWVudFJ1bjoy,,"{'path_length': 5, 'messages': 'User: What is ...",{'question': 'What is the mean number of items...,RGF0YXNldEV4YW1wbGU6Mg==
RXhwZXJpbWVudFJ1bjoz,,"{'path_length': 5, 'messages': 'User: Calculat...",{'question': 'Calculate the typical quantity p...,RGF0YXNldEV4YW1wbGU6Mw==
RXhwZXJpbWVudFJ1bjo0,BadRequestError('Error code: 400 - {\'error\':...,,{'question': 'Show me the average number of un...,RGF0YXNldEV4YW1wbGU6NA==
RXhwZXJpbWVudFJ1bjo1,,"{'path_length': 5, 'messages': 'User: What's t...",{'question': 'What's the mean transaction size...,RGF0YXNldEV4YW1wbGU6NQ==
RXhwZXJpbWVudFJ1bjo2,,"{'path_length': 5, 'messages': 'User: On avera...","{'question': 'On average, how many items were ...",RGF0YXNldEV4YW1wbGU6Ng==
RXhwZXJpbWVudFJ1bjo3,,"{'path_length': 5, 'messages': 'User: What is ...",{'question': 'What is the average basket size ...,RGF0YXNldEV4YW1wbGU6Nw==
RXhwZXJpbWVudFJ1bjo4,,"{'path_length': 5, 'messages': 'User: Calculat...",{'question': 'Calculate the mean number of pro...,RGF0YXNldEV4YW1wbGU6OA==
RXhwZXJpbWVudFJ1bjo5,,"{'path_length': 5, 'messages': 'User: What's t...",{'question': 'What's the typical number of uni...,RGF0YXNldEV4YW1wbGU6OQ==
RXhwZXJpbWVudFJ1bjoxMA==,BadRequestError('Error code: 400 - {\'error\':...,,{'question': 'Find the average quantity of ite...,RGF0YXNldEV4YW1wbGU6MTA=


In [32]:
outputs = experiment.as_dataframe()["output"].to_dict().values()
optimal_path_length = min(output.get('path_length') for output in outputs if output and output.get('path_length') is not None)
print(f"The optimal path length is {optimal_path_length}")

The optimal path length is 5


In [33]:
@create_evaluator(name="Convergence Eval", kind="CODE")
def evaluate_path_length(output: str) -> float:
    if output and output.get("path_length"):
        return optimal_path_length/float(output.get("path_length"))
    else:
        return 0

In [34]:
experiment = evaluate_experiment(experiment,
                            evaluators=[evaluate_path_length])

🧠 Evaluation started.


running experiment evaluations |          | 0/20 (0.0%) | ⏳ 00:00<? | ?it/s


🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDoy/compare?experimentId=RXhwZXJpbWVudDox

Experiment Summary (01/24/25 09:09 AM -0500)
--------------------------------------------
| evaluator        |   n |   n_scores |   avg_score |
|:-----------------|----:|-----------:|------------:|
| Convergence Eval |  20 |         20 |         0.9 |

Tasks Summary (01/24/25 09:05 AM -0500)
---------------------------------------
|   n_examples |   n_runs |   n_errors | top_error                                                                                            |
|-------------:|---------:|-----------:|:-----------------------------------------------------------------------------------------------------|
|           20 |       20 |          2 | BadRequestError('Error code: 400 - {\'error\': {\'message\': "This model\'s maximum context length i |


# Combining all the evals into our experiment

### Build a version of our agent that tracks all the necessary information for evals

In [72]:
def process_messages(messages):
    tool_calls = []
    tool_responses = []
    final_output = None

    for i, message in enumerate(messages):
        # Extract tool calls
        if 'tool_calls' in message and message['tool_calls']:
            for tool_call in message['tool_calls']:
                tool_name = tool_call['function']['name']
                tool_input = tool_call['function']['arguments']
                tool_calls.append(tool_name)

                # Prepare tool response structure with tool name and input
                tool_responses.append({
                    "tool_name": tool_name,
                    "tool_input": tool_input,
                    "tool_response": None
                })

        # Extract tool responses
        if message['role'] == 'tool' and 'tool_call_id' in message:
            for tool_response in tool_responses:
                if message['tool_call_id'] in message.values():
                    tool_response["tool_response"] = message['content']

        # Extract final output
        if message['role'] == 'assistant' and not message.get('tool_calls') and not message.get('function_call'):
            final_output = message['content']

    result = {
        "tool_calls": tool_calls,
        "tool_responses": tool_responses,
        "final_output": final_output,
        "unchanged_messages": messages,
        "path_length": len(messages)
    }

    return result

In [73]:
def run_agent_and_track_path(example: Example) -> str:
    print("Starting main span with messages:", example.input.get("question"))
    messages = [{"role": "user", "content": example.input.get("question")}]
    ret = run_agent_messages(messages)
    return process_messages(ret)

def run_agent_messages(messages):
    print("Running agent with messages:", messages)
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]
        print("Converted string message to list format")
    
    # Check and add system prompt if needed
    if not any(
            isinstance(message, dict) and message.get("role") == "system" for message in messages
        ):
            system_prompt = {"role": "system", "content": "You are a helpful assistant that can answer questions about the Store Sales Price Elasticity Promotions dataset."}
            messages.append(system_prompt)
            print("Added system prompt to messages")

    while True:
        # Router call span
        print("Starting router")
            
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tools,
        )

        messages.append(response.choices[0].message.model_dump())
        tool_calls = response.choices[0].message.tool_calls
        print("Received response with tool calls:", bool(tool_calls))
        
        if tool_calls:
            # Tool calls span
            print("Processing tool calls")
            tool_calls = response.choices[0].message.tool_calls
            messages = handle_tool_calls(tool_calls, messages)
        else:
            print("No tool calls, returning final response")
            return messages


In [10]:
generate_sql_query("What was the most popular product SKU?", store_sales_df.columns, "sales")

'```sql\nSELECT SKU_Coded, SUM(Qty_Sold) AS Total_Qty_Sold\nFROM sales\nGROUP BY SKU_Coded\nORDER BY Total_Qty_Sold DESC\nLIMIT 1;\n```'

In [12]:
overall_experiment_questions = [
    {"question": "What was the most popular product SKU?",  "sql_result": "   SKU_Coded  Total_Qty_Sold 0    6200700         52262.0"}, 
    {"question": "What was the total revenue across all stores?", "sql_result": "   Total_Revenue 0   1.327264e+07"},
    {"question": "Which store had the highest sales volume?", "sql_result": "   Store_Number  Total_Sales_Volume 0          2970             59322.0"},
    {"question": "Create a bar chart showing total sales by store", "sql_result": "    Store_Number    Total_Sales 0            880  420302.088397 1           1650  580443.007953 2           4180  272208.118542 3            550  229727.498752 4           1100  497509.528013 5           3300  619660.167018 6           3190  335035.018792 7           2970  836341.327191 8           3740  359729.808228 9           2530  324046.518720 10          4400   95745.620250 11          1210  508393.767785 12           330  370503.687331 13          2750  453664.808068 14          1980  242290.828499 15          1760  350747.617798 16          3410  410567.848126 17           990  378433.018639 18          4730  239711.708869 19          4070  322307.968330 20          3080  495458.238811 21          2090  309996.247965 22          1320  592832.067579 23          2640  308990.318559 24          1540  427777.427815 25          4840  389056.668316 26          2860  132320.519487 27          2420  406715.767402 28           770  292968.918642 29          3520  145701.079372 30           660  343594.978075 31          3630  405034.547846 32          2310  412579.388504 33          2200  361173.288199 34          1870  401070.997685"},
    {"question": "What percentage of items were sold on promotion?", "sql_result": "   Promotion_Percentage 0              0.625596"},
    {"question": "What was the average transaction value?", "sql_result": "   Average_Transaction_Value 0                  19.018132"},
    {"question": "Create a line chart showing sales in 2021", "sql_result": "  sale_month  total_quantity_sold  total_sales_value 0 2021-11-01              43056.0      499984.428193 1 2021-12-01              75724.0      910982.118423"}
]

overall_experiment_questions[0]["sql_generated"] = generate_sql_query(overall_experiment_questions[0]["question"], store_sales_df.columns, "sales")
overall_experiment_questions[1]["sql_generated"] = generate_sql_query(overall_experiment_questions[1]["question"], store_sales_df.columns, "sales")
overall_experiment_questions[2]["sql_generated"] = generate_sql_query(overall_experiment_questions[2]["question"], store_sales_df.columns, "sales")
overall_experiment_questions[3]["sql_generated"] = generate_sql_query(overall_experiment_questions[3]["question"], store_sales_df.columns, "sales")
overall_experiment_questions[4]["sql_generated"] = generate_sql_query(overall_experiment_questions[4]["question"], store_sales_df.columns, "sales")
overall_experiment_questions[5]["sql_generated"] = generate_sql_query(overall_experiment_questions[5]["question"], store_sales_df.columns, "sales")
overall_experiment_questions[6]["sql_generated"] = generate_sql_query(overall_experiment_questions[6]["question"], store_sales_df.columns, "sales")

print(overall_experiment_questions[6])

# overall_experiment_df = pd.DataFrame(overall_experiment_questions)

# dataset = px_client.upload_dataset(dataframe=overall_experiment_df, dataset_name="overall_experiment_questions_all", input_keys=["question"], output_keys=["sql_result"])

[{'question': 'What was the most popular product SKU?', 'sql_result': '   SKU_Coded  Total_Qty_Sold 0    6200700         52262.0', 'sql_generated': '```sql\nSELECT SKU_Coded, SUM(Qty_Sold) AS Total_Qty_Sold\nFROM sales\nGROUP BY SKU_Coded\nORDER BY Total_Qty_Sold DESC\nLIMIT 1;\n```'}, {'question': 'What was the total revenue across all stores?', 'sql_result': '   Total_Revenue 0   1.327264e+07', 'sql_generated': '```sql\nSELECT SUM(Total_Sale_Value) AS Total_Revenue\nFROM sales;\n```'}, {'question': 'Which store had the highest sales volume?', 'sql_result': '   Store_Number  Total_Sales_Volume 0          2970             59322.0', 'sql_generated': '```sql\nSELECT Store_Number, SUM(Total_Sale_Value) AS Total_Sales\nFROM sales\nGROUP BY Store_Number\nORDER BY Total_Sales DESC\nLIMIT 1;\n```'}, {'question': 'Create a bar chart showing total sales by store', 'sql_result': '    Store_Number    Total_Sales 0            880  420302.088397 1           1650  580443.007953 2           4180  272

In [13]:
print(overall_experiment_questions[6])

{'question': 'Create a line chart showing sales in 2021', 'sql_result': '  sale_month  total_quantity_sold  total_sales_value 0 2021-11-01              43056.0      499984.428193 1 2021-12-01              75724.0      910982.118423', 'sql_generated': '```sql\nSELECT MONTH(Sold_Date) AS Month, SUM(Total_Sale_Value) AS Total_Sales\nFROM sales\nWHERE YEAR(Sold_Date) = 2021\nGROUP BY MONTH(Sold_Date)\nORDER BY MONTH(Sold_Date);\n```'}


In [None]:
[
    {'question': 'What was the most popular product SKU?', 'sql_result': '   SKU_Coded  Total_Qty_Sold 0    6200700         52262.0', 'sql_generated': '```sql\nSELECT SKU_Coded, SUM(Qty_Sold) AS Total_Qty_Sold\nFROM sales\nGROUP BY SKU_Coded\nORDER BY Total_Qty_Sold DESC\nLIMIT 1;\n```'},
    {'question': 'What was the total revenue across all stores?', 'sql_result': '   Total_Revenue 0   1.327264e+07', 'sql_generated': '```sql\nSELECT SUM(Total_Sale_Value) AS Total_Revenue\nFROM sales;\n```'},
    {'question': 'Which store had the highest sales volume?', 'sql_result': '   Store_Number  Total_Sales_Volume 0          2970             59322.0', 'sql_generated': '```sql\nSELECT Store_Number, SUM(Total_Sale_Value) AS Total_Sales_Volume\nFROM sales\nGROUP BY Store_Number\nORDER BY Total_Sales_Volume DESC\nLIMIT 1;\n```'},
    {'question': 'Create a bar chart showing total sales by store', 'sql_result': '    Store_Number    Total_Sales 0            880  420302.088397 1           1650  580443.007953 2           4180  272208.118542 3            550  229727.498752 4           1100  497509.528013 5           3300  619660.167018 6           3190  335035.018792 7           2970  836341.327191 8           3740  359729.808228 9           2530  324046.518720 10          4400   95745.620250 11          1210  508393.767785 12           330  370503.687331 13          2750  453664.808068 14          1980  242290.828499 15          1760  350747.617798 16          3410  410567.848126 17           990  378433.018639 18          4730  239711.708869 19          4070  322307.968330 20          3080  495458.238811 21          2090  309996.247965 22          1320  592832.067579 23          2640  308990.318559 24          1540  427777.427815 25          4840  389056.668316 26          2860  132320.519487 27          2420  406715.767402 28           770  292968.918642 29          3520  145701.079372 30           660  343594.978075 31          3630  405034.547846 32          2310  412579.388504 33          2200  361173.288199 34          1870  401070.997685', 'sql_generated': '```sql\nSELECT Store_Number, SUM(Total_Sale_Value) AS Total_Sales\nFROM sales\nGROUP BY Store_Number;\n```'},
    {'question': 'What percentage of items were sold on promotion?', 'sql_result': '   Promotion_Percentage 0              0.625596', 'sql_generated': "```sql\nSELECT \n    (SUM(CASE WHEN On_Promo = 'Yes' THEN 1 ELSE 0 END) * 100.0) / COUNT(*) AS Promotion_Percentage\nFROM \n    sales;\n```"},
    {'question': 'What was the average transaction value?', 'sql_result': '   Average_Transaction_Value 0                  19.018132', 'sql_generated': '```sql\nSELECT AVG(Total_Sale_Value) AS Average_Transaction_Value\nFROM sales;\n```'},
    {'question': 'Create a line chart showing sales in 2021', 'sql_result': '  sale_month  total_quantity_sold  total_sales_value 0 2021-11-01              43056.0      499984.428193 1 2021-12-01              75724.0      910982.118423', 'sql_generated': '```sql\nSELECT MONTH(Sold_Date) AS Month, SUM(Total_Sale_Value) AS Total_Sales\nFROM sales\nWHERE YEAR(Sold_Date) = 2021\nGROUP BY MONTH(Sold_Date)\nORDER BY MONTH(Sold_Date);\n```'}
]


In [65]:
CLARITY_LLM_JUDGE_PROMPT = """
In this task, you will be presented with a query and an answer. Your objective is to evaluate the clarity 
of the answer in addressing the query. A clear response is one that is precise, coherent, and directly 
addresses the query without introducing unnecessary complexity or ambiguity. An unclear response is one 
that is vague, disorganized, or difficult to understand, even if it may be factually correct.

Your response should be a single word: either "clear" or "unclear," and it should not include any other 
text or characters. "clear" indicates that the answer is well-structured, easy to understand, and 
appropriately addresses the query. "unclear" indicates that the answer is ambiguous, poorly organized, or 
not effectively communicated. Please carefully consider the query and answer before determining your 
response.

After analyzing the query and the answer, you must write a detailed explanation of your reasoning to 
justify why you chose either "clear" or "unclear." Avoid stating the final label at the beginning of your 
explanation. Your reasoning should include specific points about how the answer does or does not meet the 
criteria for clarity.

[BEGIN DATA]
Query: {query}
Answer: {response}
[END DATA]
Please analyze the data carefully and provide an explanation followed by your response.

EXPLANATION: Provide your reasoning step by step, evaluating the clarity of the answer based on the query.
LABEL: "clear" or "unclear"
"""

ENTITY_CORRECTNESS_LLM_JUDGE_PROMPT = """
In this task, you will be presented with a query and an answer. Your objective is to determine whether all 
the entities mentioned in the answer are correctly identified and accurately match those in the query. An 
entity refers to any specific person, place, organization, date, or other proper noun. Your evaluation 
should focus on whether the entities in the answer are correctly named and appropriately associated with 
the context in the query.

Your response should be a single word: either "correct" or "incorrect," and it should not include any 
other text or characters. "correct" indicates that all entities mentioned in the answer match those in the 
query and are properly identified. "incorrect" indicates that the answer contains errors or mismatches in 
the entities referenced compared to the query.

After analyzing the query and the answer, you must write a detailed explanation of your reasoning to 
justify why you chose either "correct" or "incorrect." Avoid stating the final label at the beginning of 
your explanation. Your reasoning should include specific points about how the entities in the answer do or 
do not match the entities in the query.

[BEGIN DATA]
Query: {query}
Answer: {response}
[END DATA]
Please analyze the data carefully and provide an explanation followed by your response.

EXPLANATION: Provide your reasoning step by step, evaluating whether the entities in the answer are 
correct and consistent with the query.
LABEL: "correct" or "incorrect"
"""

In [86]:
TOOL_CALLING_PROMPT_TEMPLATE.template.replace("{tool_definitions}", json.dumps(tools))

'\nYou are an evaluation assistant evaluating questions and tool calls to\ndetermine whether the tool called would answer the question. The tool\ncalls have been generated by a separate agent, and chosen from the list of\ntools provided below. It is your job to decide whether that agent chose\nthe right tool to call.\n\n    [BEGIN DATA]\n    ************\n    [Question]: {question}\n    ************\n    [Tool Called]: {tool_call}\n    [END DATA]\n\nYour response must be single word, either "correct" or "incorrect",\nand should not contain any text or characters aside from that word.\n"incorrect" means that the chosen tool would not answer the question,\nthe tool includes information that is not presented in the question,\nor that the tool signature includes parameter values that don\'t match\nthe formats specified in the tool signatures below.\n\n"correct" means the correct tool call was chosen, the correct parameters\nwere extracted from the question, the tool call generated is runna

In [89]:
def function_calling_eval(input: str, output: str) -> float:
    function_calls = output.get("tool_calls")
    if function_calls:
        eval_df = pd.DataFrame({
            "question": [input.get("question")] * len(function_calls),
            "tool_call": function_calls
        })
            
        tool_call_eval = llm_classify(
            dataframe = eval_df,
            template = TOOL_CALLING_PROMPT_TEMPLATE.template.replace("{tool_definitions}", json.dumps(tools).replace("{", '"').replace("}", '"')),
            rails = ['correct', 'incorrect'],
            model=eval_model,
            provide_explanation=True
        )

        tool_call_eval['score'] = tool_call_eval.apply(lambda x: 1 if x['label']=='correct' else 0, axis=1)
        return tool_call_eval['score'].mean()
    else:
        return 0
    
def code_is_runnable(output: str) -> bool:
    """Check if the code is runnable"""
    generated_code = output.get("tool_responses")
    if not generated_code:
        return True
    
    # Find first lookup_sales_data response
    generated_code = next((r for r in generated_code if r.get("tool_name") == "generate_visualization"), None)
    if not generated_code:
        return True
        
    # Get the first response
    generated_code = generated_code.get("tool_response", "")
    generated_code = generated_code.strip()
    generated_code = generated_code.replace("```python", "").replace("```", "")
    try:
        exec(generated_code)
        return True
    except Exception as e:
        return False
    
def evaluate_sql_result(output, expected) -> bool:    
    sql_result = output.get("tool_responses")
    if not sql_result:
        return True
    
    # Find first lookup_sales_data response
    sql_result = next((r for r in sql_result if r.get("tool_name") == "lookup_sales_data"), None)
    if not sql_result:
        return True
        
    # Get the first response
    sql_result = sql_result.get("tool_response", "")

    # Extract just the numbers from both strings
    result_nums = ''.join(filter(str.isdigit, sql_result))
    expected_nums = ''.join(filter(str.isdigit, expected.get("sql_result")))
    return result_nums == expected_nums

def evaluate_clarity(output: str, input: str) -> bool:
    df = pd.DataFrame({"query": [input.get("question")], "response": [output.get("final_output")]})
    response = llm_classify(
        dataframe=df,
        template=CLARITY_LLM_JUDGE_PROMPT,
        rails=["clear", "unclear"],
        model=eval_model,
        provide_explanation=True
    )
    return response['label'] == 'clear'
    
def evaluate_entity_correctness(output: str, input: str) -> bool:
    df = pd.DataFrame({"query": [input.get("question")], "response": [output.get("final_output")]})
    response = llm_classify(
        dataframe=df,
        template=ENTITY_CORRECTNESS_LLM_JUDGE_PROMPT,
        rails=["correct", "incorrect"],
        model=eval_model,
        provide_explanation=True
    )
    return response['label'] == 'correct'


In [90]:
def run_overall_experiment(example: Example) -> str:
    with suppress_tracing():
        return run_agent_and_track_path(example)

experiment = run_experiment(dataset,
                            run_overall_experiment,
                            evaluators=[function_calling_eval, evaluate_sql_result, evaluate_clarity, evaluate_entity_correctness, code_is_runnable],
                            experiment_name="Overall Experiment",
                            experiment_description="Evaluating the overall experiment")

🧪 Experiment started.
📺 View dataset experiments: http://localhost:6006/datasets/RGF0YXNldDo4/experiments
🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDo4/compare?experimentId=RXhwZXJpbWVudDoxNQ==


running tasks |          | 0/6 (0.0%) | ⏳ 00:00<? | ?it/s

Starting main span with messages: What was the most popular product SKU?
Running agent with messages: [{'role': 'user', 'content': 'What was the most popular product SKU?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Starting main span with messages: What was the total revenue across all stores?
Running agent with messages: [{'role': 'user', 'content': 'What was the total revenue across all stores?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Starting main span with messages: Which store had the highest sales volume?
Running agent with messages: [{'role': 'user', 'content': 'Which store had the highest sales volume?'}]
Added system prompt to messages
Starting rou

running experiment evaluations |          | 0/30 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

  return EvaluationResult(score=float(result))


llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

  return EvaluationResult(score=float(result))


llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

  return EvaluationResult(score=float(result))
  return EvaluationResult(score=float(result))


llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

  return EvaluationResult(score=float(result))


llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

  return EvaluationResult(score=float(result))


llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

  return EvaluationResult(score=float(result))


llm_classify |          | 0/1 (0.0%) | ⏳ 00:00<? | ?it/s

  return EvaluationResult(score=float(result))



🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDo4/compare?experimentId=RXhwZXJpbWVudDoxNQ==

Experiment Summary (01/24/25 10:29 AM -0500)
--------------------------------------------
| evaluator                   |   n |   n_scores |   avg_score |   n_labels | top_2_labels            |
|:----------------------------|----:|-----------:|------------:|-----------:|:------------------------|
| code_is_runnable            |   6 |          6 |    0.833333 |          6 | {'True': 5, 'False': 1} |
| evaluate_clarity            |   6 |          6 |    1        |          0 |                         |
| evaluate_entity_correctness |   6 |          6 |    1        |          0 |                         |
| evaluate_sql_result         |   6 |          6 |    0.666667 |          6 | {'True': 4, 'False': 2} |
| function_calling_eval       |   6 |          6 |    0.916667 |          0 |                         |

Tasks Summary (01/24/25 10:28 AM -0500)
-----------------------------

<Figure size 1200x600 with 0 Axes>