In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import phoenix as px
import os
import json
from tqdm import tqdm
from phoenix.evals import (
    TOOL_CALLING_PROMPT_TEMPLATE, 
    llm_classify,
    OpenAIModel
)
from phoenix.trace import SpanEvaluations
from phoenix.trace.dsl import SpanQuery
from openinference.instrumentation import suppress_tracing

# Applied once, right after the imports, before any agent or eval code runs.
# NOTE(review): presumably needed so libraries that drive their own asyncio
# event loop (e.g. phoenix.evals.llm_classify) can run inside the notebook
# kernel's already-running loop — confirm against the cells that call them.
import nest_asyncio
nest_asyncio.apply()

In [3]:
# init_phoenix is a project-local helper module; its source is not shown here.
from init_phoenix import init_phoenix
# Project name handed to init_phoenix via the project_name keyword.
PROJECT_NAME = "evaluating-agent"

# The call returns three objects, which are passed to Agent(...) in a later cell.
# NOTE(review): judging by the names these are an LLM client, a client used for
# tool-calling requests, and an OpenTelemetry tracer — confirm in init_phoenix.py.
client, tool_calling_client, tracer = init_phoenix(project_name=PROJECT_NAME)

OpenTelemetry Tracing Details
|  Phoenix Project: evaluating-agent
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: http://localhost:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [4]:
from tutorial1_code import Agent

In [5]:
# Build the agent under evaluation from the three objects returned by
# init_phoenix in an earlier cell.
agent = Agent(client, tool_calling_client, tracer)

In [6]:
# Questions sent to the agent, one run per question. Each run is started via
# agent.start_main_span(...) with a single user message.
# NOTE(review): presumably every run produces one trace in the Phoenix project
# that later evaluation cells query — confirm in tutorial1_code.Agent.
agent_questions = [
    "What was the most popular product SKU?",
    "What was the total revenue across all stores?",
    "Which store had the highest sales volume?",
    "Create a bar chart showing total sales by store",
    "What percentage of items were sold on promotion?",
    "What was the average transaction value?"
]

# (question, exception) pairs for runs that raised, so failures stay
# inspectable after the loop instead of being buried in the log output.
failed_questions = []

for question in tqdm(agent_questions, desc="Processing questions"):
    try:
        # `ret` keeps the most recent successful return value at notebook
        # scope in case cells outside this view read it.
        ret = agent.start_main_span([{"role": "user", "content": question}])
    except Exception as e:
        # Best-effort batch: one failing question must not abort the others.
        print(f"Error processing question: {question}")
        # repr() keeps the exception type visible even when the message is empty.
        print(repr(e))
        failed_questions.append((question, e))

# Only report when something went wrong, so a clean run prints nothing extra.
if failed_questions:
    print(f"{len(failed_questions)} of {len(agent_questions)} questions failed")

Processing questions:   0%|          | 0/6 [00:00<?, ?it/s]

Starting main span with messages: [{'role': 'user', 'content': 'What was the most popular product SKU?'}]
Running agent with messages: [{'role': 'user', 'content': 'What was the most popular product SKU?'}]
Starting router call span
Making router call to OpenAI
Received response with tool calls: True
Starting tool calls span
lookup_sales_data
Starting router call span
Making router call to OpenAI
Received response with tool calls: False
No tool calls, returning final response


Processing questions:  17%|█▋        | 1/6 [00:07<00:38,  7.77s/it]

Main span completed with return value: code=None observed_trends=None natural_language_response='The most popular product SKU was **6200700**, with a total quantity sold of **52,262** units.'
Starting main span with messages: [{'role': 'user', 'content': 'What was the total revenue across all stores?'}]
Running agent with messages: [{'role': 'user', 'content': 'What was the total revenue across all stores?'}]
Starting router call span
Making router call to OpenAI
Received response with tool calls: True
Starting tool calls span
lookup_sales_data
Starting router call span
Making router call to OpenAI
Received response with tool calls: False
No tool calls, returning final response


Processing questions:  33%|███▎      | 2/6 [00:12<00:24,  6.17s/it]

Main span completed with return value: code=None observed_trends=None natural_language_response='The total revenue across all stores was approximately $13,272,640.'
Starting main span with messages: [{'role': 'user', 'content': 'Which store had the highest sales volume?'}]
Running agent with messages: [{'role': 'user', 'content': 'Which store had the highest sales volume?'}]
Starting router call span
Making router call to OpenAI
Received response with tool calls: True
Starting tool calls span
lookup_sales_data
Starting router call span
Making router call to OpenAI
Received response with tool calls: False
No tool calls, returning final response


Processing questions:  50%|█████     | 3/6 [00:19<00:18,  6.29s/it]

Main span completed with return value: code=None observed_trends=None natural_language_response='The store with the highest sales volume is Store Number 2970, with total sales of approximately 836,341.33.'
Starting main span with messages: [{'role': 'user', 'content': 'Create a bar chart showing total sales by store'}]
Running agent with messages: [{'role': 'user', 'content': 'Create a bar chart showing total sales by store'}]
Starting router call span
Making router call to OpenAI
Received response with tool calls: True
Starting tool calls span
lookup_sales_data
Starting router call span
Making router call to OpenAI
Received response with tool calls: True
Starting tool calls span
generate_visualization
Starting router call span
Making router call to OpenAI
Received response with tool calls: False
No tool calls, returning final response


Processing questions:  67%|██████▋   | 4/6 [00:50<00:32, 16.33s/it]

Main span completed with return value: code="import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Data\ndata = {'Store_Number': [2860, 2420, 770, 3520, 660, 3410, 990, 1210, 330, 2750, 1980, 1760, 4730, 4070, 3080, 2090, 1320, 2640, 1540, 4840, 1100, 3300, 3190, 3630, 2310, 2200, 1870, 2970, 3740, 2530, 4400, 880, 1650, 4180, 550],\n        'Total_Sales': [132320.519487, 406715.767402, 292968.918642, 145701.079372, 343594.978075, 410567.848126, 378433.018639, 508393.767785, 370503.687331, 453664.808068, 242290.828499, 350747.617798, 239711.708869, 322307.968330, 495458.238811, 309996.247965, 592832.067579, 308990.318559, 427777.427815, 389056.668316, 497509.528013, 619660.167018, 335035.018792, 405034.547846, 412579.388504, 361173.288199, 401070.997685, 836341.327191, 359729.808228, 324046.518720, 95745.620250, 420302.088397, 580443.007953, 272208.118542, 229727.498752]}\n\n# Create DataFrame\ndf = pd.DataFrame(data)\n\n# Plot\nplt.figure(figsize=(12, 6))\nplt.bar(df['Store_Number

Processing questions:  83%|████████▎ | 5/6 [01:00<00:14, 14.03s/it]

Main span completed with return value: code=None observed_trends='Approximately 2.63% of items were sold on promotion.' natural_language_response='Approximately 2.63% of items were sold on promotion.'
Starting main span with messages: [{'role': 'user', 'content': 'What was the average transaction value?'}]
Running agent with messages: [{'role': 'user', 'content': 'What was the average transaction value?'}]
Starting router call span
Making router call to OpenAI
Received response with tool calls: True
Starting tool calls span
lookup_sales_data
Starting router call span
Making router call to OpenAI
Received response with tool calls: False
No tool calls, returning final response


Processing questions: 100%|██████████| 6/6 [01:05<00:00, 10.86s/it]

Main span completed with return value: code=None observed_trends=None natural_language_response='The average transaction value was approximately $19.02.'



