In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import phoenix as px
from phoenix.evals import OpenAIModel
from phoenix.experiments import run_experiment, evaluate_experiment
from phoenix.experiments.types import Example
from phoenix.experiments.evaluators import create_evaluator
from phoenix.otel import register
import pandas as pd
from datetime import datetime
import os
import nest_asyncio
nest_asyncio.apply()

In [3]:
from utils2 import run_agent
from helper import get_phoenix_endpoint

🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: evaluating-agent-path
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: http://localhost:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [4]:
# Phoenix client used below to upload the dataset. No endpoint is passed here,
# so the library's default applies — presumably the local Phoenix server that
# appears in the cell outputs; TODO confirm.
px_client = px.Client()

In [5]:
# Differently worded versions of one question (average quantity per transaction).
convergence_questions = [
    "What was the average quantity sold per transaction?",
    "What is the mean number of items per sale?",
    "Calculate the typical quantity per transaction",
    "What's the mean transaction size in terms of quantity?",
    "On average, how many items were purchased per transaction?",
    "What is the average basket size per sale?",
    "Calculate the mean number of products per purchase",
    "What's the typical number of units per order?",
    "What is the average number of products bought per purchase?",
    "Tell me the mean quantity of items in a typical transaction",
    "How many items does a customer buy on average per transaction?",
    "What's the usual number of units in each sale?",
    "What is the typical amount of products per transaction?",
    "Show the mean number of items customers purchase per visit",
    "What's the average quantity of units per shopping trip?",
    "How many products do customers typically buy in one transaction?",
    "What is the standard basket size in terms of quantity?",
]

# Single-column frame; the column name is also passed as the dataset's input key.
convergence_df = pd.DataFrame({"question": convergence_questions})

# The current timestamp (to the second) is appended to the dataset name —
# presumably so that re-running this cell does not reuse an existing name.
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dataset = px_client.upload_dataset(
    dataframe=convergence_df,
    dataset_name=f"convergence_questions-{now}",
    input_keys=["question"],
)

📤 Uploading dataset...
💾 Examples uploaded: http://127.0.0.1:6006/datasets/RGF0YXNldDox/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246MQ==


Creating the Task

In [6]:
# helper method to format the output returned by the task
def format_message_steps(messages):
    """
    Render a conversation as one human-readable line per step.

    Args:
        messages (list): Message mappings. Each one is read through ``.get``
            for its "role", "content" and "tool_calls" entries.

    Returns:
        str: The step lines joined with newlines. Messages whose role is not
        one of user / system / assistant / tool contribute no line.
    """
    lines = []
    for msg in messages:
        role = msg.get("role")
        if role == "assistant":
            tool_calls = msg.get("tool_calls")
            if tool_calls:
                # One line per requested tool; the assistant content is not
                # shown for tool-calling turns.
                lines.extend(
                    f"Assistant: Called tool '{call['function']['name']}'"
                    for call in tool_calls
                )
            else:
                lines.append(f"Assistant: {msg.get('content')}")
        elif role == "tool":
            lines.append(f"Tool response: {msg.get('content')}")
        elif role == "system":
            # The system prompt text itself is not echoed, only a placeholder.
            lines.append("System: Provided context")
        elif role == "user":
            lines.append(f"User: {msg.get('content')}")

    return "\n".join(lines)

In [7]:
def run_agent_and_track_path(example: Example) -> dict:
    """Run the agent on one dataset example and record the path it took.

    Args:
        example: Dataset example; the "question" entry of its ``input``
            mapping becomes the single user message sent to the agent.

    Returns:
        dict: Two keys —
            "path_length": ``len()`` of the value returned by ``run_agent``;
            "messages": that same value rendered by ``format_message_steps``.
    """
    question = example.input.get("question")
    messages = [{"role": "user", "content": question}]
    ret = run_agent(messages)
    # The result is a dict (not a str as previously annotated): downstream
    # cells read output["path_length"] from it.
    return {"path_length": len(ret), "messages": format_message_steps(ret)}

Running the Experiment

In [8]:
# Run the task function over the uploaded dataset and keep the resulting
# experiment object for the evaluation cells below.
experiment = run_experiment(
    dataset,
    run_agent_and_track_path,
    experiment_name="convergence Eval",
    experiment_description="Evaluating the convergence of an agent",
)

🧪 Experiment started.
📺 View dataset experiments: http://127.0.0.1:6006/datasets/RGF0YXNldDox/experiments
🔗 View this experiment: http://127.0.0.1:6006/datasets/RGF0YXNldDox/compare?experimentId=RXhwZXJpbWVudDox


running tasks |          | 0/17 (0.0%) | ⏳ 00:00<? | ?it/s

Running agent with messages: [{'role': 'user', 'content': 'What was the average quantity sold per transaction?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'What is the mean number of items per sale?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'Calculate the typical quantity per transaction'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router


running tasks |▌         | 1/17 (5.9%) | ⏳ 00:09<02:33 |  9.61s/it

Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': "What's the mean transaction size in terms of quantity?"}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'On average, how many items were purchased per transaction?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router


running tasks |█▊        | 3/17 (17.6%) | ⏳ 00:15<01:04 |  4.58s/it

Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'What is the average basket size per sale?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'Calculate the mean number of products per purchase'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router


running tasks |██▉       | 5/17 (29.4%) | ⏳ 00:20<00:44 |  3.67s/it

Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': "What's the typical number of units per order?"}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'What is the average number of products bought per purchase?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router


running tasks |████      | 7/17 (41.2%) | ⏳ 00:27<00:34 |  3.44s/it

Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'Tell me the mean quantity of items in a typical transaction'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'How many items does a customer buy on average per transaction?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router


running tasks |█████▎    | 9/17 (52.9%) | ⏳ 00:33<00:26 |  3.26s/it

Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': "What's the usual number of units in each sale?"}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'What is the typical amount of products per transaction?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router


running tasks |██████▍   | 11/17 (64.7%) | ⏳ 00:38<00:18 |  3.02s/it

Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'Show the mean number of items customers purchase per visit'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': "What's the average quantity of units per shopping trip?"}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: True
Processing tool calls
Starting router


running tasks |███████▋  | 13/17 (76.5%) | ⏳ 00:45<00:12 |  3.23s/it

Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'How many products do customers typically buy in one transaction?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'What is the standard basket size in terms of quantity?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: True
Processing tool calls
Starting router


running tasks |████████▊ | 15/17 (88.2%) | ⏳ 00:53<00:06 |  3.45s/it

Received response with tool calls: False
No tool calls, returning final response


running tasks |██████████| 17/17 (100.0%) | ⏳ 00:54<00:00 |  3.20s/it

✅ Task runs completed.

🔗 View this experiment: http://127.0.0.1:6006/datasets/RGF0YXNldDox/compare?experimentId=RXhwZXJpbWVudDox

Tasks Summary (07/09/25 09:51 PM +0530)
---------------------------------------
   n_examples  n_runs  n_errors
0          17      17         0





In [9]:
# Show the task runs as a DataFrame. In the rendered output below it is indexed
# by run_id with the columns output, input and example_id.
experiment.as_dataframe()

Unnamed: 0_level_0,output,input,example_id
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RXhwZXJpbWVudFJ1bjox,"{'path_length': 5, 'messages': 'User: What was...",{'question': 'What was the average quantity so...,RGF0YXNldEV4YW1wbGU6MQ==
RXhwZXJpbWVudFJ1bjoy,"{'path_length': 5, 'messages': 'User: What is ...",{'question': 'What is the mean number of items...,RGF0YXNldEV4YW1wbGU6Mg==
RXhwZXJpbWVudFJ1bjoz,"{'path_length': 5, 'messages': 'User: Calculat...",{'question': 'Calculate the typical quantity p...,RGF0YXNldEV4YW1wbGU6Mw==
RXhwZXJpbWVudFJ1bjo0,"{'path_length': 5, 'messages': 'User: What's t...",{'question': 'What's the mean transaction size...,RGF0YXNldEV4YW1wbGU6NA==
RXhwZXJpbWVudFJ1bjo1,"{'path_length': 5, 'messages': 'User: On avera...","{'question': 'On average, how many items were ...",RGF0YXNldEV4YW1wbGU6NQ==
RXhwZXJpbWVudFJ1bjo2,"{'path_length': 5, 'messages': 'User: What is ...",{'question': 'What is the average basket size ...,RGF0YXNldEV4YW1wbGU6Ng==
RXhwZXJpbWVudFJ1bjo3,"{'path_length': 5, 'messages': 'User: Calculat...",{'question': 'Calculate the mean number of pro...,RGF0YXNldEV4YW1wbGU6Nw==
RXhwZXJpbWVudFJ1bjo4,"{'path_length': 5, 'messages': 'User: What's t...",{'question': 'What's the typical number of uni...,RGF0YXNldEV4YW1wbGU6OA==
RXhwZXJpbWVudFJ1bjo5,"{'path_length': 5, 'messages': 'User: What is ...",{'question': 'What is the average number of pr...,RGF0YXNldEV4YW1wbGU6OQ==
RXhwZXJpbWVudFJ1bjoxMA==,"{'path_length': 5, 'messages': 'User: Tell me ...",{'question': 'Tell me the mean quantity of ite...,RGF0YXNldEV4YW1wbGU6MTA=


Evaluating the Path

In [12]:
outputs = experiment.as_dataframe()["output"].to_dict().values()

# The shortest recorded path is taken as the optimum. Each path_length also
# counts the user and system messages. Empty outputs and outputs without a
# path_length are skipped.
optimal_path_length = min(
    run_output.get('path_length')
    for run_output in outputs
    if run_output and run_output.get('path_length') is not None
)
print(f"The optimal path length is {optimal_path_length}")

The optimal path length is 5


In [13]:
@create_evaluator(name= "Convergence Eval", kind = "CODE")
def evaluate_path_length(output: dict) -> float:
    """Score a run by how close its path length is to the optimal one.

    Args:
        output: The task output for one run; its "path_length" entry is read.
            (Previously annotated as ``str``, but it is accessed with ``.get``.)

    Returns:
        float: ``optimal_path_length / path_length``, using the module-level
        ``optimal_path_length`` at call time. Returns 0.0 when ``output`` is
        empty/None or its "path_length" is missing or zero.
    """
    path_length = output.get("path_length") if output else None
    if not path_length:
        # Nothing to compare against; return a float to match the annotation.
        return 0.0
    return optimal_path_length / float(path_length)

In [14]:
# Apply the path-length evaluator to the recorded runs; the name `experiment`
# is rebound to the value returned by evaluate_experiment.
experiment = evaluate_experiment(
    experiment,
    evaluators=[evaluate_path_length],
)

🧠 Evaluation started.


running experiment evaluations |██████████| 17/17 (100.0%) | ⏳ 00:02<00:00 |  8.28it/s


🔗 View this experiment: http://127.0.0.1:6006/datasets/RGF0YXNldDox/compare?experimentId=RXhwZXJpbWVudDox

Experiment Summary (07/09/25 09:56 PM +0530)
--------------------------------------------
          evaluator   n  n_scores  avg_score
0  Convergence Eval  17        17   0.966387

Tasks Summary (07/09/25 09:51 PM +0530)
---------------------------------------
   n_examples  n_runs  n_errors
0          17      17         0



