<a href="https://colab.research.google.com/github/AndyJihang/Building-Code-Agents-with-Hugging-Face-smolagents/blob/main/Monitoring_and_Evaluating_your_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
PROJECT_NAME = "Customer-Success"

In [21]:
!pip -q install -U \
  "smolagents[openai,e2b,telemetry]>=1.6.0" \
  phoenix \
  e2b-code-interpreter \
  python-dotenv \
  pandas \
  requests \
  markdownify \
  ipywidgets \
  opentelemetry-sdk \
  opentelemetry-exporter-otlp \
  openinference-instrumentation-smolagents


In [22]:
import os
from dotenv import load_dotenv, find_dotenv
from phoenix.otel import register
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

In [23]:
# 1) start Phoenix server in the background
!python -m phoenix.server.main serve --host 0.0.0.0 --port 6006 >/dev/null 2>&1 &
print("Phoenix UI → http://127.0.0.1:6006/")

# 2) (re)register tracing to the running endpoint
from phoenix.otel import register
from openinference.instrumentation.smolagents import SmolagentsInstrumentor
tp = register(project_name="Customer-Success", endpoint="http://127.0.0.1:6006/v1/traces")
SmolagentsInstrumentor().instrument(tracer_provider=tp)


Phoenix UI → http://127.0.0.1:6006/




🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: Customer-Success
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: http://127.0.0.1:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [24]:
# === Model: OpenAI ===
from smolagents import OpenAIServerModel, CodeAgent, tool
from google.colab import userdata

E2B_API_KEY = userdata.get("E2B_API_KEY")
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")

model = OpenAIServerModel(
    model_id="gpt-4o-mini",
    api_key=OPENAI_API_KEY,
    api_base="https://api.openai.com/v1",
    temperature=0.2,
)

In [25]:
from openinference.instrumentation.smolagents import SmolagentsInstrumentor
SmolagentsInstrumentor().uninstrument()  # remove OTel hooks


## Trace an agent run

In [26]:
# === Simple trace test ===
agent = CodeAgent(model=model, tools=[])
print(agent.run("What is the 100th Fibonacci number?", max_steps=1))

print("Phoenix UI:", phoenix_url)

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 493, in _make_request
    conn.request(
  File "/usr/local/li

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 493, in _make_request
    conn.request(
  File "/usr/local/li

The 100th Fibonacci number is 354,224,848,179,261,915,075.
Phoenix UI: http://127.0.0.1:6006/


## Setup ice cream production system

In [32]:
from smolagents import CodeAgent, tool
from typing import Dict

menu_prices = {"crepe nutella": 1.50, "vanilla ice cream": 2, "maple pancake": 1.0}
ORDER_BOOK = {}

@tool
def place_order(quantities: Dict[str, int], session_id: int) -> None:
    """Places a pre-order of snacks.

    Args:
        quantities (Dict[str, int]): Mapping item -> quantity. Keys must exist in menu_prices.
        session_id (int): Client session id.
    """
    global ORDER_BOOK
    assert isinstance(quantities, dict), "Incorrect type for the input dictionary!"
    assert all(k in menu_prices for k in quantities.keys()), f"All food names should be within {list(menu_prices.keys())}"
    ORDER_BOOK[session_id] = quantities

@tool
def get_prices(quantities: Dict[str, int]) -> str:
    """Gets price for certain quantities of ice cream.

    Args:
        quantities (Dict[str, int]): Mapping item -> quantity. Keys must exist in menu_prices.
    Returns:
        str: Human-readable price summary.
    """
    assert isinstance(quantities, dict), "Incorrect type for the input dictionary!"
    assert all(k in menu_prices for k in quantities.keys()), f"All food names should be within {list(menu_prices.keys())}"
    total_price = sum(menu_prices[k] * v for k, v in quantities.items())
    return f"Given the current menu prices:\n{menu_prices}\nThe total price for your order would be: ${total_price:.2f}"

order_agent = CodeAgent(tools=[place_order, get_prices], model=model)
print(order_agent.run(
    "Could I come and collect one crepe nutella?",
    additional_args={"session_id": 192},
    max_steps=2
))

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 493, in _make_request
    conn.request(
  File "/usr/local/li

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 493, in _make_request
    conn.request(
  File "/usr/local/li

Order for one crepe Nutella has been placed successfully.


## Try multiple orders

In [53]:
from smolagents import tool, ToolCallingAgent
import re, unicodedata

menu_prices = {"crepe nutella": 1.50, "vanilla ice cream": 2.0, "maple pancake": 1.0}
ORDER_BOOK = {}

CANONICAL = {
    "crepe nutella": ["crepe nutella","crêpe nutella","nutella crepe","nutella crêpe","crepe","crêpe"],
    "vanilla ice cream": ["vanilla","vanilla icecream","vanilla ice-cream","ice cream vanilla","vanilla scoop"],
    "maple pancake": ["pancake","pancakes","maple pancake","maple pancakes"]
}

def _strip_accents(s:str)->str:
    return ''.join(c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c))

def _canon_item(s:str):
    s0 = _strip_accents(s).lower().strip()
    for key, syns in CANONICAL.items():
        for pat in syns:
            if _strip_accents(pat).lower() in s0:
                return key
    return None

def parse_order(text:str):
    text = _strip_accents(text).lower()
    # try "2 pancakes", "one crepe", etc.
    qty_word = {"one":1,"two":2,"three":3,"four":4,"five":5,"a":1,"an":1}
    out = {}
    for key, syns in CANONICAL.items():
        for syn in syns:
            syn_n = _strip_accents(syn).lower()
            # number before item
            m = re.search(rf"(\d+|one|two|three|four|five|a|an)\s+{re.escape(syn_n)}", text)
            if m:
                q = int(m.group(1)) if m.group(1).isdigit() else qty_word[m.group(1)]
                out[key] = out.get(key, 0) + q
            # bare mention -> count 1
            elif re.search(rf"\b{re.escape(syn_n)}\b", text):
                out[key] = out.get(key, 0) + 1
    # drop unknown / zero
    return {k:v for k,v in out.items() if k in menu_prices and v>0}

@tool
def place_order_nl(order: str, session_id: int) -> str:
    """Places a pre-order parsed from natural language.
    Args:
        order (str): e.g. "one crepe nutella and 2 pancakes".
        session_id (int): Client session id.
    Returns:
        str: Confirmation and normalized quantities.
    """
    q = parse_order(order)
    if not q:
        return "Could not parse any known items from your order."
    ORDER_BOOK[session_id] = q
    return f"Order placed: {q}"

@tool
def get_prices_nl(order: str) -> str:
    """Gets price for an order described in natural language.
    Args:
        order (str): e.g. "1 crepe nutella + 2 pancakes".
    Returns:
        str: Price breakdown and total.
    """
    q = parse_order(order)
    if not q:
        return "Could not parse any known items."
    lines = [f"- {k}: {v} × ${menu_prices[k]:.2f} = ${menu_prices[k]*v:.2f}" for k,v in q.items()]
    total = sum(menu_prices[k]*v for k,v in q.items())
    return "Price quote:\n" + "\n".join(lines) + f"\nTOTAL=${total:.2f}"

order_agent = ToolCallingAgent(
    tools=[place_order_nl, get_prices_nl],
    model=model,
)
client_requests = [
    ("Could I come and collect one crepe nutella?", "place_order_nl"),
    ("What would be the price for 1 crêpe nutella + 2 pancakes?", "get_prices_nl"),
    ("How did you start your ice-cream business?", None),
    ("What's the weather at the Louvre right now?", None),
    ("I'm not sure if I should order. I want a vanilla ice cream. but if it's more expensive than $1, I don't want it. If it's below, I'll order it, please.", "get_prices_nl"),
]

for req, _ in client_requests:
    prompt = (
      f"{req}\n\nRules:\n"
      "- If an action is needed, you MUST call a tool.\n"
      "- Use get_prices_nl for pricing; use place_order_nl to place orders.\n"
      "- If the user mentions a price condition, first call get_prices_nl, then decide."
    )
    print(order_agent.run(prompt, additional_args={"session_id": 0}, max_steps=2))


Yes, you can come and collect one crepe nutella. Your order has been placed successfully.


$10.00


I started my ice-cream business by conducting market research to understand customer preferences, developing unique flavors, and creating a business plan. I then secured funding, found a suitable location, and began sourcing high-quality ingredients. After setting up the shop and marketing my brand, I officially opened for business.


I cannot provide real-time weather information. Please check a weather website or app for the latest updates.


I will not order the vanilla ice cream as it is more expensive than $1.


In [54]:
!python -m phoenix.server.main serve --host 0.0.0.0 --port 6006 >/dev/null 2>&1 &

import time, requests
for _ in range(30):
    try:
        requests.get("http://127.0.0.1:6006/", timeout=1); break
    except Exception:
        time.sleep(0.5)

from phoenix.otel import register
from openinference.instrumentation.smolagents import SmolagentsInstrumentor
SmolagentsInstrumentor().uninstrument()
tp = register(project_name=PROJECT_NAME, endpoint="http://127.0.0.1:6006/v1/traces")
SmolagentsInstrumentor().instrument(tracer_provider=tp)

import phoenix as px
px.launch_app()



🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: Customer-Success
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: http://127.0.0.1:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.

🌍 To view the Phoenix app in your browser, visit https://0a5uz2skczzg3-496ff2e9c6d22116-6006-colab.googleusercontent.com/
📖 For more information on how to use Phoenix, check out https://arize.com/docs/phoenix


<phoenix.session.session.ThreadSession at 0x7ac203f4f4a0>

## Add processing to extract desired information

In [55]:
spans = px.Client().get_spans_dataframe(project_name=PROJECT_NAME)
spans.head(20)

  df_attributes = pd.DataFrame.from_records(


Unnamed: 0_level_0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.output.mime_type,attributes.llm.invocation_parameters,attributes.llm.token_count.prompt,attributes.llm.model_name,attributes.output.value,attributes.llm.output_messages,attributes.llm.token_count.total,attributes.llm.token_count.completion,attributes.llm.tools,attributes.smolagents
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
e78945e68c56c817,FinalAnswerTool,TOOL,e37d596cc70e7836,2025-09-07 03:22:55.493318+00:00,2025-09-07 03:22:55.493418+00:00,OK,,[],e78945e68c56c817,d95a3b0e59554b52552686333f6fd59e,...,,,,,,,,,,
49dc5a0a8b2d9aab,OpenAIServerModel.generate,LLM,e37d596cc70e7836,2025-09-07 03:22:54.781396+00:00,2025-09-07 03:22:55.484700+00:00,OK,,[],49dc5a0a8b2d9aab,d95a3b0e59554b52552686333f6fd59e,...,application/json,"{""temperature"": 0.2}",1271.0,gpt-4o-mini,"{""role"": ""assistant"", ""content"": null, ""tool_c...","[{'message.role': 'assistant', 'message.tool_c...",1301.0,30.0,"[{'tool.json_schema': '{""type"": ""function"", ""f...",
d7c763e0da188095,SimpleTool,TOOL,e37d596cc70e7836,2025-09-07 03:22:54.766584+00:00,2025-09-07 03:22:54.767090+00:00,OK,,[],d7c763e0da188095,d95a3b0e59554b52552686333f6fd59e,...,text/plain,,,,Price quote:\n- vanilla ice cream: 1 × $2.00 =...,,,,,
89ba4d41b220c9cb,OpenAIServerModel.generate,LLM,e37d596cc70e7836,2025-09-07 03:22:54.167048+00:00,2025-09-07 03:22:54.758005+00:00,OK,,[],89ba4d41b220c9cb,d95a3b0e59554b52552686333f6fd59e,...,application/json,"{""temperature"": 0.2}",1178.0,gpt-4o-mini,"{""role"": ""assistant"", ""content"": null, ""tool_c...","[{'message.role': 'assistant', 'message.tool_c...",1197.0,19.0,"[{'tool.json_schema': '{""type"": ""function"", ""f...",
e37d596cc70e7836,ToolCallingAgent.run,AGENT,,2025-09-07 03:22:54.155041+00:00,2025-09-07 03:22:55.505290+00:00,OK,,[],e37d596cc70e7836,d95a3b0e59554b52552686333f6fd59e,...,,,2449.0,,I will not order the vanilla ice cream as it i...,,2498.0,49.0,,{'task': 'What's the weather at the Louvre rig...
e17a8580e189ddd6,FinalAnswerTool,TOOL,55fac68e88846ba8,2025-09-07 03:22:54.130153+00:00,2025-09-07 03:22:54.130255+00:00,OK,,[],e17a8580e189ddd6,a5d05b0cd82646f34460ed60bf4a2980,...,,,,,,,,,,
2c093752b897f6e7,OpenAIServerModel.generate,LLM,55fac68e88846ba8,2025-09-07 03:22:53.405018+00:00,2025-09-07 03:22:54.122574+00:00,OK,,[],2c093752b897f6e7,a5d05b0cd82646f34460ed60bf4a2980,...,application/json,"{""temperature"": 0.2}",1148.0,gpt-4o-mini,"{""role"": ""assistant"", ""content"": null, ""tool_c...","[{'message.role': 'assistant', 'message.tool_c...",1181.0,33.0,"[{'tool.json_schema': '{""type"": ""function"", ""f...",
55fac68e88846ba8,ToolCallingAgent.run,AGENT,,2025-09-07 03:22:53.390139+00:00,2025-09-07 03:22:54.145056+00:00,OK,,[],55fac68e88846ba8,a5d05b0cd82646f34460ed60bf4a2980,...,,,1148.0,,I cannot provide real-time weather information...,,1181.0,33.0,,{'task': 'How did you start your ice-cream bus...
a52dfc583fc22884,FinalAnswerTool,TOOL,2d2180e730b768af,2025-09-07 03:22:53.367885+00:00,2025-09-07 03:22:53.367988+00:00,OK,,[],a52dfc583fc22884,3d38b66090551641f9d1877876776f9e,...,,,,,,,,,,
34306bbc3ff71eca,OpenAIServerModel.generate,LLM,2d2180e730b768af,2025-09-07 03:22:52.380136+00:00,2025-09-07 03:22:53.359118+00:00,OK,,[],34306bbc3ff71eca,3d38b66090551641f9d1877876776f9e,...,application/json,"{""temperature"": 0.2}",1149.0,gpt-4o-mini,"{""role"": ""assistant"", ""content"": null, ""tool_c...","[{'message.role': 'assistant', 'message.tool_c...",1221.0,72.0,"[{'tool.json_schema': '{""type"": ""function"", ""f...",


## Now, compare tool calls with exected tool calls

In [56]:
import pandas as pd, json, re

def find_col(df, options):
    for c in options:
        if c in df.columns: return c
    return None

if spans.empty:
    raise RuntimeError("No spans found. Start Phoenix, re-run the agent, then re-run this cell.")

# ---- column detection ----
kind_col   = find_col(spans, ["span_kind","kind"])
trace_col  = find_col(spans, ["context.trace_id","trace_id"])
name_col   = find_col(spans, ["name"])
input_cols = [c for c in ["attributes.input.value","attributes.input","attributes.input_text","input"] if c in spans.columns]
start_col  = find_col(spans, ["start_time","start"])

tool_name_col = find_col(spans, ["attributes.tool.name","attributes.tool_name","tool.name","tool_name"])

def get_input_val(row):
    for c in input_cols:
        v = row.get(c)
        if pd.notna(v):
            return v
    return None

# ---- identify AGENT spans & extract 'task' ----
is_agent = spans[kind_col].astype(str).str.upper().str.contains("AGENT", na=False)
agents = spans.loc[is_agent].copy()

def extract_task(val):
    if isinstance(val, str):
        try:
            obj = json.loads(val)
            if isinstance(obj, dict):
                return obj.get("task") or obj.get("input") or obj.get("prompt")
        except Exception:
            pass
        return val  # plain text fallback
    if isinstance(val, dict):
        return val.get("task") or val.get("input") or val.get("prompt")
    return None

agents["task"] = agents.apply(lambda r: extract_task(get_input_val(r)), axis=1)

# ---- identify TOOL-like spans ----
k = spans[kind_col].astype(str).str.upper()
tool_mask = k.str.contains("TOOL|FUNCTION|ACTION", regex=True, na=False)
tools = spans.loc[tool_mask, [c for c in [trace_col, name_col, tool_name_col] if c]].copy()

# normalize tool name: prefer explicit column else span name
if tool_name_col and tool_name_col in tools.columns:
    tools["tool_name"] = tools[tool_name_col]
else:
    tools["tool_name"] = tools[name_col]

def normalize_tool(s):
    if not isinstance(s, str): return s
    s = s.strip()
    s = re.sub(r"^(Tool:|TOOL:)\s*", "", s)
    return s.lower()

tools["tool_name"] = tools["tool_name"].apply(normalize_tool)

# ---- map each client request to a trace_id ----
# 1) try exact/substring match on the extracted agent task
req_to_trace = {}
agents_sorted = agents.sort_values(start_col or "start_time")
for req, _ in client_requests:
    # exact then substring match
    m = agents_sorted.loc[agents_sorted["task"] == req, trace_col]
    if m.empty:
        m = agents_sorted.loc[agents_sorted["task"].astype(str).str.contains(re.escape(req[:40]), na=False), trace_col]
    if not m.empty:
        req_to_trace[req] = m.iloc[-1]

# 2) fallback: align by order if counts mismatch (take most recent N agent spans)
if len(req_to_trace) < len(client_requests) and start_col:
    recent_traces = agents_sorted[trace_col].tail(len(client_requests)).tolist()
    for (req, _), t in zip(client_requests, recent_traces):
        req_to_trace.setdefault(req, t)

# ---- aggregate tool calls per request ----
rows = []
for req, expected_tool in client_requests:
    t_id = req_to_trace.get(req)
    called = set()
    if t_id is not None:
        called = set(tools.loc[tools[trace_col] == t_id, "tool_name"].dropna().tolist())
    # scoring rule (adjust if you require a tool even for chit-chat):
    is_ok = (called == {"final_answer"} or not called) if expected_tool is None else (expected_tool in called)
    rows.append({"request": req, "tool_calls_performed": called, "is_correct": is_ok})

pd.DataFrame(rows)


Unnamed: 0,request,tool_calls_performed,is_correct
0,Could I come and collect one crepe nutella?,"{get_prices_nl, place_order_nl}",True
1,What would be the price for 1 crêpe nutella + ...,"{get_prices_nl, final_answer}",True
2,How did you start your ice-cream business?,{final_answer},True
3,What's the weather at the Louvre right now?,{final_answer},True
4,I'm not sure if I should order. I want a vanil...,"{get_prices_nl, final_answer}",True
