# Phoenix Tracing Tutorial - Support Agent (Python)

This notebook follows along with the Phoenix Tracing Tutorial documentation:
- [Chapter 1: Your First Traces](/docs/phoenix/tracing/tutorial/your-first-traces)
- [Chapter 2: Annotations and Evaluation](/docs/phoenix/tracing/tutorial/annotations-and-evaluations)
- [Chapter 3: Sessions](/docs/phoenix/tracing/tutorial/sessions)


# Chapter 1: Your First Traces

## Setting Up Tracing

### Install Dependencies


In [None]:
!pip install arize-phoenix-otel arize-phoenix-client openai openinference-instrumentation-openai openinference-instrumentation numpy

In [None]:
import os
from getpass import getpass

# Settings the rest of the notebook reads from the environment:
# (variable name, prompt shown to the user, whether the value is a secret).
_REQUIRED_SETTINGS = (
    ("OPENAI_API_KEY", "Enter your OpenAI API key: ", True),
    ("PHOENIX_API_KEY", "Enter your Phoenix API key: ", True),
    ("PHOENIX_COLLECTOR_ENDPOINT", "Enter your Phoenix collector endpoint: ", False),
)

for _name, _prompt, _is_secret in _REQUIRED_SETTINGS:
    # Respect values that are already exported; never overwrite a real key
    # with a placeholder.
    if os.environ.get(_name):
        continue
    # getpass hides secrets as they are typed, so credentials never end up
    # saved in the notebook source or its outputs.
    _value = (getpass(_prompt) if _is_secret else input(_prompt)).strip()
    if _value:
        os.environ[_name] = _value

### Configure Tracing


In [None]:
from phoenix.otel import register

# Register tracing for the "support-bot" project. The same project name is used
# later in the notebook when spans are fetched back for evaluation.
# auto_instrument=True is what the notebook relies on for OpenAI calls to be
# traced without hand-written spans.
tracer_provider = register(project_name="support-bot", auto_instrument=True)

## Tracing LLM Calls

Import libraries and set up the OpenAI client.


In [None]:
import json
import re
import uuid
from typing import Any, Dict, List, Literal, Optional, TypedDict

from openai import OpenAI
from opentelemetry import trace

from phoenix.client import Client

# OpenAI client used for every chat-completion and embedding request below.
client = OpenAI()
# Phoenix client used later in the notebook to fetch spans and log annotations.
phoenix_client = Client()

# Get a tracer for creating custom spans
tracer = trace.get_tracer("support-agent")

This is a simple LLM call with tracing. All OpenAI calls are traced automatically, so no manual span is needed here.

In [None]:
user_query = "Where is my order?"

# No span is created by hand here: the notebook relies on the auto-instrumentation
# enabled when tracing was registered to capture this request.
result = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        # The system prompt asks the model to act as a two-way classifier.
        {"role": "system", "content": "Classify the query as 'order_status' or 'faq'"},
        {"role": "user", "content": user_query},
    ],
)
# The completion is kept in `result`; this cell only prints a confirmation.
print("\n‚úÖ This LLM call is automatically traced! Check Phoenix UI to see the span.")

## Tracing Tool Calls

Set up mock data for the support agent to access.


In [None]:
class Message(TypedDict):
    """A single turn of conversation history passed to the support agent."""

    # Who produced the message.
    role: Literal["user", "assistant"]
    # The message text.
    content: str


# Order Database (for tool calls)
# Mock data keyed by order ID. Each record carries the fields the agent reads
# when it summarizes an order: status, carrier, trackingNumber and eta.
ORDER_DATABASE: Dict[str, Dict[str, str]] = {
    "ORD-12345": {
        "status": "shipped",
        "carrier": "FedEx",
        "trackingNumber": "1234567890",
        "eta": "December 11, 2025",
    },
    "ORD-67890": {
        "status": "processing",
        "carrier": "pending",
        "trackingNumber": "pending",
        "eta": "December 15, 2025",
    },
    "ORD-11111": {
        "status": "delivered",
        "carrier": "UPS",
        "trackingNumber": "9876543210",
        "eta": "Delivered December 5, 2025",
    },
}


# FAQ Database (for RAG)
class FAQEntry(TypedDict):
    """One question/answer pair in the FAQ knowledge base used for retrieval."""

    id: int
    question: str
    answer: str
    category: str
    # Embedding of the question; None until initialize_faq_embeddings() fills it in.
    embedding: Optional[List[float]]


# Mock FAQ knowledge base. Every entry starts with embedding=None; entries
# without an embedding are skipped by the similarity search further below.
FAQ_DATABASE: List[FAQEntry] = [
    {
        "id": 1,
        "question": "How do I reset my password?",
        "answer": "Go to Settings > Security > Reset Password. You'll receive an email with a reset link that expires in 24 hours.",
        "category": "Account",
        "embedding": None,
    },
    {
        "id": 2,
        "question": "What's your refund policy?",
        "answer": "We offer full refunds within 30 days of purchase for unused items. Contact support with your order number to initiate a refund.",
        "category": "Billing",
        "embedding": None,
    },
    {
        "id": 3,
        "question": "How do I cancel my subscription?",
        "answer": "Go to Account Settings > Subscription > Cancel Subscription. Your access continues until the end of the current billing period.",
        "category": "Billing",
        "embedding": None,
    },
    {
        "id": 4,
        "question": "What payment methods do you accept?",
        "answer": "We accept Visa, Mastercard, American Express, PayPal, and Apple Pay. All transactions are securely processed.",
        "category": "Billing",
        "embedding": None,
    },
    {
        "id": 5,
        "question": "How do I update my profile information?",
        "answer": "Go to Account Settings > Profile. You can update your name, email, phone number, and address there.",
        "category": "Account",
        "embedding": None,
    },
]

# The two routes the support agent can take for a query.
QueryCategory = Literal["order_status", "faq"]


class ClassificationResult(TypedDict):
    """Structured output requested from the query-classification LLM call."""

    category: QueryCategory
    # The classifier prompt asks for "high", "medium" or "low".
    confidence: str
    # Short explanation produced by the classifier.
    reasoning: str


class AgentResponse(TypedDict):
    """Result returned by handle_support_query for a single user query."""

    query: str
    response: str
    # ID of the agent span for this query; used later to attach feedback annotations.
    spanId: str
    category: QueryCategory
    # Session the query belonged to, or None for a stand-alone query.
    sessionId: Optional[str]


class SessionContext(TypedDict):
    """State carried between turns of a multi-turn conversation."""

    # Order ID mentioned in an earlier turn, if any.
    lastMentionedOrderId: Optional[str]
    # Number of turns completed so far; the agent reports turnCount + 1 as the current turn.
    turnCount: int

Example query with tool calling. Tools allow your agent to interact with databases, APIs, and external systems.

In [None]:
from openinference.semconv.trace import SpanAttributes

# Tool schema passed to the chat-completions API. It exposes a single function,
# lookupOrderStatus, that takes one required string argument: orderId.
tools = [
    {
        "type": "function",
        "function": {
            "name": "lookupOrderStatus",
            "description": "Look up the current status of a customer order by order ID",
            "parameters": {
                "type": "object",
                "properties": {
                    "orderId": {
                        "type": "string",
                        "description": "The order ID to look up (e.g., ORD-12345)",
                    }
                },
                "required": ["orderId"],
            },
        },
    }
]


# Helper function to execute tools automatically
def execute_tool_call(tool_call, database):
    """Execute a tool call inside a TOOL span and return the result.

    Args:
        tool_call: A tool call taken from a chat-completion message. Its
            ``function.name`` and ``function.arguments`` (a JSON string) are read.
        database: Mapping of order ID to order record, queried by the
            ``lookupOrderStatus`` tool.

    Returns:
        The order record for a known order, otherwise a dict with a single
        ``"error"`` key (unknown order or unknown tool name).

    Raises:
        json.JSONDecodeError: If the tool arguments are not valid JSON.
    """
    function_name = tool_call.function.name
    # An empty argument string is treated as "no arguments" instead of
    # failing in json.loads.
    function_args = json.loads(tool_call.function.arguments or "{}")

    with tracer.start_as_current_span(
        function_name,
        attributes={
            SpanAttributes.OPENINFERENCE_SPAN_KIND: "TOOL",
            SpanAttributes.TOOL_NAME: function_name,
            SpanAttributes.TOOL_PARAMETERS: json.dumps(function_args),
            SpanAttributes.INPUT_VALUE: json.dumps(function_args),
        },
    ) as tool_span:
        if function_name == "lookupOrderStatus":
            order_id = function_args.get("orderId")
            result = database.get(order_id, {"error": f"Order {order_id} not found"})
        else:
            result = {"error": f"Unknown tool: {function_name}"}

        tool_span.set_attribute(SpanAttributes.OUTPUT_VALUE, json.dumps(result))
        # Mark failed lookups as errors so they stand out in the trace, the same
        # way the full support agent marks a not-found order.
        if "error" in result:
            tool_span.set_status(trace.Status(trace.StatusCode.ERROR, result["error"]))
        else:
            tool_span.set_status(trace.Status(trace.StatusCode.OK))
        return result


user_query = "What is the status of ORD-12345?"

# Create a parent span to group all spans
with tracer.start_as_current_span(
    "tool-call-example",
    attributes={
        SpanAttributes.OPENINFERENCE_SPAN_KIND: "CHAIN",
        SpanAttributes.INPUT_VALUE: user_query,
    },
) as parent_span:
    messages = [
        {
            "role": "system",
            "content": "You are a helpful customer support agent. When customers ask about order status, use the lookupOrderStatus tool to get the information.",
        },
        {"role": "user", "content": user_query},
    ]

    # First LLM call: the model decides whether to call the tool.
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        tools=tools,
        tool_choice="auto",
    )

    # Keep the assistant message in the history so the tool results appended
    # below follow the message that requested them.
    message = result.choices[0].message
    messages.append(message)

    # Execute tool if called, then get final response
    if message.tool_calls:
        for tool_call in message.tool_calls:
            tool_result = execute_tool_call(tool_call, ORDER_DATABASE)
            # Each tool result is linked back to its request through tool_call_id.
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": json.dumps(tool_result),
                }
            )

        # Final LLM call with tool result
        final_result = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
        )
        final_response = final_result.choices[0].message.content
    else:
        # No tool was requested, so the first reply is already the answer.
        final_response = message.content

    # Record the final answer on the parent span before it closes.
    parent_span.set_attribute(SpanAttributes.OUTPUT_VALUE, final_response)
    parent_span.set_status(trace.Status(trace.StatusCode.OK))
    print(f"Query: {user_query}")
print(f"Response: {final_response}")
print("‚úÖ Check Phoenix UI to see the full trace")

## Tracing RAG Pipelines

Helper functions for embeddings and similarity search.


In [None]:
import numpy as np


def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector, of the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a plain Python
        float. If either vector has zero magnitude the similarity is
        undefined, and 0.0 is returned instead of NaN.
    """
    a_array = np.array(a)
    b_array = np.array(b)
    denominator = np.linalg.norm(a_array) * np.linalg.norm(b_array)
    # Guard against division by zero for all-zero vectors.
    if denominator == 0:
        return 0.0
    # float() converts the NumPy scalar so the return type matches the annotation.
    return float(np.dot(a_array, b_array) / denominator)


def initialize_faq_embeddings(force: bool = False) -> None:
    """Embed the question of every FAQ entry and store it on the entry.

    The function is safe to call more than once: entries that already have an
    embedding are left alone, so repeated calls do not repeat the API requests.

    Args:
        force: When True, re-embed every entry even if it already has an
            embedding (for example after editing a question).
    """
    print("üìö Initializing FAQ embeddings...")

    for faq in FAQ_DATABASE:
        # Skip entries embedded by an earlier call unless a refresh was requested.
        if faq["embedding"] and not force:
            continue
        response = client.embeddings.create(model="text-embedding-ada-002", input=faq["question"])
        faq["embedding"] = response.data[0].embedding

    print("‚úÖ FAQ embeddings initialized")

Example RAG pipeline with tracing:

In [None]:
# First, initialize embeddings (only need to do this once)
# The similarity search below skips FAQ entries that have no embedding, so this
# must run before any retrieval cell.
initialize_faq_embeddings()

In [None]:
user_query = "How do I reset my password?"

# The parent CHAIN span groups the embedding call, the retrieval span and the
# generation call of this example.
with tracer.start_as_current_span(
    "rag-example",
    attributes={
        SpanAttributes.OPENINFERENCE_SPAN_KIND: "CHAIN",
        SpanAttributes.INPUT_VALUE: user_query,
    },
) as parent_span:
    # Step 1: Embed the query (automatically traced)
    embedding_response = client.embeddings.create(model="text-embedding-ada-002", input=user_query)
    query_embedding = embedding_response.data[0].embedding

    # Step 2: Find relevant FAQs using cosine similarity
    faq_scores = []
    for faq in FAQ_DATABASE:
        # Entries that have not been embedded yet cannot be scored.
        if faq["embedding"]:
            score = cosine_similarity(query_embedding, faq["embedding"])
            faq_scores.append((faq, score))

    # Keep the two highest-scoring FAQs.
    relevant_faqs = sorted(faq_scores, key=lambda x: x[1], reverse=True)[:2]

    # Record what was retrieved on a RETRIEVER span, one indexed group of
    # attributes (id, content, metadata) per document.
    with tracer.start_as_current_span(
        "faq-retrieval",
        attributes={
            SpanAttributes.OPENINFERENCE_SPAN_KIND: "RETRIEVER",
            SpanAttributes.INPUT_VALUE: user_query,
        },
    ) as retrieval_span:
        for i, (faq, score) in enumerate(relevant_faqs):
            retrieval_span.set_attribute(f"retrieval.documents.{i}.document.id", str(faq["id"]))
            retrieval_span.set_attribute(
                f"retrieval.documents.{i}.document.content",
                f"Q: {faq['question']}\nA: {faq['answer']}",
            )
            retrieval_span.set_attribute(
                f"retrieval.documents.{i}.document.metadata",
                json.dumps({"category": faq["category"], "score": score}),
            )

        retrieval_span.set_status(trace.Status(trace.StatusCode.OK))

    # Step 3: Build context from retrieved FAQs
    rag_context = "\n\n".join(
        [f"Q: {faq['question']}\nA: {faq['answer']}" for faq, _ in relevant_faqs]
    )

    # Step 4: Generate answer with retrieved context (automatically traced)
    rag_result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": f"You are a helpful customer support agent. Answer the user's question using ONLY the information provided in the context below. Be friendly and concise.\n\nContext:\n{rag_context}",
            },
            {"role": "user", "content": user_query},
        ],
    )

    # Record the final answer on the parent span before it closes.
    final_response = rag_result.choices[0].message.content
    parent_span.set_attribute(SpanAttributes.OUTPUT_VALUE, final_response)
    parent_span.set_status(trace.Status(trace.StatusCode.OK))
    print(f"Query: {user_query}")
print(f"Response: {final_response}")
print("\n‚úÖ All RAG operations are traced!")

## Grouping Operations with Parent Spans

The complete support agent that wraps all operations in a parent span.


In [None]:
from openinference.instrumentation import using_session
from opentelemetry.trace import format_span_id


def handle_support_query(
    user_query: str,
    session_id: Optional[str] = None,
    conversation_history: Optional[List[Message]] = None,
    session_context: Optional[SessionContext] = None,
) -> AgentResponse:
    """
    Handle a support query with optional session tracking.

    The query is classified by an LLM and then routed either to the order
    lookup tool ("order_status") or to FAQ retrieval ("faq"). All work happens
    inside a single "support-agent" AGENT span.

    Args:
        user_query: The user's question
        session_id: Optional session ID for multi-turn conversations
        conversation_history: Previous messages in the conversation
        session_context: Context from previous turns

    Returns:
        The query, the generated response, the ID of the agent span, the
        category the query was routed to, and the session ID (if any).

    Raises:
        Exception: Any error raised while processing is recorded on the agent
            span as an ERROR status and then re-raised.
    """
    if conversation_history is None:
        conversation_history = []
    if session_context is None:
        session_context = {"lastMentionedOrderId": None, "turnCount": 0}

    def run_agent() -> AgentResponse:
        """Run classification, routing and response generation in one agent span."""
        with tracer.start_as_current_span(
            "support-agent",
            attributes={
                SpanAttributes.OPENINFERENCE_SPAN_KIND: "AGENT",
                SpanAttributes.INPUT_VALUE: user_query,
                **({SpanAttributes.SESSION_ID: session_id} if session_id else {}),
                "conversation.turn": session_context["turnCount"] + 1,
            },
        ) as agent_span:
            # Capture the span ID
            span_id = format_span_id(agent_span.get_span_context().span_id)
            category: QueryCategory = "faq"

            try:
                print("\n" + "=" * 60)
                print("ü§ñ Support Agent Processing Query")
                print("=" * 60)
                print(f'üì® Query: "{user_query}"')
                print(f"   Span ID: {span_id}")
                if session_id:
                    print(f"   Session ID: {session_id}")
                    print(f"   Turn: {session_context['turnCount'] + 1}")

                # Build conversation context for multi-turn support
                conversation_context = (
                    "\n\nPrevious conversation:\n"
                    + "\n".join([f"{m['role']}: {m['content']}" for m in conversation_history])
                    if conversation_history
                    else ""
                )

                # Check if we have a remembered order ID from previous turns
                remembered_order_info = (
                    f"\nNote: The customer previously mentioned order {session_context['lastMentionedOrderId']}."
                    if session_context["lastMentionedOrderId"]
                    else ""
                )

                # Step 1: Classify the query
                print("\nüìã Step 1: Classifying query...")

                classification_prompt = f"""You are a support query classifier. Classify the user's query into one of these categories:

1. "order_status" - Questions about order tracking, delivery status, shipping, where is my order, tracking numbers, ETAs
2. "faq" - General questions about accounts, billing, refunds, passwords, subscriptions, payment methods
{remembered_order_info}
Respond with JSON only:
{{
  "category": "order_status" or "faq",
  "confidence": "high" or "medium" or "low",
  "reasoning": "brief explanation"
}}"""

                classification_response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": classification_prompt},
                        {"role": "user", "content": user_query + conversation_context},
                    ],
                    response_format={"type": "json_object"},
                )

                classification_text = classification_response.choices[0].message.content

                # Parse defensively: the content may be missing (None), not valid
                # JSON, not a JSON object, or lack a recognised category.
                try:
                    parsed_classification = json.loads(classification_text)
                except (json.JSONDecodeError, TypeError):
                    parsed_classification = None

                if isinstance(parsed_classification, dict) and parsed_classification.get(
                    "category"
                ) in ("order_status", "faq"):
                    classification: ClassificationResult = {
                        "category": parsed_classification["category"],
                        "confidence": str(parsed_classification.get("confidence", "low")),
                        "reasoning": str(parsed_classification.get("reasoning", "")),
                    }
                else:
                    # Default to FAQ if parsing fails
                    classification = {
                        "category": "faq",
                        "confidence": "low",
                        "reasoning": "Failed to parse classification",
                    }

                print(f"   Category: {classification['category']}")
                print(f"   Reasoning: {classification['reasoning']}")

                category = classification["category"]
                agent_span.set_attribute("classification.category", classification["category"])
                agent_span.set_attribute("classification.confidence", classification["confidence"])

                # Step 2: Route based on classification
                if classification["category"] == "order_status":
                    print("\nüîß Step 2: Deciding whether to use tool...")

                    order_prompt = (
                        f'{user_query}\n\nNote: Earlier in this conversation, the customer mentioned order {session_context["lastMentionedOrderId"]}. If they\'re asking about "that order" or similar, use this order ID.'
                        if session_context["lastMentionedOrderId"]
                        else user_query
                    )

                    # Define the tool for order lookup
                    tools = [
                        {
                            "type": "function",
                            "function": {
                                "name": "lookupOrderStatus",
                                "description": "Look up the current status of a customer order by order ID",
                                "parameters": {
                                    "type": "object",
                                    "properties": {
                                        "orderId": {
                                            "type": "string",
                                            "description": "The order ID to look up (e.g., ORD-12345)",
                                        }
                                    },
                                    "required": ["orderId"],
                                },
                            },
                        }
                    ]

                    tool_decision = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[
                            {
                                "role": "system",
                                "content": "You are a helpful customer support agent. When customers ask about order status, use the lookupOrderStatus tool to get the information. If no order ID is mentioned and none was mentioned earlier, ask for it politely. Always use the tool when an order ID is provided or referenced.",
                            },
                            {"role": "user", "content": order_prompt},
                        ],
                        tools=tools,
                        tool_choice="auto",
                    )

                    # Check if tool was called
                    message = tool_decision.choices[0].message
                    order_info: Optional[Dict[str, Any]] = None

                    if message.tool_calls:
                        tool_call = message.tool_calls[0]
                        function_name = tool_call.function.name
                        function_args = json.loads(tool_call.function.arguments)
                        order_id = function_args.get("orderId")

                        print(f"   üîß Tool called: {function_name}({order_id})")

                        # Create a span for the tool call
                        with tracer.start_as_current_span(
                            function_name,
                            attributes={
                                SpanAttributes.OPENINFERENCE_SPAN_KIND: "TOOL",
                                SpanAttributes.TOOL_NAME: function_name,
                                SpanAttributes.TOOL_PARAMETERS: json.dumps(function_args),
                                SpanAttributes.INPUT_VALUE: json.dumps(function_args),
                            },
                        ) as tool_span:
                            order = ORDER_DATABASE.get(order_id)
                            if not order:
                                order_info = {"error": f"Order {order_id} not found in our system"}
                                tool_span.set_attribute(
                                    SpanAttributes.OUTPUT_VALUE, json.dumps(order_info)
                                )
                                tool_span.set_status(
                                    trace.Status(trace.StatusCode.ERROR, "Order not found")
                                )
                            else:
                                print(f"   ‚úÖ Order found: {json.dumps(order)}")
                                order_info = {"orderId": order_id, **order}
                                tool_span.set_attribute(
                                    SpanAttributes.OUTPUT_VALUE, json.dumps(order_info)
                                )
                                tool_span.set_status(trace.Status(trace.StatusCode.OK))

                    if order_info and "error" not in order_info:
                        print(f"   üì¶ Order info for response: {json.dumps(order_info)}")
                        print("\nüí¨ Step 3: Generating response from tool result...")

                        final_response = client.chat.completions.create(
                            model="gpt-4o-mini",
                            messages=[
                                {
                                    "role": "system",
                                    "content": "You are a helpful customer support agent. Summarize order information in a friendly way. Use the exact data provided - do not make up information.",
                                },
                                {
                                    "role": "user",
                                    "content": f"""Customer asked: "{user_query}"

Here is the order information I found:
- Order ID: {order_info["orderId"]}
- Status: {order_info["status"]}
- Carrier: {order_info["carrier"]}
- Tracking Number: {order_info["trackingNumber"]}
- Estimated Arrival: {order_info["eta"]}

Write a friendly 2-3 sentence response sharing this information with the customer.""",
                                },
                            ],
                        )

                        response = final_response.choices[0].message.content
                    elif order_info:
                        # The tool ran but the lookup failed. Tell the customer so,
                        # rather than asking for an order ID they already gave
                        # (message.content is empty when the model chose the tool).
                        response = (
                            f"{order_info['error']}. Please double-check the order ID and try again. "
                            "It should look like ORD-XXXXX."
                        )
                    else:
                        # No tool call was made: use the model's own reply, which
                        # normally asks for the missing order ID.
                        response = (
                            message.content
                            or "I'd be happy to help you with your order status. Could you please provide your order ID? It should look like ORD-XXXXX."
                        )

                else:
                    # Handle FAQ with RAG
                    print("\nüìö Step 2: Searching knowledge base (RAG)...")

                    # Embed the query
                    embedding_response = client.embeddings.create(
                        model="text-embedding-ada-002", input=user_query
                    )
                    query_embedding = embedding_response.data[0].embedding

                    # Find relevant FAQs
                    faq_scores = []
                    for faq in FAQ_DATABASE:
                        # Entries that have not been embedded yet cannot be scored.
                        if faq["embedding"]:
                            score = cosine_similarity(query_embedding, faq["embedding"])
                            faq_scores.append((faq, score))

                    # Keep the two highest-scoring FAQs.
                    relevant_faqs = sorted(faq_scores, key=lambda x: x[1], reverse=True)[:2]

                    print("   Found relevant FAQs")

                    # Create a retrieval span to track the retrieval operation
                    with tracer.start_as_current_span(
                        "faq-retrieval",
                        attributes={
                            SpanAttributes.OPENINFERENCE_SPAN_KIND: "RETRIEVER",
                            SpanAttributes.INPUT_VALUE: user_query,
                        },
                    ) as retrieval_span:
                        for i, (faq, score) in enumerate(relevant_faqs):
                            retrieval_span.set_attribute(
                                f"retrieval.documents.{i}.document.id", str(faq["id"])
                            )
                            retrieval_span.set_attribute(
                                f"retrieval.documents.{i}.document.content",
                                f"Q: {faq['question']}\nA: {faq['answer']}",
                            )
                            metadata_str = json.dumps(
                                {"category": faq["category"], "score": float(score)}
                            )
                            retrieval_span.set_attribute(
                                f"retrieval.documents.{i}.document.metadata", metadata_str
                            )

                        retrieval_span.set_status(trace.Status(trace.StatusCode.OK))

                    # Build context
                    rag_context = "\n\n".join(
                        [f"Q: {faq['question']}\nA: {faq['answer']}" for faq, _ in relevant_faqs]
                    )

                    # Generate answer
                    print("\nüí¨ Step 3: Generating response...")

                    rag_result = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[
                            {
                                "role": "system",
                                "content": f"You are a helpful customer support agent. Answer the user's question using ONLY the information provided in the context below. Be friendly and concise.\n\nContext:\n{rag_context}",
                            },
                            {"role": "user", "content": user_query},
                        ],
                    )

                    response = rag_result.choices[0].message.content

                print(f"\nüì§ Response: {response}")
                print("=" * 60)

                agent_span.set_attribute(SpanAttributes.OUTPUT_VALUE, response)
                agent_span.set_status(trace.Status(trace.StatusCode.OK))

                return {
                    "query": user_query,
                    "response": response,
                    "spanId": span_id,
                    "category": category,
                    "sessionId": session_id,
                }
            except Exception as error:
                # Record the failure on the agent span, then let the caller see it.
                agent_span.set_status(trace.Status(trace.StatusCode.ERROR, str(error)))
                raise

    # If we have a session ID, propagate it to all child spans
    if session_id:
        with using_session(session_id):
            return run_agent()

    return run_agent()

In [None]:
# First, make sure FAQ embeddings are initialized
initialize_faq_embeddings()

# Test queries: some name orders that exist in ORDER_DATABASE, one names an
# order that does not (ORD-99999), and the rest are general questions.
queries = [
    "What's the status of order ORD-12345?",
    "How can I get a refund?",
    "Where is my order ORD-67890?",
    "I forgot my password",
    "What's the status of order ORD-99999?",
    "How do I upgrade to premium?",
    "Can you help me with something?",
]

print("=" * 60)
print("Running Support Agent with Test Queries")
print("=" * 60)

# Each query is handled as a stand-alone request (no session ID is passed).
for query in queries:
    result = handle_support_query(query)
    print(f"\n‚úÖ Query processed: {result['category']}")
    print("-" * 60)

print("\n‚úÖ All queries processed! Check Phoenix UI to see the traces.")

# Chapter 2: Annotations and Evaluation

## Programmatic Annotations (User Feedback)

### Get the Span ID from Running Code

The support agent already captures span IDs. Now we'll collect user feedback and log it to Phoenix.


In [None]:
from phoenix.client.resources.spans import SpanAnnotationData


def collect_user_feedback(responses: List[AgentResponse]) -> None:
    """
    Ask the user to rate each agent response and log the ratings to Phoenix.

    For every response the query and answer are printed and the user is
    prompted for a thumbs-up, a thumbs-down, or a skip. Each rating becomes a
    HUMAN "user_feedback" annotation on the response's span. Any other input
    counts as a skip. Failures while logging are reported, not raised.

    Args:
        responses: Agent results to review; the query, response, spanId and
            category of each one are used.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("üëçüëé User Feedback Collection")
    print(divider)
    print("\nFor each response, enter:")
    print("  'y' or '1' = üëç thumbs-up (good response)")
    print("  'n' or '0' = üëé thumbs-down (bad response)")
    print("  's' = skip (no feedback)")
    print("")

    # Accepted answer -> (confirmation message, annotation label, annotation score)
    thumbs_up = ("   ‚Üí üëç Recorded as thumbs-up\n", "thumbs-up", 1.0)
    thumbs_down = ("   ‚Üí üëé Recorded as thumbs-down\n", "thumbs-down", 0.0)
    verdicts = {
        "y": thumbs_up,
        "1": thumbs_up,
        "yes": thumbs_up,
        "n": thumbs_down,
        "0": thumbs_down,
        "no": thumbs_down,
    }

    annotations: List[SpanAnnotationData] = []

    for position, resp in enumerate(responses, start=1):
        print(f"Response {position} of {len(responses)}")
        print(f'Query: "{resp["query"]}"')
        print(f'Response: "{resp["response"]}"')

        answer = input("Was this response helpful? (y/n/s): ").strip().lower()
        verdict = verdicts.get(answer)

        # Anything that is not a recognised yes/no answer is treated as a skip.
        if verdict is None:
            print("   ‚Üí ‚è≠Ô∏è  Skipped\n")
            continue

        confirmation, label, score = verdict
        print(confirmation)
        annotations.append(
            SpanAnnotationData(
                name="user_feedback",
                span_id=resp["spanId"],
                annotator_kind="HUMAN",
                result={"label": label, "score": score},
                metadata={"category": resp["category"], "source": "interactive_tutorial"},
            )
        )

    # Nothing was rated, so there is nothing to send.
    if not annotations:
        return

    print("-" * 60)

    try:
        phoenix_client.spans.log_span_annotations(
            span_annotations=annotations,
            sync=False,
        )
        print(f"‚úÖ Logged {len(annotations)} feedback annotations to Phoenix")
    except Exception as error:
        print(f"Failed to log feedback: {error}")

Example: Collect feedback on support agent responses


In [None]:
# A short list of queries whose responses will be rated interactively.
queries = [
    "What's the status of order ORD-12345?",
    "How can I get a refund?",
    "I forgot my password",
]

print("=" * 60)
print("Running Support Agent Queries")
print("=" * 60)

# Keep every agent result: the spanId in each one is what lets the feedback be
# attached to the right span.
responses = []
for query in queries:
    result = handle_support_query(query)
    responses.append(result)

# Prompts for one rating per response.
collect_user_feedback(responses)

## LLM-as-a-Judge Evaluations

### Install the Phoenix Evals Package

In [None]:
!pip install arize-phoenix-evals

### Tool Result Evaluator

Did the tool call succeed or return an error? This is a simple code-based check:

In [None]:
# Fetch spans from Phoenix
spans = phoenix_client.spans.get_spans(
    project_identifier="support-bot",
    limit=200,
)

# Filter for tool spans (lookupOrderStatus)
tool_spans = [span for span in spans if span.get("name") == "lookupOrderStatus"]

print(f"Found {len(tool_spans)} tool spans")

# Tool Result Evaluator - code-based check
tool_annotations = []

for span in tool_spans:
    # Access span_id from context
    context = span.get("context", {})
    span_id = context.get("span_id", "") if isinstance(context, dict) else ""

    # An annotation needs a span to attach to, so skip spans without an ID
    # instead of producing an annotation with an empty span_id.
    if not span_id:
        print("   Skipping tool span - missing span ID")
        continue

    # Access attributes (may be a dict or JSON string)
    attributes = span.get("attributes", {})
    if isinstance(attributes, str):
        attributes = json.loads(attributes)

    output_value = attributes.get("output.value", "")

    # Simple check: does the output contain "error" or "not found"?
    output_str = json.dumps(output_value) if not isinstance(output_value, str) else output_value
    output_lower = output_str.lower()
    has_error = "error" in output_lower or "not found" in output_lower

    tool_annotations.append(
        SpanAnnotationData(
            name="tool_result",
            span_id=span_id,
            annotator_kind="CODE",
            result={
                "label": "error" if has_error else "success",
                "score": 0.0 if has_error else 1.0,
            },
            metadata={
                "evaluator": "tool_result",
                "type": "code",
            },
        )
    )

print(f"\n‚úÖ Evaluated {len(tool_annotations)} tool spans")

### Retrieval Relevance Evaluator

Was the retrieved context actually relevant to the question?

In [None]:
from phoenix.evals import LLM, create_classifier

# Filter for retrieval spans (RETRIEVER kind) - FAQ retrieval.
# `spans` is the list fetched in the tool-result evaluator cell above.
retrieval_spans = [
    span
    for span in spans
    if span.get("span_kind") == "RETRIEVER" or span.get("name") == "faq-retrieval"
]

print(f"Found {len(retrieval_spans)} FAQ retrieval spans")

# One name for the judge model so the annotation metadata cannot drift from the
# model that actually produced the verdict.
RETRIEVAL_JUDGE_MODEL = "gpt-5"

# Create an LLM-as-a-Judge evaluator that determines if retrieved context was relevant
llm = LLM(provider="openai", model=RETRIEVAL_JUDGE_MODEL)

retrieval_relevance_evaluator = create_classifier(
    name="retrieval_relevance",
    prompt_template="""You are evaluating whether the retrieved context is relevant to answering the user's prompt.

Classify the retrieval as:
- RELEVANT: The context contains information that directly helps answer the question
- IRRELEVANT: The context does NOT contain useful information for the question

You are comparing the "Context" object and the "prompt" object.

[Context and Prompt]: {input}""",
    llm=llm,
    choices={"relevant": 1, "irrelevant": 0},
)

# Evaluate each retrieval span
rag_annotations = []

for span in retrieval_spans:
    # The span ID is read from the span's "context" mapping.
    context = span.get("context", {})
    span_id = context.get("span_id", "") if isinstance(context, dict) else ""

    if not span_id:
        # Skip before calling the judge: the verdict could not be attached to a span.
        print("   Skipping retrieval span - missing span_id")
        continue

    span_id_short = span_id[:8]

    # Attributes may be a dict or a JSON string; treat a missing/None value as empty.
    attributes = span.get("attributes") or {}
    if isinstance(attributes, str):
        attributes = json.loads(attributes)

    # Extract the query
    query = attributes.get("input.value", "")

    # Extract retrieved documents: keys are numbered consecutively from 0, so stop
    # at the first index that is absent.
    documents = []
    doc_index = 0
    while f"retrieval.documents.{doc_index}.document.content" in attributes:
        documents.append(attributes[f"retrieval.documents.{doc_index}.document.content"])
        doc_index += 1

    if not query or not documents:
        print(f"   Skipping span {span_id_short} - missing query or documents")
        continue

    # Build input for evaluator: query + retrieved context
    context_text = "\n\n".join(documents)
    evaluation_input = f"Query: {query}\n\nRetrieved Context:\n{context_text}"

    # Best-effort per span: one failed judge call must not discard earlier verdicts.
    try:
        result = retrieval_relevance_evaluator.evaluate({"input": evaluation_input})
        score_result = result[0] if isinstance(result, list) else result

        is_relevant = score_result.label == "relevant"
        status = "‚úÖ RELEVANT" if is_relevant else "‚ùå IRRELEVANT"
        print(f"   Retrieval span {span_id_short}... {status}")

        # Use the evaluator's numeric score when it has one; derive it from the label
        # when the attribute is missing OR present but None.
        score = getattr(score_result, "score", None)
        if score is None:
            score = 1.0 if is_relevant else 0.0

        rag_annotations.append(
            SpanAnnotationData(
                name="retrieval_relevance",
                span_id=span_id,
                annotator_kind="LLM",
                result={
                    "label": score_result.label,
                    "score": score,
                },
                metadata={
                    "model": RETRIEVAL_JUDGE_MODEL,
                    "evaluator": "retrieval_relevance",
                },
            )
        )
    except Exception as e:
        print(f"   Error evaluating span {span_id_short}: {e}")

print(f"\n‚úÖ Evaluated {len(rag_annotations)} retrieval spans")

### Log Evaluations to Phoenix

Log all evaluation annotations to Phoenix:

In [None]:
# Merge the code-based and LLM-based span annotations into one batch.
all_eval_annotations = [*tool_annotations, *rag_annotations]

if not all_eval_annotations:
    # Nothing was produced by the evaluator cells above.
    print("\n‚ö†Ô∏è  No annotations to log. Make sure you've run the support agent first.")
else:
    print(f"\nüì§ Logging {len(all_eval_annotations)} evaluation annotations to Phoenix...")

    try:
        # sync=False is forwarded as-is (presumably fire-and-forget; confirm in client docs).
        phoenix_client.spans.log_span_annotations(
            span_annotations=all_eval_annotations,
            sync=False,
        )
        # Summary: total first, then the per-evaluator breakdown.
        for summary_line in (
            f"‚úÖ Logged {len(all_eval_annotations)} evaluation annotations",
            f"   - {len(tool_annotations)} tool_result annotations",
            f"   - {len(rag_annotations)} retrieval_relevance annotations",
        ):
            print(summary_line)
    except Exception as error:
        # Report the failure instead of aborting the notebook run.
        print(f"‚ùå Failed to log evaluations: {error}")

# Chapter 3: Sessions

## Setting Up Sessions

### Add Session Tracking to Your Agent

The support agent already supports sessions. Here's how to run multi-turn conversations.


In [None]:
class ConversationTurn(TypedDict):
    """One scripted user turn of a demo conversation."""

    # Text sent to the support agent as the user's message for this turn.
    userMessage: str
    # Human-readable note on what the agent should do; run_multi_turn_conversation
    # only prints it next to the turn.
    expectedBehavior: str


class ConversationScenario(TypedDict):
    """A named, scripted multi-turn conversation that is run under a single session."""

    # Short title; printed in the conversation banner and in the sessions summary.
    name: str
    # One-line explanation of what the scenario exercises; printed in the banner.
    description: str
    # Turns in the order they are played.
    turns: List[ConversationTurn]


def run_multi_turn_conversation(scenario: ConversationScenario) -> Dict[str, Any]:
    """Play one scripted scenario through the support agent under a single session.

    A fresh UUID4 session ID is generated per call and handed to every
    ``handle_support_query`` invocation, together with the running conversation
    history and a small session context (last mentioned order ID, turn count).

    Args:
        scenario: The scenario to play; its ``turns`` are executed in order.

    Returns:
        ``{"sessionId": <session id>, "responses": [<agent result per turn>]}``.
    """
    order_id_pattern = re.compile(r"ORD-\d+", re.IGNORECASE)
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    session_id = str(uuid.uuid4())
    responses: List[AgentResponse] = []
    conversation_history: List[Message] = []
    session_context: SessionContext = {"lastMentionedOrderId": None, "turnCount": 0}

    # Conversation banner
    print("\n" + heavy_rule)
    print(f"üó£Ô∏è  Conversation: {scenario['name']}")
    print(f"üìù {scenario['description']}")
    print(f"üîë Session ID: {session_id}")
    print(heavy_rule)

    for turn in scenario["turns"]:
        user_message = turn["userMessage"]
        print(f'\nüí¨ Turn {session_context["turnCount"] + 1}: "{user_message}"')
        print(f"   Expected: {turn['expectedBehavior']}")

        # The agent sees the history and context as they were BEFORE this turn.
        result = handle_support_query(
            user_message, session_id, conversation_history, session_context
        )
        responses.append(result)

        # Record both sides of the exchange for the next turn.
        agent_reply = result["response"]
        conversation_history.append({"role": "user", "content": user_message})
        conversation_history.append({"role": "assistant", "content": agent_reply})

        # Remember the latest order ID: the user's message takes precedence, the
        # agent's reply is only searched when the message contains none.
        found = order_id_pattern.search(user_message) or order_id_pattern.search(agent_reply)
        if found:
            session_context["lastMentionedOrderId"] = found.group(0).upper()
        session_context["turnCount"] += 1

    print("\n" + light_rule)
    print(f"‚úÖ Conversation complete: {len(scenario['turns'])} turns")
    print(light_rule)

    return {"sessionId": session_id, "responses": responses}


def run_sessions_demo() -> None:
    """Play three scripted conversations, flush traces, collect feedback, print a summary.

    Each scenario is run via ``run_multi_turn_conversation`` and therefore gets its
    own session ID. All agent responses are pooled and passed to
    ``collect_user_feedback`` once the tracer provider has been flushed.
    """
    heavy_rule = "=" * 60

    def scripted_turn(user_message: str, expected_behavior: str) -> Dict[str, str]:
        """Build one turn entry for a scenario."""
        return {"userMessage": user_message, "expectedBehavior": expected_behavior}

    for intro_line in (
        heavy_rule,
        "Phoenix Tracing Tutorial - Sessions Demo",
        heavy_rule,
        "\nThis demo shows multi-turn conversations tracked as sessions.",
        "Each conversation has a unique session ID that links all turns together.",
        "View them in Phoenix UI under the 'Sessions' tab.\n",
    ):
        print(intro_line)

    # FAQ embeddings are initialised before any scenario runs.
    initialize_faq_embeddings()

    # Scripted conversations: an order inquiry, an FAQ-only chat, and a mix of both.
    scenarios: List[ConversationScenario] = [
        {
            "name": "Order Inquiry - Successful Resolution",
            "description": "Customer asks about order, gets status, asks follow-up",
            "turns": [
                scripted_turn(
                    "What's the status of order ORD-12345?",
                    "Tool call ‚Üí Returns shipped status",
                ),
                scripted_turn(
                    "When will it arrive?",
                    "Agent remembers order ‚Üí Provides ETA from previous lookup",
                ),
                scripted_turn(
                    "What's the tracking number?",
                    "Agent remembers order ‚Üí Provides tracking number",
                ),
            ],
        },
        {
            "name": "FAQ Conversation",
            "description": "Customer asks multiple FAQ questions in one session",
            "turns": [
                scripted_turn(
                    "How do I reset my password?",
                    "RAG ‚Üí Password reset instructions",
                ),
                scripted_turn(
                    "And what about refunds?",
                    "RAG ‚Üí Refund policy info",
                ),
            ],
        },
        {
            "name": "Mixed Conversation - Context Test",
            "description": "Customer switches between order and FAQ topics",
            "turns": [
                scripted_turn(
                    "Check my order ORD-67890",
                    "Tool call ‚Üí Processing status",
                ),
                scripted_turn(
                    "How do I cancel my subscription?",
                    "RAG ‚Üí Cancellation instructions (different topic)",
                ),
                scripted_turn(
                    "Back to my order - what's the carrier?",
                    "Agent should remember ORD-67890 from earlier",
                ),
            ],
        },
    ]

    # Play every scenario, pooling responses and remembering each session ID.
    all_responses: List[AgentResponse] = []
    session_ids: List[str] = []

    for scenario in scenarios:
        outcome = run_multi_turn_conversation(scenario)
        all_responses.extend(outcome["responses"])
        session_ids.append(outcome["sessionId"])

    # Flush traces before feedback is collected.
    print("\n‚è≥ Flushing traces...")
    tracer_provider.force_flush()
    print("‚úÖ Traces flushed!")

    collect_user_feedback(all_responses)

    # Summary: counts, then one line per session paired with its scenario name.
    print("\n" + heavy_rule)
    print("üìä Sessions Summary")
    print(heavy_rule)
    print(f"\n   Conversations: {len(scenarios)}")
    print(f"   Total turns: {len(all_responses)}")
    print("\n   Session IDs:")
    for position, (session_id, scenario) in enumerate(zip(session_ids, scenarios), start=1):
        print(f"   {position}. {session_id} ({scenario['name']})")

    # Pointers for exploring the results in the Phoenix UI.
    print("\n" + heavy_rule)
    print("## Viewing and Analyzing Sessions")
    print(heavy_rule)
    for hint_line in (
        "\nWhat to look for:",
        "   1. Click the 'Sessions' tab in your project",
        "   2. You'll see each conversation as a separate session",
        "   3. Click into a session to see the chatbot-like history",
        "   4. Notice how all turns share the same session ID",
        "   5. Check token usage and latency across the conversation",
        heavy_rule,
    ):
        print(hint_line)

## Running Multi-Turn Conversations

Run the multi-turn sessions demo to see how conversations are tracked as cohesive units.


In [None]:
# Plays every scripted scenario, flushes the tracer provider, collects feedback,
# and prints the generated session IDs (see run_sessions_demo above).
run_sessions_demo()

### Session-Level Evaluations

Instead of manually reviewing every session, use LLM-as-a-Judge evaluation to automatically assess entire conversations. This helps answer questions like: Does the agent remember context? Are issues getting resolved? Where do conversations break down?

#### Conversation Coherence Evaluator

This evaluator checks if the agent maintained context throughout the conversation:

In [None]:
from phoenix.evals import LLM, create_classifier

# Judge model for the session-level evaluators; the resolution evaluator in the
# next cell reuses this same `llm` object.
llm = LLM(provider="openai", model="gpt-5")

# Prompt for the coherence judge. `{input}` is the placeholder that receives the
# conversation transcript when the evaluator is called with {"input": ...}.
coherence_prompt_template = """You are evaluating whether a customer support agent maintained context throughout a multi-turn conversation.

A conversation is COHERENT if:
- The agent remembers information from earlier turns
- The agent doesn't ask for information already provided
- Responses build on previous context appropriately
- The conversation flows naturally

A conversation is INCOHERENT if:
- The agent "forgets" things the customer said earlier
- The agent asks for the same information multiple times
- Responses seem disconnected from previous turns
- The customer has to repeat themselves

[Full Conversation]:
{input}

Did the agent maintain context throughout this conversation?"""

# Two-way classifier: "coherent" is paired with 1 and "incoherent" with 0.
conversation_coherence_evaluator = create_classifier(
    name="conversation_coherence",
    llm=llm,
    prompt_template=coherence_prompt_template,
    choices={"coherent": 1, "incoherent": 0},
)

#### Resolution Evaluator

This evaluator determines if the customer's issue was actually resolved:


In [None]:
# Prompt for the resolution judge. `{input}` is the placeholder that receives the
# conversation transcript when the evaluator is called with {"input": ...}.
resolution_prompt_template = """You are evaluating whether a customer's issue was resolved in a support conversation.

The issue is RESOLVED if:
- The customer got the information they needed
- Their question was answered
- The conversation ended with the customer's needs met

The issue is UNRESOLVED if:
- The customer didn't get what they needed
- Questions went unanswered
- The agent couldn't help with the request

[Full Conversation]:
{input}

Was the customer's issue resolved?"""

# Two-way classifier: "resolved" is paired with 1 and "unresolved" with 0.
# Reuses the `llm` judge created in the coherence evaluator cell.
resolution_evaluator = create_classifier(
    name="resolution_status",
    llm=llm,
    prompt_template=resolution_prompt_template,
    choices={"resolved": 1, "unresolved": 0},
)

### Running Session Evaluations
Fetch spans from Phoenix, group them by session ID, and evaluate each session:

In [None]:
from phoenix.client.resources.sessions import SessionAnnotationData

def _span_attributes(span):
    """Return a span's attributes as a dict, decoding a JSON string if necessary."""
    attributes = span.get("attributes") or {}
    if isinstance(attributes, str):
        attributes = json.loads(attributes)
    return attributes


def _score_value(score_obj, positive_label):
    """Return the evaluator's numeric score, derived from the label when it has none.

    Covers both a missing ``score`` attribute and one that is present but ``None``.
    """
    score = getattr(score_obj, "score", None)
    if score is None:
        score = 1.0 if score_obj.label == positive_label else 0.0
    return score


# Fetch spans for the project (at most `limit` spans are returned by this call)
spans = phoenix_client.spans.get_spans(
    project_identifier="support-bot",
    limit=200,
)

# Filter to agent spans and group by session ID
agent_spans = [span for span in spans if span.get("name") == "support-agent"]

session_groups: Dict[str, List[Any]] = {}
for span in agent_spans:
    # "session.id" is the literal key; spans without it are not part of a session.
    session_id = _span_attributes(span).get("session.id")
    if session_id:
        session_groups.setdefault(session_id, []).append(span)

print(f"Found {len(session_groups)} sessions")

# (annotation name, evaluator, label that maps to a score of 1.0)
session_evaluators = (
    ("conversation_coherence", conversation_coherence_evaluator, "coherent"),
    ("resolution_status", resolution_evaluator, "resolved"),
)

# Evaluate each session
session_annotations = []
evaluated_sessions = 0

for session_id, session_spans in session_groups.items():
    # Sort by turn number (spans without one sort first)
    session_spans.sort(key=lambda s: _span_attributes(s).get("conversation.turn", 0))

    # Build conversation transcript
    transcript_parts = []
    for i, span in enumerate(session_spans):
        attributes = _span_attributes(span)

        input_value = attributes.get("input.value", "")
        output_value = attributes.get("output.value", "")
        turn_num = attributes.get("conversation.turn", i + 1)

        transcript_parts.append(f"Turn {turn_num}:\nUser: {input_value}\nAgent: {output_value}")

    transcript = "\n\n".join(transcript_parts)

    if not transcript.strip():
        continue

    # Best-effort per session: a failed judge call skips this session only, and a
    # session is recorded only when BOTH evaluations succeed, so every logged
    # session carries exactly two annotations.
    try:
        pending_annotations = []
        verdicts = []
        for annotation_name, evaluator, positive_label in session_evaluators:
            raw_result = evaluator.evaluate({"input": transcript})
            score_obj = raw_result[0] if isinstance(raw_result, list) else raw_result
            verdicts.append(f"{annotation_name}={score_obj.label}")

            pending_annotations.append(
                SessionAnnotationData(
                    session_id=session_id,
                    name=annotation_name,
                    annotator_kind="LLM",
                    result={
                        "label": score_obj.label,
                        "score": _score_value(score_obj, positive_label),
                    },
                    metadata={"model": "gpt-5", "turnCount": len(session_spans)},
                )
            )
    except Exception as error:
        print(f"   Error evaluating session {str(session_id)[:8]}: {error}")
        continue

    session_annotations.extend(pending_annotations)
    evaluated_sessions += 1
    print(f"   Session {str(session_id)[:8]}... {', '.join(verdicts)}")

# Report sessions actually evaluated, not merely the number of groups found.
print(f"\n‚úÖ Evaluated {evaluated_sessions} sessions")

### Log Session Annotations

Log all session-level annotations to Phoenix:

In [None]:
if not session_annotations:
    # The session evaluation cell produced nothing to send.
    print(
        "\n‚ö†Ô∏è  No session annotations to log. Make sure you've run multi-turn conversations first."
    )
else:
    print(f"\nüì§ Logging {len(session_annotations)} session annotations to Phoenix...")

    try:
        # sync=False is forwarded as-is (presumably fire-and-forget; confirm in client docs).
        phoenix_client.sessions.log_session_annotations(
            session_annotations=session_annotations,
            sync=False,
        )
        # Two annotations are produced per session, hence the halved session count.
        for summary_line in (
            f"‚úÖ Logged {len(session_annotations)} session annotations",
            f"   - {len(session_annotations) // 2} sessions evaluated",
            "   - Each session has 2 annotations: conversation_coherence and resolution_status",
        ):
            print(summary_line)
    except Exception as error:
        # Report the failure instead of aborting the notebook run.
        print(f"‚ùå Failed to log session annotations: {error}")