# Phoenix Prompts Tutorial - Companion Notebook

This notebook accompanies the Phoenix Prompts Quickstart documentation. Follow along with the docs for detailed explanations.

**Prerequisites:**
- Phoenix running locally (`phoenix serve`)
- OpenAI API key set as `OPENAI_API_KEY` environment variable


In [None]:
import os
from getpass import getpass

# Ask for the OpenAI API key via getpass only when OPENAI_API_KEY is missing
# or empty, and store the answer in this process's environment.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

---
# Part 1: Find and Edit Prompts

## Step 1: Locate Bad Spans in Traces

First, let's build and trace a support agent to generate some traces we can inspect.


In [None]:
# Define the classification system prompt.
# It is a bare list of the 30 category labels (one per line) followed by a
# single instruction to return only the category. handle_support_query sends
# it as the system message of its classification call.
system_prompt = """
Account Creation
Login Issues
Password Reset
Two-Factor Authentication
Profile Updates
Billing Inquiry
Refund Request
Subscription Upgrade/Downgrade
Payment Method Update
Invoice Request
Order Status
Shipping Delay
Product Return
Warranty Claim
Technical Bug Report
Feature Request
Integration Help
Data Export
Security Concern
Terms of Service Question
Privacy Policy Question
Compliance Inquiry
Accessibility Support
Language Support
Mobile App Issue
Desktop App Issue
Email Notifications
Marketing Preferences
Beta Program Enrollment
General Feedback

Return just the category, no other text for the support query.
"""

In [None]:
import json

import requests

# Download the guideline lookup used by retrieve_guidelines (it is queried by
# classification label) and keep a local copy next to the notebook.
url = "https://storage.googleapis.com/arize-phoenix-assets/assets/images/guidelines.json"
response = requests.get(url, timeout=30)
# Fail fast on an HTTP error instead of saving an error page to disk and
# hitting a confusing JSON decode error a few lines later.
response.raise_for_status()

with open("guidelines.json", "wb") as f:
    f.write(response.content)

# Read back with an explicit encoding so the result does not depend on the
# platform's default text encoding.
with open("guidelines.json", "r", encoding="utf-8") as f:
    guidelines = json.load(f)

print("✅ Loaded guidelines.json")

In [None]:
# Build and Trace Support Agent
from openai import OpenAI

from phoenix.otel import register

# Setup Phoenix tracing with auto-instrumentation for OpenAI.
# Spans are registered under the "support-agent" project.
tracer_provider = register(project_name="support-agent", auto_instrument=True)
# `tracer` supplies the @tracer.tool / @tracer.chain decorators used below.
tracer = tracer_provider.get_tracer(__name__)

# Synchronous OpenAI client used by handle_support_query.
# Presumably picks up OPENAI_API_KEY from the environment - confirm.
client = OpenAI()


@tracer.tool
def retrieve_guidelines(classification: str) -> str:
    """Retrieve guidelines based on the support query classification.

    The label is looked up as-is in the module-level ``guidelines`` mapping;
    an unknown label yields the literal fallback text.
    """
    fallback = "No guidelines found."
    return guidelines.get(classification, fallback)


@tracer.chain
def handle_support_query(query: str) -> str:
    """Answer a support query in three steps: classify, look up, respond.

    Both model calls use "gpt-4o-mini" through the module-level ``client``.
    The raw text of the first completion is used as the classification label.
    """
    # Step 1: ask the model which category the query belongs to
    classifier_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]
    classification_completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=classifier_messages,
    )
    label = classification_completion.choices[0].message.content

    # Step 2: fetch the guideline entry stored for that label
    guideline = retrieve_guidelines(label)

    # Step 3: answer the original query with the guideline as system context
    responder_instruction = (
        f"Respond to the support query using the following guidelines:\n{guideline}"
    )
    responder_messages = [
        {"role": "system", "content": responder_instruction},
        {"role": "user", "content": query},
    ]
    answer_completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=responder_messages,
    )
    return answer_completion.choices[0].message.content

In [None]:
# Run the agent on some test queries to generate traces.
# The queries are deliberately short and informal.
queries = [
    "warranty reg page says 404",
    "every time i click settings, bye",
    "when's dark mode? u said soon",
    "calendar sync eats my events",
    "cant dl my info, button grayed",
]

for query in queries:
    print(f"Query: {query}")
    # Each call runs the classify -> retrieve -> respond steps of handle_support_query
    result = handle_support_query(query)
    # Only the first 200 characters are shown; "..." is appended regardless of length
    print(f"Response: {result[:200]}...")
    print("-" * 50)

## Step 2: Replay Span and Edit Prompt in Playground

Open the Phoenix UI and navigate to your traces. Click on a span and use the **Playground** to:
1. Save the original prompt as `support-classifier`
2. Edit the prompt and test changes
3. Save the edited version as Version 2

## Step 3: Load Edited Prompt Back Into Your Code


In [None]:
from phoenix.client import Client

px_client = Client()

# Pull the latest version
prompt = px_client.prompts.get(prompt_identifier="support-classifier")

# Or pull a specific version
# prompt = px_client.prompts.get(prompt_version_id="YOUR_VERSION_ID")

# The value shown is the model configured on the prompt version, so the
# message labels it as such instead of presenting it as the prompt itself.
print(f"Loaded prompt 'support-classifier' (model: {prompt._model_name})")

---
# Part 2: Test Prompts at Scale

## Step 1: Load Dataset of Inputs


In [None]:
import pandas as pd

from phoenix.client import Client

# Phoenix client used by the dataset upload in the next cell
px_client = Client()

# Load our support query dataset (read directly from the hosted CSV)
support_query_csv_url = (
    "https://storage.googleapis.com/arize-phoenix-assets/assets/images/support_queries.csv"
)
support_query_df = pd.read_csv(support_query_csv_url)

# Quick sanity check: row count and the first few rows
print(f"Loaded {len(support_query_df)} examples")
print(support_query_df.head())

In [None]:
# Upload dataset to Phoenix.
# The "query" column becomes the example input and "ground_truth" the
# expected output; the task and evaluators below read those same keys.
support_query_dataset = px_client.datasets.create_dataset(
    dataframe=support_query_df,
    name="support-query-dataset",
    input_keys=["query"],
    output_keys=["ground_truth"],
)

print(f"✅ Created dataset: {support_query_dataset.id}")

## Step 2: Run Experiment with Our Current Prompt

### Define Task Function


In [None]:
from openai import AsyncOpenAI

from phoenix.client import Client

# Async OpenAI client awaited inside the task function below
async_openai_client = AsyncOpenAI()
px_client = Client()

prompt = px_client.prompts.get(prompt_identifier="support-classifier")
# `model` and `messages` are module-level names read by the task function below
model = prompt._model_name
# No copy is made: `messages` is the very list stored in prompt._template
messages = prompt._template["messages"]

# Edit user prompt to match dataset input key "query"
# (in-place edit, so prompt._template is changed as well)
messages[1]["content"][0]["text"] = "{{query}}"


async def task(input):
    """Classify one dataset example with the baseline prompt.

    Reads the module-level ``messages`` and ``model``, rebuilds the message
    list so the shared template is left untouched, replaces the text of the
    second message with ``input["query"]`` and returns the completion text.
    """
    task_messages = []
    for template_message in messages:
        role = template_message["role"]
        text = template_message["content"][0]["text"]
        task_messages.append({"role": role, "content": [{"type": "text", "text": text}]})

    # The second message is the user turn; it carries the example's query
    task_messages[1]["content"][0]["text"] = input["query"]

    completion = await async_openai_client.chat.completions.create(
        model=model,
        messages=task_messages,
    )
    return completion.choices[0].message.content


# Show which model the baseline task will call
print(f"Task defined with model: {model}")

### Define Evaluators


In [None]:
# Analysis evaluator template for rich feedback.
# The {query}, {output} and {ground_truth} placeholders are filled in by
# analysis_evaluator before the text is sent to the evaluator LLM.
analysis_evaluator_template = """
You are an expert support ticket classifier evaluator.

Your task: Given a user query, the predicted classification from a model, and the correct classification, decide if the prediction is correct, explain why, identify possible confusion reasons, highlight the exact part(s) of the query that best support the correct classification, and (if incorrect) label the type of error made.

Here are the available classes:

Account Creation, Login Issues, Password Reset, Two-Factor Authentication, Profile Updates,
Billing Inquiry, Refund Request, Subscription Upgrade/Downgrade, Payment Method Update, Invoice Request,
Order Status, Shipping Delay, Product Return, Warranty Claim, Technical Bug Report, Feature Request,
Integration Help, Data Export, Security Concern, Terms of Service Question, Privacy Policy Question,
Compliance Inquiry, Accessibility Support, Language Support, Mobile App Issue, Desktop App Issue,
Email Notifications, Marketing Preferences, Beta Program Enrollment, General Feedback

---

**Inputs:**
- Query: {query}
- Predicted classification: {output}
- Correct classification: {ground_truth}

---

**Error Type Definitions**:
- **broad_vs_specific** → The model picked a broader category instead of the more specific correct one (or vice versa).
- **keyword_bias** → The model latched onto an isolated keyword that led to the wrong class.
- **multi_intent_confusion** → The query had multiple possible intents; model picked the less dominant one.
- **ambiguous_query** → The query was unclear or underspecified.
- **off_topic** → The query doesn't match any class well; model still guessed.
- **paraphrase_gap** → The model failed to recognize a non-standard phrasing of the correct intent.
- **other** → Any other reason.
- **none** → Use only if correctness is "correct".

---

**Output Format (JSON)**:
  "correctness": "correct" or "incorrect",
  "explanation": "Brief explanation of why the predicted classification is correct or incorrect.",
  "confusion_reason": "If incorrect, explain why the model may have made this choice. If correct, say 'no confusion'.",
  "error_type": "One of the error types above. Use 'none' if correct.",
  "evidence_span": "Exact phrase(s) from the query that strongly indicate the correct classification.",
  "prompt_fix_suggestion": "One clear instruction to add to the classifier prompt to prevent this error."
"""

In [None]:
from phoenix.evals import create_evaluator
from phoenix.evals.llm import LLM

# LLM handle used by analysis_evaluator to generate its structured verdict
llm = LLM(provider="openai", model="gpt-4.1")


def normalize(label):
    """Canonicalize a classification label so two labels can be compared.

    Surrounding whitespace and single/double quotes are removed (in any
    nesting order), whitespace left just inside the quotes is removed too,
    and the result is lower-cased.

    Args:
        label: The label to clean up. ``None`` is accepted and non-string
            values are converted with ``str()`` so a missing model output
            does not crash the comparison.

    Returns:
        The cleaned, lower-cased label; ``""`` when ``label`` is ``None``.
    """
    if label is None:
        return ""
    # Strip both quote characters in one call so '"x"' wrapped in the other
    # quote kind is handled, then strip again for spaces inside the quotes.
    return str(label).strip().strip("\"'").strip().lower()


async def ground_truth_evaluator(expected, output):
    """Code evaluator: True when the task output equals the expected label.

    Both sides are passed through ``normalize`` before the comparison.
    """
    reference_label = normalize(expected.get("ground_truth"))
    predicted_label = normalize(output)
    return reference_label == predicted_label


# JSON schema for the structured verdict requested from the evaluator LLM.
# Every property is required and no extra properties are allowed.
SCHEMA = {
    "type": "object",
    "properties": {
        "correctness": {"type": "string", "enum": ["correct", "incorrect"]},
        "explanation": {"type": "string"},
        "confusion_reason": {"type": "string"},
        # Restricted to the error types defined in analysis_evaluator_template
        # so that filtering results by error type sees consistent labels.
        "error_type": {
            "type": "string",
            "enum": [
                "broad_vs_specific",
                "keyword_bias",
                "multi_intent_confusion",
                "ambiguous_query",
                "off_topic",
                "paraphrase_gap",
                "other",
                "none",
            ],
        },
        "evidence_span": {"type": "string"},
        "prompt_fix_suggestion": {"type": "string"},
    },
    "required": [
        "correctness",
        "explanation",
        "confusion_reason",
        "error_type",
        "evidence_span",
        "prompt_fix_suggestion",
    ],
    "additionalProperties": False,
}


@create_evaluator(name="output_evaluator", kind="llm")
def analysis_evaluator(input, expected, output):
    """LLM evaluator: provides rich feedback on classification errors.

    Fills ``analysis_evaluator_template`` with the example's query, the
    expected label and the task output, asks ``llm`` for an object matching
    ``SCHEMA`` and flattens the answer.

    Args:
        input: Example input; its "query" entry is read.
        expected: Example reference output; its "ground_truth" entry is read.
        output: The task's predicted classification.

    Returns:
        A dict with "score" (1.0 when the verdict is "correct", else 0.0),
        "label" (the verdict) and "explanation" (all verdict fields joined
        as "key: value; " pairs).
    """
    import re  # local import keeps this cell self-contained

    values = {
        "query": input.get("query"),
        "ground_truth": expected.get("ground_truth"),
        "output": output,
    }

    def fill(match):
        value = values[match.group(1)]
        # A missing value (e.g. no model output) becomes an empty string
        # instead of raising TypeError
        return "" if value is None else str(value)

    # Fill every placeholder in ONE pass over the template. Chained
    # str.replace calls would also rewrite placeholder-looking text inside a
    # value that was just inserted (e.g. a query containing "{output}").
    prompt = re.sub(r"\{(query|ground_truth|output)\}", fill, analysis_evaluator_template)

    obj = llm.generate_object(prompt=prompt, schema=SCHEMA)
    correctness = obj["correctness"]
    score = 1.0 if correctness == "correct" else 0.0
    # Keep this "key: value; " layout stable: process_experiment parses it.
    explanation = (
        f"correctness: {correctness}; "
        f"explanation: {obj.get('explanation', '')}; "
        f"confusion_reason: {obj.get('confusion_reason', '')}; "
        f"error_type: {obj.get('error_type', '')}; "
        f"evidence_span: {obj.get('evidence_span', '')}; "
        f"prompt_fix_suggestion: {obj.get('prompt_fix_suggestion', '')};"
    )
    return {"score": score, "label": correctness, "explanation": explanation}


# Confirmation that both evaluators above were defined without error
print("✅ Evaluators defined")

### Run Experiment


In [None]:
from phoenix.client.experiments import async_run_experiment

# Run the baseline task over every example of the uploaded dataset and score
# each output with both evaluators. Top-level `await` requires a notebook /
# async-capable REPL.
experiment = await async_run_experiment(
    dataset=support_query_dataset,
    task=task,
    evaluators=[ground_truth_evaluator, analysis_evaluator],
    experiment_name="support-classifier-baseline",
)

## Step 3: Analyze Experiment Results

Navigate to the Phoenix UI to view experiment results. Filter for incorrect classifications:
```
evals["output_evaluator"].score == 0
```

Filter for broad_vs_specific errors:
```
'broad_vs_specific' in evals["output_evaluator"].explanation
```


---
# Part 3: Compare Prompt Versions

## Edit Prompt Template (Version 3)

Based on our analysis, we'll add an instruction to address broad_vs_specific errors.


In [None]:
from phoenix.client import Client
from phoenix.client.types.prompts import PromptVersion

px_client = Client()

# New instruction to address broad_vs_specific errors
broad_vs_specific_instruction = """When classifying user queries, always prefer the most specific applicable category over a broader one. If a query mentions a clear, concrete action or object (e.g., subscription downgrade, invoice, profile name), classify it under that specific intent rather than a general one (e.g., Billing Inquiry, General Feedback)."""

# Get existing prompt
existing = px_client.prompts.get(prompt_identifier="support-classifier")

# Modify the template: append the new rule to the system message text.
# A dedicated name is used on purpose: the baseline `task` function reads the
# module-level `messages`, so rebinding that name here would silently change
# what `task` sends if the baseline experiment were re-run afterwards.
v3_messages = existing._template["messages"]
v3_messages[0]["content"][0]["text"] += "\n\n" + broad_vs_specific_instruction

# Create new version with modifications (model and format are carried over)
new_version = PromptVersion(
    v3_messages,
    model_name=existing._model_name,
    model_provider=existing._model_provider,
    template_format=existing._template_format,
    description="Added broad_vs_specific rule",
)

# Save as new version
version_3 = px_client.prompts.create(
    name="support-classifier",
    version=new_version,
)

print(f"✅ Created Version 3: {version_3.id}")

## Edit Prompt Parameters (Version 4)

Now let's create another version with adjusted model parameters.


In [None]:
# Get existing prompt (fresh copy).
# Fetched by identifier, so this is presumably the latest saved version,
# i.e. the one created in the previous cell if it was run - confirm in the UI.
existing = px_client.prompts.get(prompt_identifier="support-classifier")

# Same messages, different model
new_version = PromptVersion(
    existing._template["messages"],
    model_name="gpt-4.1-mini",
    model_provider=existing._model_provider,
    template_format="MUSTACHE",
    description="Using temperature=0.3, top_p=0.8, model_name=gpt-4.1-mini",
)

# Set invocation parameters
# NOTE(review): this assigns a private attribute with a flat dict; confirm
# this is the shape the Phoenix client expects for invocation parameters.
new_version._invocation_parameters = {
    "temperature": 0.3,
    "top_p": 0.8,
}

version_4 = px_client.prompts.create(
    name="support-classifier",
    version=new_version,
)

print(f"✅ Created Version 4: {version_4.id}")

## Compare Prompt Versions

Copy the version IDs from the Phoenix UI and run experiments to compare.


In [None]:
from openai import AsyncOpenAI

from phoenix.client import Client
from phoenix.client.experiments import async_run_experiment

px_client = Client()
# Async OpenAI client awaited by the tasks built with create_task below
async_openai_client = AsyncOpenAI()

# Get dataset (looked up by the name it was uploaded under)
dataset = px_client.datasets.get_dataset(dataset="support-query-dataset")

# Version IDs - REPLACE WITH YOUR VERSION IDs FROM PHOENIX UI
# (the placeholder strings below are not valid IDs)
VERSION_3 = "REPLACE_WITH_VERSION_3_ID"
VERSION_4 = "REPLACE_WITH_VERSION_4_ID"

# Get prompt versions
prompt_v3 = px_client.prompts.get(prompt_version_id=VERSION_3)
prompt_v4 = px_client.prompts.get(prompt_version_id=VERSION_4)

# Show the model configured on each version
print(f"Version 3 model: {prompt_v3._model_name}")
print(f"Version 4 model: {prompt_v4._model_name}")

In [None]:
# Define reusable task factory
def create_task(prompt):
    """Build an experiment task bound to one prompt version.

    Only ``prompt._model_name`` and the messages in
    ``prompt._template["messages"]`` are read. Nothing else stored on the
    prompt (for example ``_invocation_parameters``) is passed to the
    completion call.

    Returns:
        An async function ``task(input)`` that sends the prompt's messages,
        with the second message's text replaced by ``input["query"]``, and
        returns the completion text.
    """
    bound_model = prompt._model_name
    template_messages = prompt._template["messages"].copy()

    async def task(input):
        # Rebuild each message from scratch so that writing the query below
        # never touches the prompt's own template
        task_messages = []
        for template_message in template_messages:
            role = template_message["role"]
            text = template_message["content"][0]["text"]
            task_messages.append({"role": role, "content": [{"type": "text", "text": text}]})

        # The second message is the user turn; it carries the example's query
        task_messages[1]["content"][0]["text"] = input["query"]

        completion = await async_openai_client.chat.completions.create(
            model=bound_model,
            messages=task_messages,
        )
        return completion.choices[0].message.content

    return task

In [None]:
# Run experiment with Version 3
print("ðŸ§ª Running experiment with Version 3...")
experiment_v3 = await async_run_experiment(
    dataset=dataset,
    task=create_task(prompt_v3),
    evaluators=[ground_truth_evaluator, analysis_evaluator],
    experiment_name="support-classifier-v3",
)

# Run experiment with Version 4
print("\nðŸ§ª Running experiment with Version 4...")
experiment_v4 = await async_run_experiment(
    dataset=dataset,
    task=create_task(prompt_v4),
    evaluators=[ground_truth_evaluator, analysis_evaluator],
    experiment_name="support-classifier-v4",
)

print(f"\nâœ… Compare results at: http://localhost:6006/datasets/{dataset.id}/experiments")

---
# Part 4: Optimize with Prompt Learning

## Install the Prompt Learning SDK

```bash
git clone https://github.com/priyanjindal/prompt-learning.git
cd prompt-learning
pip install .
```

## Load Experiment for Training


In [None]:
import os

import pandas as pd
import requests


def _parse_feedback(explanation, feedback_columns):
    """Parse ``"key: value; key: value;"`` evaluator text into a dict.

    Only keys listed in ``feedback_columns`` are kept. A ';'-separated
    segment starts a new field when the text before its first ':' is a
    requested key or a bare identifier; any other segment is treated as the
    continuation of the current value, so free text that itself contains
    ';' or ':' is not cut short.
    """
    wanted = set(feedback_columns)
    pieces = {}
    current = None  # requested key being accumulated; None while skipping
    for segment in explanation.split(";"):
        if not segment.strip():
            continue
        head, colon, tail = segment.partition(":")
        key = head.strip()
        if colon and (key in wanted or key.isidentifier()):
            current = key if key in wanted else None
            if current is not None:
                pieces[current] = [tail]
        elif current is not None:
            pieces[current].append(segment)
    return {key: ";".join(parts).strip() for key, parts in pieces.items()}


def process_experiment(experiment_id, feedback_columns=None, evaluator_name="output_evaluator"):
    """
    Fetch experiment data from Phoenix API and process it into a DataFrame.

    The base URL comes from PHOENIX_COLLECTOR_ENDPOINT (default
    http://localhost:6006); when PHOENIX_API_KEY is set it is sent as a
    bearer token.

    Args:
        experiment_id: The Phoenix experiment ID
        feedback_columns: List of feedback field names to extract from annotations
        evaluator_name: Name of the annotation whose explanation holds the
            feedback fields (defaults to the name given to analysis_evaluator)

    Returns:
        pd.DataFrame: One row per experiment entry with "input", "output",
        "ground_truth", "query" (when the input is a dict) and one column per
        feedback field that was found.

    Raises:
        RuntimeError: If the API answers with a status other than 200.
    """
    base_url = os.environ.get("PHOENIX_COLLECTOR_ENDPOINT", "http://localhost:6006")
    # Tolerate a trailing slash in the configured endpoint
    url = f"{base_url.rstrip('/')}/v1/experiments/{experiment_id}/json"
    headers = {}
    if os.environ.get("PHOENIX_API_KEY"):
        headers["Authorization"] = f"Bearer {os.environ['PHOENIX_API_KEY']}"

    # A timeout keeps the cell from hanging forever on an unreachable server
    response = requests.get(url, headers=headers, timeout=60)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch experiment data: {response.status_code} {response.text}"
        )

    results = response.json()

    # Build DataFrame from experiment results
    data = []
    for entry in results:
        entry_input = entry.get("input", {})
        # `or {}` also covers entries whose reference_output is null
        reference = entry.get("reference_output") or {}
        row = {
            "input": entry_input,
            "output": entry.get("output"),
            "ground_truth": reference.get("ground_truth"),
        }
        # Extract query from input
        if isinstance(entry_input, dict):
            row["query"] = entry_input.get("query", "")

        # Extract feedback from the first annotation written by the evaluator
        if feedback_columns:
            for annotation in entry.get("annotations") or []:
                if annotation.get("name") == evaluator_name:
                    explanation = annotation.get("explanation") or ""
                    row.update(_parse_feedback(explanation, feedback_columns))
                    break

        data.append(row)

    return pd.DataFrame(data)

In [None]:
# REPLACE with your Version 4 experiment ID from Phoenix UI
# (the placeholder string below is not a valid ID)
EXPERIMENT_V4_ID = "REPLACE_WITH_EXPERIMENT_V4_ID"

# Feedback columns from analysis_evaluator.
# These are the keys analysis_evaluator writes into its "explanation" text,
# which process_experiment splits back into one column per key.
feedback_columns = [
    "correctness",
    "explanation",
    "confusion_reason",
    "error_type",
    "evidence_span",
    "prompt_fix_suggestion",
]

processed_experiment_data = process_experiment(
    experiment_id=EXPERIMENT_V4_ID, feedback_columns=feedback_columns
)

# Quick sanity check: row count and the first few rows
print(f"Processed {len(processed_experiment_data)} rows")
print(processed_experiment_data.head())

## Load Unoptimized Prompt


In [None]:
import os

from phoenix.client import Client

px_client = Client()

# REPLACE with the prompt version ID you want to optimize
# (the placeholder string below is not a valid ID)
PROMPT_VERSION_ID = "REPLACE_WITH_PROMPT_VERSION_ID"
unoptimized_prompt = px_client.prompts.get(prompt_version_id=PROMPT_VERSION_ID)

# Extract system prompt from messages[0]
# NOTE: this rebinds the `system_prompt` defined in Part 1, which
# handle_support_query reads at call time - calling the agent again after
# this cell uses this value instead of the original category list.
system_prompt = unoptimized_prompt._template["messages"][0]["content"][0]["text"]

print(f"Loaded system prompt ({len(system_prompt)} chars)")

## Optimize Prompt (Version 5)


In [None]:
from prompt_learning import PromptLearningOptimizer

# Initialize optimizer with existing system prompt
# (`os` is imported in an earlier cell; the key is read from the environment)
optimizer = PromptLearningOptimizer(
    prompt=system_prompt,
    model_choice="gpt-4o",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

# Run optimization on the processed experiment rows: "output" holds the
# classifier's predictions and feedback_columns the evaluator's feedback.
# NOTE(review): meaning/units of context_size_k are not visible here - confirm
# against the prompt-learning SDK.
optimized_system_prompt = optimizer.optimize(
    dataset=processed_experiment_data,
    output_column="output",
    feedback_columns=feedback_columns,
    context_size_k=90000,
)

# Preview only: the first 500 characters, with "..." appended regardless of length
print("\n" + "=" * 60)
print("OPTIMIZED PROMPT")
print("=" * 60)
print(optimized_system_prompt[:500] + "...")

In [None]:
from phoenix.client.types.prompts import PromptVersion

# Build new messages with optimized system prompt.
# The user message is the "{{query}}" placeholder; tasks built with
# create_task overwrite its text with each example's query.
optimized_messages = [
    {"role": "system", "content": [{"type": "text", "text": optimized_system_prompt}]},
    {"role": "user", "content": [{"type": "text", "text": "{{query}}"}]},
]

# Create new version with optimized prompt (model settings are carried over
# from the prompt that was optimized)
new_version = PromptVersion(
    optimized_messages,
    model_name=unoptimized_prompt._model_name,
    model_provider=unoptimized_prompt._model_provider,
    template_format="MUSTACHE",
    description="Optimized with Prompt Learning from V4 experiment",
)

# Preserve invocation parameters if any
if unoptimized_prompt._invocation_parameters:
    new_version._invocation_parameters = unoptimized_prompt._invocation_parameters

# Push to Phoenix
optimized_prompt = px_client.prompts.create(
    name="support-classifier",
    version=new_version,
)

print(f"✅ Created optimized prompt version: {optimized_prompt.id}")

## Measure New Prompt Version's Performance


In [None]:
from phoenix.client.experiments import async_run_experiment

print("ðŸ§ª Running experiment with Prompt Learning optimized prompt...")

experiment_optimized = await async_run_experiment(
    dataset=support_query_dataset,
    task=create_task(optimized_prompt),
    evaluators=[ground_truth_evaluator, analysis_evaluator],
    experiment_name="support-classifier-optimized",
)

print(f"\nâœ… Experiment completed: {experiment_optimized.id}")
print("ðŸ“Š View results: http://localhost:6006/experiments")

---
# 🎉 Summary

Congratulations! You've completed the Phoenix Prompts Tutorial!

**You've learned how to:**
- **Store and version prompts** in Phoenix's Prompt Hub
- **Create and upload datasets** to Phoenix from CSV files or DataFrames
- **Build custom evaluators** - both code-based and LLM-based with structured output
- **Run experiments** to test prompts at scale with automatic evaluation tracking
- **Compare prompt versions** side-by-side to measure the impact of changes
- **Optimize prompts with Prompt Learning** - using experiment feedback to automatically generate improvements
