<center>
    <p style="text-align:center">
        <img alt="phoenix logo" src="https://raw.githubusercontent.com/Arize-ai/phoenix-assets/9e6101d95936f4bd4d390efc9ce646dc6937fb2d/images/socal/github-large-banner-phoenix.jpg" width="1000"/>
        <br>
        <br>
        <a href="https://docs.arize.com/phoenix/">Docs</a>
        |
        <a href="https://github.com/Arize-ai/phoenix">GitHub</a>
        |
        <a href="https://join.slack.com/t/arize-ai/shared_invite/zt-1px8dcmlf-fmThhDFD_V_48oU7ALan4Q">Community</a>
    </p>
</center>
<h1 align="center">Tracing CrewAI with Arize Phoenix - Orchestrator Workers Workflow</h1>

In [None]:
%pip install -q  'arize-phoenix==11.24.0' opentelemetry-sdk opentelemetry-exporter-otlp crewai crewai_tools openinference-instrumentation-crewai

# Set up Keys and Dependencies

Note: For this colab you'll need:

*   OpenAI API key (https://openai.com/)
*   Serper API key (https://serper.dev/)
*   Phoenix API key (https://app.phoenix.arize.com/)

In [None]:
import getpass
import os

# Read the OpenAI and Serper keys from the environment. Each variable's own name
# doubles as the sentinel default, so an unset variable (or one still holding the
# placeholder text) triggers an interactive prompt below.
openai_key = os.getenv("OPENAI_API_KEY", "OPENAI_API_KEY")
serper_key = os.getenv("SERPER_API_KEY", "SERPER_API_KEY")

if openai_key == "OPENAI_API_KEY":
    openai_key = getpass.getpass("Please enter your OPENAI_API_KEY: ")

if serper_key == "SERPER_API_KEY":
    serper_key = getpass.getpass("Please enter your SERPER_API_KEY: ")

# Export the keys; later cells read OPENAI_API_KEY back from os.environ.
os.environ["OPENAI_API_KEY"] = openai_key
os.environ["SERPER_API_KEY"] = serper_key

# Prompt for the Phoenix key only when it is not already exported.
if "PHOENIX_API_KEY" not in os.environ:
    os.environ["PHOENIX_API_KEY"] = getpass.getpass("Enter your Phoenix API key: ")

os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={os.environ['PHOENIX_API_KEY']}"
# NOTE(review): the collector endpoint is set to an empty string, which also
# overwrites any value already exported. This looks like a placeholder to be
# replaced with your Phoenix collector endpoint before running — confirm.
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = ""

## Configure Tracing

In [None]:
from phoenix.otel import register

# Register tracing for the "crewai-agents" project. The returned provider is
# passed to the CrewAI instrumentor below, and the same project name is used
# later when the recorded spans are queried back.
tracer_provider = register(project_name="crewai-agents")

## Instrument CrewAI

In [None]:
from openinference.instrumentation.crewai import CrewAIInstrumentor

# Instrument CrewAI with the tracer provider registered above.
# NOTE(review): skip_dep_check=True presumably bypasses the instrumentor's
# installed-dependency check — confirm against the OpenInference docs.
CrewAIInstrumentor().instrument(skip_dep_check=True, tracer_provider=tracer_provider)

## Define your Agents

In [None]:
from crewai import Agent, Crew, Task

def _make_worker(role, goal, backstory):
    """Build a verbose worker agent from its role, goal and backstory."""
    return Agent(role=role, goal=goal, backstory=backstory, verbose=True)


# Worker agents: one specialist per part of the report.
trend_researcher = _make_worker(
    "AI Trend Researcher",
    "Analyze current advancements in AI",
    "Expert in tracking and analyzing new trends in artificial intelligence.",
)

policy_analyst = _make_worker(
    "AI Policy Analyst",
    "Examine the implications of AI regulations and governance",
    "Tracks AI policy developments across governments and organizations.",
)

risk_specialist = _make_worker(
    "AI Risk Specialist",
    "Identify potential risks in frontier AI development",
    "Focuses on safety, alignment, and misuse risks related to advanced AI.",
)

synthesizer = _make_worker(
    "Synthesis Writer",
    "Summarize all findings into a final cohesive report",
    "Expert at compiling research insights into executive-level narratives.",
)

# The orchestrator's goal spells out the exact JSON payload expected by the
# delegation tool. It is assembled line by line; empty entries produce the
# blank lines around the JSON example.
_ORCHESTRATOR_GOAL = "\n".join(
    [
        "Your job is to delegate research and writing tasks to the correct coworker using the 'Delegate work to coworker' tool.",
        "For each task you assign, you MUST call the tool with the following JSON input:",
        "",
        "{",
        '  "task": "Short summary of the task to do (plain string)",',
        '  "context": "Why this task is important or part of the report (plain string)",',
        '  "coworker": "One of: AI Trend Researcher, AI Policy Analyst, AI Risk Specialist, Synthesis Writer"',
        "}",
        "",
        "IMPORTANT:",
        "- Do NOT format 'task' or 'context' as dictionaries.",
        "- Do NOT include types or nested descriptions.",
        "- Only use plain strings for both.",
        "- Call the tool multiple times, one per coworker.",
    ]
)

# Unlike the workers, the orchestrator is created with delegation enabled.
orchestrator = Agent(
    role="Orchestrator",
    goal=_ORCHESTRATOR_GOAL,
    backstory="You are responsible for assigning each part of an AI report to the right specialist.",
    allow_delegation=True,
    verbose=True,
)

## Define your Tasks

In [None]:
# Define the initial task only for the orchestrator: a single top-level task
# whose description covers the whole report. It is assigned to the orchestrator
# rather than to any individual worker.
initial_task = Task(
    description="Create an AI trends report. It should include recent innovations, policy updates, and safety risks. Then synthesize it into a unified summary.",
    expected_output="Assign subtasks via the DelegateWorkTool and return a final report.",
    agent=orchestrator,
)

# Set up the crew (no hierarchical process needed with delegation tools).
# The four workers are listed as the crew's agents; the orchestrator is passed
# separately as manager_agent and is not part of the agents list.
crew = Crew(
    agents=[trend_researcher, policy_analyst, risk_specialist, synthesizer],
    tasks=[initial_task],
    manager_agent=orchestrator,
    verbose=True,
)

# Run the full workflow and print whatever kickoff() returns.
result = crew.kickoff()
print(result)

# Let's add some Evaluations (Evals)

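In this section we evaluate the agent's trajectory: the sequence of tool calls it made while working on the task. An LLM judge labels each trajectory as `correct` or `incorrect`.

See https://arize.com/docs/ax/evaluate/agent-trajectory-evaluations for background.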

In [None]:
import phoenix as px

# Fetch the spans recorded for the "crewai-agents" project as a DataFrame.
# NOTE(review): timeout=None presumably disables the client-side timeout — confirm.
df = px.Client().get_spans_dataframe(project_name="crewai-agents", timeout=None)
# Rows whose span_kind is "LLM".
llm_spans = df[df["span_kind"] == "LLM"]
# Trace IDs taken from spans that have no parent (i.e. root spans).
root_ids = df[df["parent_id"].isna()]["context.trace_id"].unique()
llm_spans.head()

In [None]:
# Prompt for the LLM judge that grades an agent's tool-call trajectory.
# The three placeholders ({tool_calls}, {attributes.input.value},
# {attributes.llm.tools}) are expected to match column names of the dataframe
# passed to llm_classify; the judge must answer with exactly `correct` or
# `incorrect`.
TRAJECTORY_ACCURACY_PROMPT_WITHOUT_REFERENCE = """
You are a helpful AI bot that checks whether an AI agent's internal trajectory is accurate and effective.

You will be given:
1. The agent's actual trajectory of tool calls
2. The input data from the user that the agent used to make a decision
3. The tool definitions that the agent used to make its tool calls

An accurate trajectory:
- Progresses logically from step to step
- Follows the golden trajectory where reasonable
- Shows a clear path toward completing a goal
- Is reasonably efficient (doesn't take unnecessary detours)

##

Actual Trajectory:
{tool_calls}

User Inputs:
{attributes.input.value}

Tool Definitions:
{attributes.llm.tools}

##

Your response must be a single string, either `correct` or `incorrect`, and must not include any additional text.

- Respond with `correct` if the agent's trajectory adheres to the rubric and accomplishes the task effectively.
- Respond with `incorrect` if the trajectory is confusing, misaligned with the goal, inefficient, or does not accomplish the task.
"""

In [None]:
from typing import Any, Dict

import pandas as pd


# Maps each supported operator string to a function that builds a boolean row
# mask from a column and the comparison value ("isna"/"notna" ignore the value).
_FILTER_OPERATORS = {
    ">=": lambda series, value: series >= value,
    "<=": lambda series, value: series <= value,
    "==": lambda series, value: series == value,
    "!=": lambda series, value: series != value,
    "contains": lambda series, value: series.str.contains(value, case=False, na=False),
    "isna": lambda series, value: series.isna(),
    "notna": lambda series, value: series.notna(),
}


def _apply_filters(frame: pd.DataFrame, filters: Dict[str, Dict[str, Any]]) -> pd.DataFrame:
    """Return the rows of ``frame`` that satisfy every criterion in ``filters``.

    Unknown columns and unsupported operators are reported with a warning and
    skipped, so they never remove rows.
    """
    for column, criteria in filters.items():
        if column not in frame.columns:
            print(f"Warning: Column '{column}' not found in dataframe")
            continue

        for operator, value in criteria.items():
            build_mask = _FILTER_OPERATORS.get(operator)
            if build_mask is None:
                print(f"Warning: Unsupported operator '{operator}' - skipping")
                continue
            # Boolean indexing yields a new frame; the caller's frame is not mutated.
            frame = frame[build_mask(frame[column], value)]

    return frame


def filter_spans_by_trace_criteria(
    df: pd.DataFrame,
    trace_filters: Dict[str, Dict[str, Any]],
    span_filters: Dict[str, Dict[str, Any]],
) -> pd.DataFrame:
    """Filter spans based on trace-level and span-level criteria.

    A trace is selected when at least one of its spans satisfies every
    criterion in ``trace_filters``. All spans of the selected traces are then
    narrowed down with ``span_filters``.

    Args:
        df: DataFrame with trace data; must contain a "context.trace_id" column.
        trace_filters: Dictionary of column names and filtering criteria for traces
                      Format: {"column_name": {"operator": value}}
                      Supported operators: ">=", "<=", "==", "!=", "contains", "notna", "isna"
        span_filters: Dictionary of column names and filtering criteria for spans
                     Format: {"column_name": {"operator": value}}
                     Same supported operators as trace_filters

    Returns:
        DataFrame with filtered spans from traces that match trace_filters.
        When no trace matches, an empty DataFrame that keeps the columns of
        ``df`` is returned, so downstream column access does not fail. The
        result is an independent copy, so columns can be added to it safely.
    """
    trace_id_column = "context.trace_id"
    print(f"Total traces: {len(df[trace_id_column].unique())}")

    # Trace selection: any span passing all trace filters selects its whole trace.
    matching_trace_ids = set(_apply_filters(df, trace_filters)[trace_id_column].unique())
    print(f"Found {len(matching_trace_ids)} traces matching trace criteria")

    if not matching_trace_ids:
        print("No matching traces found")
        return df.iloc[0:0].copy()

    # Span selection: start from every span of the selected traces.
    spans_in_matching_traces = df[df[trace_id_column].isin(matching_trace_ids)]
    result_df = _apply_filters(spans_in_matching_traces, span_filters).copy()

    print(f"Final result: {len(result_df)} spans from {len(matching_trace_ids)} traces")
    return result_df


def extract_tool_calls(output_messages):
    """Collect the names of all tool calls found in a span's output messages.

    Args:
        output_messages: Iterable of message mappings. A message may carry a
            "message.tool_calls" entry whose items hold the tool name under
            "tool_call.function.name". Missing values (None, or the float NaN
            pandas uses for empty cells) are accepted.

    Returns:
        A list of ``{"name": <tool name>}`` dicts in message order; empty when
        there are no messages or no tool calls.
    """
    # Missing cells arrive as None or NaN (a float); numbers never hold messages.
    # An explicit type check is used instead of truthiness because array-like
    # inputs do not support an unambiguous `not` test.
    if output_messages is None or isinstance(output_messages, (int, float)):
        return []

    return [
        {"name": tool_call["tool_call.function.name"]}
        for message in output_messages
        if "message.tool_calls" in message
        # A message may carry the key with a None value; treat that as no calls.
        for tool_call in (message["message.tool_calls"] or ())
    ]

In [None]:
from typing import Any, Dict

import pandas as pd

# Select traces containing at least one span whose name contains "agent"
# (case-insensitive), then keep only the LLM spans of those traces.
eval_traces = filter_spans_by_trace_criteria(
    df=df,
    trace_filters={"name": {"contains": "agent"}},
    span_filters={"attributes.openinference.span.kind": {"==": "LLM"}},
)

eval_traces.head()

In [None]:
# Add a `tool_calls` column holding the tool names extracted from each span's
# output messages. `assign` builds a new frame instead of writing into a
# filtered one, so re-running the cell is safe.
eval_traces = eval_traces.assign(
    tool_calls=eval_traces["attributes.llm.output_messages"].apply(extract_tool_calls)
)

# Keep only spans that recorded tool definitions; the evaluation prompt fills
# its {attributes.llm.tools} placeholder from this column.
full_eval_spans = eval_traces[eval_traces["attributes.llm.tools"].notna()]
full_eval_spans.head()

In [None]:
import nest_asyncio

from phoenix.evals import OpenAIModel, llm_classify
from phoenix.trace import suppress_tracing

# NOTE(review): presumably needed so llm_classify can run its concurrent
# requests inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

# Judge model: gpt-4o-mini at temperature 0.0, using the OpenAI key exported
# in the setup cell.
model = OpenAIModel(
    api_key=os.environ["OPENAI_API_KEY"],
    model="gpt-4o-mini",
    temperature=0.0,
)

# The only labels the judge may return; they match the two answers the prompt
# template asks for.
rails = ["correct", "incorrect"]

# NOTE(review): suppress_tracing presumably keeps the judge's own LLM calls out
# of the traced project — confirm against the Phoenix docs.
with suppress_tracing():
    eval_results = llm_classify(
        dataframe=full_eval_spans,
        template=TRAJECTORY_ACCURACY_PROMPT_WITHOUT_REFERENCE,
        model=model,
        rails=rails,
        provide_explanation=True,
        verbose=False,
        concurrency=20,
    )

# Numeric score: 1 for "correct", 0 for any other label.
eval_results["score"] = eval_results["label"].apply(lambda x: 1 if x == "correct" else 0)

In [None]:
import pandas as pd

# Join each evaluated span with its evaluation result; both frames are aligned
# on their index.
merged_df = pd.merge(full_eval_spans, eval_results, left_index=True, right_index=True)

# NOTE(review): this relabels the *trace* ID column as "context.span_id", so the
# column then holds trace IDs under a span-ID name (and may duplicate an existing
# "context.span_id" column). Verify this is intended before relying on it when
# logging span evaluations.
merged_df.rename(columns={"context.trace_id": "context.span_id"}, inplace=True)

merged_df.head()

In [None]:
from phoenix.trace import SpanEvaluations

# Send the merged evaluation results to Phoenix under the evaluation name
# "Agent Trajectory Accuracy".
px.Client().log_evaluations(
    SpanEvaluations(
        dataframe=merged_df,
        eval_name="Agent Trajectory Accuracy",
    )
)

### Check your Phoenix project to view the traces and spans from your runs.