System Prompt:

You are a chatbot that answers customer questions for an e-commerce website. You have tools at your disposal to search for products, compare products, and escalate to a human customer support agent. Answer only when you're confident that your answer is correct and don't make up information. Ask for more information if needed.

In [None]:
import json
from typing import Any

import nest_asyncio
import pandas as pd
from openai import AsyncOpenAI, OpenAI
from openai.types.chat import ChatCompletionToolParam, ChatCompletionUserMessageParam
from openinference.instrumentation import using_metadata
from openinference.instrumentation.openai import OpenAIInstrumentor

import phoenix as px
from phoenix.experiments import evaluate_experiment
from phoenix.otel import register
from phoenix.trace import using_project

# Patch asyncio so an event loop that is already running can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a
# loop while the async evaluator below is driven from inside it — confirm.
nest_asyncio.apply()

# Show the full text of every DataFrame cell instead of truncating long values.
pd.set_option("display.max_colwidth", None)

In [None]:
# Function-calling schemas for the three tools the chatbot may invoke.
# Each entry is a chat-completions tool definition: a "function" with a name,
# a natural-language description, and a JSON Schema describing its arguments.

# Looks up one specific product by its name.
product_details_tool: ChatCompletionToolParam = {
    "type": "function",
    "function": {
        "name": "product_details",
        "description": "Searches for a product by name and returns important details such as price and availability",
        "parameters": {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "The name of the product being searched"}
            },
            # The lookup is keyed on the product name, so a call without it
            # cannot be executed; mark it required so the model must supply it.
            "required": ["name"],
        },
    },
}
# Free-text catalogue search with optional category / price filters.
product_search_tool: ChatCompletionToolParam = {
    "function": {
        "name": "product_search",
        "description": 'Searches for products by generic descriptions without specific product names (e.g., "high-end smartphones" or "energy-efficient appliances")',
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "The search query string."},
                "category": {
                    "type": "string",
                    "description": "The category to filter the search.",
                    # Serialized by json.dumps as JSON null.
                    "default": None,
                },
                "min_price": {
                    "type": "number",
                    "description": "The minimum price of the products to search.",
                    "default": 0,
                },
                "max_price": {
                    "type": "number",
                    "description": "The maximum price of the products to search.",
                    # Serialized by json.dumps as JSON null.
                    "default": None,
                },
            },
            "required": ["query"],
        },
    },
    "type": "function",
}
# Hands the conversation over to a human agent.
customer_support_tool: ChatCompletionToolParam = {
    "function": {
        "name": "customer_support",
        "description": "Escalates to a human customer support agent",
        "parameters": {
            "type": "object",
            "properties": {
                "issue_type": {
                    "type": "string",
                    "enum": ["billing", "account_management", "technical_support", "other"],
                    "description": "The type of issue",
                },
                "issue_description": {
                    "type": "string",
                    "description": "A description of the issue",
                },
            },
            "required": ["issue_type", "issue_description"],
        },
    },
    "type": "function",
}
# The full tool set, in the order it is sent to the model and embedded in prompts.
tools = [
    product_details_tool,
    product_search_tool,
    customer_support_tool,
]
print(json.dumps(tools, indent=2))

In [None]:
# Prompt asking the model to invent ten customer questions. Each output line
# has the form "<question> :: <comma-separated expected tool names>"; the
# {tools} placeholder is filled with the JSON tool definitions.
generate_questions_prompt_template = """
You are an assistant that generates synthetic questions for a customer service chatbot on an e-commerce website.
The chatbot has access to a variety of tools described below.
You should generate questions that a user of the e-commerce site might ask of the chatbot service.
Your questions should contain some that can be answered with the tools provided, and some that cannot.

The tools available to the chatbot are:

{tools}

Your questions should contain a mix of the following:

- direct questions: straightforward questions that can naturally be answered with a single tool call, or that do not require any tools at all
- multiple categories: questions that naturally require more than one of the tools provided
- vague details: questions with limited or vague information that require clarification to categorize correctly
- mixed intentions: queries where the customer's goal or need is unclear or seems to conflict within the question itself

Respond with a list of ten questions with one question per line.
Do not include any numbering at the beginning of each line or any category headings.
After each question, you must include a double colon (::) followed by a comma-separated list of the tools that you would expect to be called immediately after the question is asked.
This list must be empty if the question should not be answered with a tool call.
"""
llm_client = OpenAI()
generate_questions_prompt = generate_questions_prompt_template.format(tools=json.dumps(tools))
response = llm_client.chat.completions.create(
    messages=[ChatCompletionUserMessageParam(content=generate_questions_prompt, role="user")],
    model="gpt-4o",
)
response_content = response.choices[0].message.content
if not response_content:
    # Explicit check instead of `assert`: it is not stripped under `python -O`
    # and it states what went wrong. The next cell parses this text.
    raise ValueError("the model returned no content to parse questions from")
print(response_content)

In [None]:
# Parse the "<question> :: <tool>, <tool>" lines returned by the generation
# prompt into two parallel lists, then show them side by side in a DataFrame.
questions = []
expected_tool_calls = []
for line in response_content.strip().splitlines():
    if not line.strip():
        # Tolerate blank separator lines between questions.
        continue
    parts = line.split("::")
    if len(parts) != 2:
        raise ValueError(f"expected exactly one '::' separator in line: {line!r}")
    questions.append(parts[0].strip())
    # "".split(",") yields [""], so drop blank names: a question that expects
    # no tool call must be recorded as [] rather than [""].
    expected_tool_calls.append([name.strip() for name in parts[1].split(",") if name.strip()])
df = pd.DataFrame({"question": questions, "expected_tool_calls": expected_tool_calls})
df

In [None]:
# Create a tracer provider that exports to the trace endpoint below
# (port 6006 on localhost — assumed to be a Phoenix server that is already running).
tracer_provider = register(endpoint="http://127.0.0.1:6006/v1/traces")
# Instrument the OpenAI client library with that provider so the calls made
# in the following cells are traced.
OpenAIInstrumentor(tracer_provider=tracer_provider).instrument()

In [None]:
# Send every generated question to the chatbot model under the "e-commerce"
# project, attaching the tools each question is expected to trigger as
# metadata on the call, and keep the raw responses.
responses = []
with using_project("e-commerce"):
    for _, row in df.iterrows():
        expected_metadata = {"expected_tool_calls": row["expected_tool_calls"]}
        user_message = {"content": row["question"], "role": "user"}
        with using_metadata(metadata=expected_metadata):
            response = llm_client.chat.completions.create(
                messages=[user_message],
                model="gpt-4o",
                tools=tools,
                max_tokens=2048,
                temperature=0.1,
                top_p=0.9,
                frequency_penalty=0.1,
                presence_penalty=0.2,
                # NOTE(review): presumably a placeholder stop sequence that is
                # never expected to occur in real output — confirm intent.
                stop=["supercalafragilisticexpialidocious"],
            )
        responses.append(response)

In [None]:
# Store the generated questions and their expected tool calls as the
# "e-commerce" dataset: one example per question, with the question as the
# input and the expected tool names as the reference output.
phoenix_client = px.Client()
dataset = phoenix_client.upload_dataset(
    inputs=[{"question": text} for text in questions],
    outputs=[{"expected_tool_calls": names} for names in expected_tool_calls],
    dataset_name="e-commerce",
)

## Evaluation

In [None]:
# Async OpenAI client used by the `judged_correct` evaluator for its judge calls.
async_llm_client = AsyncOpenAI()

# Prompt for the LLM judge. `judged_correct` fills in three placeholders:
#   {question}          - the customer question from the dataset example
#   {tool_calls}        - JSON dump of the tool calls the chatbot produced
#   {tool_definitions}  - JSON dump of the tool schemas the chatbot could choose from
# The judge is instructed to reply with exactly one word, "correct" or "incorrect".
tool_calling_evaluation_prompt_template = """
You are an evaluation assistant evaluating questions and tool calls to
determine whether the tool(s) called answer the question. The tool
calls have been generated by a separate agent, and chosen from the list of
tools provided below. It is your job to decide whether that agent chose
the right tool(s) to call.

    [BEGIN DATA]
    ************
    [Question]: {question}
    ************
    [Tools Called]: {tool_calls}
    [END DATA]

Your response must be single word, either "correct" or "incorrect",
and should not contain any text or characters aside from that word.
"incorrect" means that the chosen tool would not answer the question,
the tool includes information that is not presented in the question,
or that the tool signature includes parameter values that don't match
the formats specified in the tool signatures below.

"correct" means the correct tool call(s) were chosen, the correct parameters
were extracted from the question, the tool call generated is runnable and correct,
and that no outside information not present in the question was used
in the generated question.

    [Tool Definitions]: {tool_definitions}
"""


async def judged_correct(input: dict[str, Any], output: dict[str, Any]) -> str:
    """Ask an LLM judge whether the tool calls made for a question were appropriate.

    The parameter names are part of the interface and are kept as-is, even
    though ``input`` shadows the builtin.

    Args:
        input: Dataset example input; must contain the "question" text.
        output: Experiment output; must contain a "messages" collection holding
            exactly one message, whose optional "tool_calls" are judged.

    Returns:
        "correct" or "incorrect", as decided by the judge model.

    Raises:
        ValueError: If ``output`` does not hold exactly one message, if the
            judge returns no text, or if its reply is not one of the two
            allowed labels.
    """
    if len(messages := output.get("messages", {})) != 1:
        raise ValueError("expected exactly one message")
    # A message that made no tool calls may omit the key or carry an explicit
    # None; both are treated as "no tool calls".
    tool_calls = messages[0].get("tool_calls") or []
    evaluation_prompt = tool_calling_evaluation_prompt_template.format(
        question=input["question"],
        tool_calls=json.dumps(tool_calls),
        tool_definitions=json.dumps(tools),
    )
    # The judge is deliberately NOT given `tools`: the definitions are already
    # embedded in the prompt text, and offering them as callable tools lets the
    # model answer with a tool call (content=None) instead of a verdict.
    response = await async_llm_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"content": evaluation_prompt, "role": "user"}],
    )
    response_content = response.choices[0].message.content
    if not isinstance(response_content, str):
        raise ValueError(f"expected a text verdict from the judge, got {response_content!r}")
    response_content = response_content.strip()
    # Accept harmless formatting noise such as `Correct`, `"correct"` or `correct.`.
    verdict = response_content.strip("\"'.").lower()
    if verdict not in ("correct", "incorrect"):
        raise ValueError(f"expected 'correct' or 'incorrect', got {response_content}")
    return verdict

In [None]:
def matches_expected(output: dict[str, Any], expected: dict[str, Any]) -> str:
    """Compare the set of tools the chatbot called with the expected set.

    Order and repetition are ignored; only the tool names are compared.

    Args:
        output: Experiment output; must contain a "messages" collection holding
            exactly one message, whose optional "tool_calls" entries each carry
            the tool name under ``["function"]["name"]``.
        expected: Reference output; must contain "expected_tool_calls", a list
            of tool names.

    Returns:
        "matches" if both sets of names are equal, otherwise "does not match".

    Raises:
        ValueError: If ``output`` does not hold exactly one message.
    """
    if len(messages := output.get("messages", {})) != 1:
        raise ValueError("expected exactly one message")
    # The "tool_calls" key may be absent or explicitly None when the model
    # answered in plain text; both mean "no tool calls".
    called = {
        tool_call.get("function", {}).get("name")
        for tool_call in messages[0].get("tool_calls") or []
    }
    # Blank names are ignored: splitting an empty tool list on "," produces
    # [""], which stands for "no tool expected" and must equal an empty set.
    wanted = {name.strip() for name in expected["expected_tool_calls"] if name and name.strip()}
    return "matches" if called == wanted else "does not match"

In [None]:
# Fresh client for fetching the experiments to evaluate.
phoenix_client = px.Client()

# Hard-coded identifiers of the experiments to score.
# NOTE(review): these refer to experiments on the author's Phoenix server —
# replace them with the IDs of your own experiments before running.
experiment_ids = [
    "RXhwZXJpbWVudDoyMjM=",
    "RXhwZXJpbWVudDoyMjQ=",
    "RXhwZXJpbWVudDoyMjU=",
]
# Fetch each experiment and run both evaluators on it: the exact-match check
# against the expected tool names and the LLM-judge check.
for experiment_id in experiment_ids:
    experiment = phoenix_client.get_experiment(experiment_id=experiment_id)
    evaluate_experiment(experiment, evaluators=[matches_expected, judged_correct])