In [None]:
# Prompt used to synthesize the evaluation questions. It asks the model for 20
# deliberately ambiguous customer-service questions, one per line, each
# followed by a double colon (::) and a comma-separated list of the tools
# expected to be called right after the question. Nothing follows the "::"
# when no immediate tool call is expected.
# The whole string is sent verbatim as the user message, so any edit to the
# wording changes what the model is asked to generate.
GEN_TEMPLATE = """
You are an assistant that generates complex customer service questions. You will try to answer the question with the tool if possible,
do your best to answer, ask for more information only if needed.
The questions should often involve:

Please reference the product names, the product details, product IDS and product information.

Multiple Categories: Questions that could logically fall into more than one category (e.g., combining product details with a discount code).
Vague Details: Questions with limited or vague information that require clarification to categorize correctly.
Mixed Intentions: Queries where the customer’s goal or need is unclear or seems to conflict within the question itself.
Indirect Language: Use of indirect or polite phrasing that obscures the direct need or request (e.g., using "I was wondering if..." or "Perhaps you could help me with...").
For specific categories:

track_package: Include vague timing references (e.g., "recently" or "a while ago") instead of specific dates.
product_comparison: Include generic descriptors without specific product names or IDs (e.g., "high-end smartphones" or "energy-efficient appliances").
product_search: Include generic descriptors without specific product names or IDs (e.g., "high-end smartphones" or "energy-efficient appliances").
apply_discount_code: Include questions about discounts that might apply to hypothetical or past situations, or without mentioning if they have made a purchase.
product_details: Ask for comparisons or details that involve multiple products or categories ambiguously (e.g., "Tell me about your range of electronics that are good for home office setups").
customer_support: Get contact information for customer support regarding an issue.

Examples of More Challenging Questions
Multiple Categories

"I recently bought a samsung 106i smart phone, and I was wondering if there's a way to check what deals I might have missed or if my order is on its way?"
"Could you tell me if the samsung 15H adapater in my last order are covered under warranty and if they have shipped yet?"
Vague Details

"There's an issue with one of the Vizio 14Y TV I think I bought last month—what should I do?"
"I need help with a iPhone 16H I ordered, or maybe I'm just looking for something new. Can you help?"
Mixed Intentions

"I'm not sure if I should ask for a refund or just find out when it will arrive. What do you suggest?"
"Could you help me decide whether to upgrade my product or just track the current one?"
Indirect Language

"I was wondering if you might assist me in figuring out a problem I have with an order, or maybe it's more of a query?"
"Perhaps you could help me understand the benefits of your premium products compared to the regular ones?"

Some questions should be straightforward uses of the provided functions

Respond with a list, one question per line. Do not include any numbering at the beginning of each line. Do not include any category headings.
Generate 20 questions. After each question, you must include a double colon (::) followed by a comma-separated list of the tools that you would expect to be called immediately after the question is asked.
This list should be empty if the question should not be answered immediately with a tool call.
"""

In [None]:
from openai import OpenAI
from openai.types.chat import ChatCompletionUserMessageParam

# One-shot generation request: the whole GEN_TEMPLATE prompt goes out as a
# single user message and the text of the first choice is kept for parsing.
llm_client = OpenAI()

generation_messages = [ChatCompletionUserMessageParam(content=GEN_TEMPLATE, role="user")]
response = llm_client.chat.completions.create(
    messages=generation_messages,
    model="gpt-4o",
)
first_choice = response.choices[0]
response_content = first_choice.message.content
# Stop here if the reply is empty or missing.
assert response_content
# Bare expression: evaluated for display only.
response_content

In [None]:
import pandas as pd

# Do not truncate long cell contents when the DataFrame is displayed.
pd.set_option("display.max_colwidth", None)


# Each non-blank line of the model's reply is expected to have the form
#     <question> :: <tool_a>, <tool_b>
# with nothing after the "::" when no tool call is expected.
questions = []
expected_tool_calls = []
for line_number, line in enumerate(response_content.strip().split("\n"), start=1):
    if not line.strip():
        # Blank separator lines carry no question; skip them instead of failing.
        continue
    parts = line.split("::")
    if len(parts) != 2:
        raise ValueError(
            f"Line {line_number} is not of the form '<question> :: <tools>': {line!r}"
        )
    question_text, tool_field = parts
    questions.append(question_text.strip())
    # Drop empty names so that an empty tool field yields [] rather than [""].
    expected_tool_calls.append([name.strip() for name in tool_field.split(",") if name.strip()])
df = pd.DataFrame({"question": questions, "expected_tool_calls": expected_tool_calls})
# Bare expression: evaluated for display only.
df

In [None]:
from openinference.instrumentation.openai import OpenAIInstrumentor

from phoenix.otel import register

# Register a tracer provider that targets the local traces endpoint, then
# instrument the OpenAI client library with that provider.
tracer_provider = register(endpoint="http://127.0.0.1:6006/v1/traces")
openai_instrumentor = OpenAIInstrumentor(tracer_provider=tracer_provider)
openai_instrumentor.instrument()

In [None]:
from openai import OpenAI

# Synchronous OpenAI client, constructed with default arguments.
# NOTE(review): an identical client is already created in the
# question-generation cell; presumably repeated so this part of the notebook
# can be run on its own - confirm.
llm_client = OpenAI()

In [None]:
from openai.types.chat import ChatCompletionToolParam

# Tool schema: compare two products. Both product IDs are required strings.
_product_comparison_parameters: dict[str, object] = {
    "type": "object",
    "properties": {
        "product_a_id": {
            "type": "string",
            "description": "The unique identifier of Product A.",
        },
        "product_b_id": {
            "type": "string",
            "description": "The unique identifier of Product B.",
        },
    },
    "required": ["product_a_id", "product_b_id"],
}
product_comparison_tool: ChatCompletionToolParam = {
    "function": {
        "name": "product_comparison",
        "description": "Compare features of two products.",
        "parameters": _product_comparison_parameters,
    },
    "type": "function",
}
# Tool schema: product search. Only "query" is required; the remaining
# properties are optional filters and paging controls that declare defaults.
_product_search_properties: dict[str, object] = {
    "query": {"type": "string", "description": "The search query string."},
    "category": {
        "type": "string",
        "description": "The category to filter the search.",
        "default": None,
    },
    "min_price": {
        "type": "number",
        "description": "The minimum price of the products to search.",
        "default": 0,
    },
    "max_price": {
        "type": "number",
        "description": "The maximum price of the products to search.",
        "default": None,
    },
    "page": {
        "type": "integer",
        "description": "The page number for pagination.",
        "default": 1,
    },
    "page_size": {
        "type": "integer",
        "description": "The number of results per page.",
        "default": 20,
    },
}
product_search_tool: ChatCompletionToolParam = {
    "function": {
        "name": "product_search",
        "description": "Search for products based on criteria.",
        "parameters": {
            "type": "object",
            "properties": _product_search_properties,
            "required": ["query"],
        },
    },
    "type": "function",
}
# Tool schema: look up customer-support contact information. The single
# required argument is a free-form string naming the type of issue.
_customer_support_parameters: dict[str, object] = {
    "type": "object",
    "properties": {
        "issue_type": {
            "type": "string",
            "description": "The type of issue (e.g., billing, technical support).",
        }
    },
    "required": ["issue_type"],
}
customer_support_tool: ChatCompletionToolParam = {
    "function": {
        "name": "customer_support",
        "description": "Get contact information for customer support regarding an issue.",
        "parameters": _customer_support_parameters,
    },
    "type": "function",
}
# Tool schema: package tracking by tracking number (the only, required argument).
# NOTE(review): the tracking number is declared as an integer; confirm that
# alphanumeric tracking numbers never need to be supported.
_track_package_parameters: dict[str, object] = {
    "type": "object",
    "properties": {
        "tracking_number": {
            "type": "integer",
            "description": "The tracking number of the package.",
        }
    },
    "required": ["tracking_number"],
}
track_package_tool: ChatCompletionToolParam = {
    "function": {
        "name": "track_package",
        "description": "Track the status of a package based on the tracking number.",
        "parameters": _track_package_parameters,
    },
    "type": "function",
}
# Tool schema: fetch the details of one product, identified by a required
# string product id.
_product_details_parameters: dict[str, object] = {
    "type": "object",
    "properties": {
        "product_id": {"type": "string", "description": "The id of a product to look up."}
    },
    "required": ["product_id"],
}
product_details_tool: ChatCompletionToolParam = {
    "function": {
        "name": "product_details",
        "description": "Returns details for a given product id",
        "parameters": _product_details_parameters,
    },
    "type": "function",
}
# Tool schema: apply a discount code to an existing order. Both the integer
# order id and the string discount code are required.
apply_discount_code_tool: ChatCompletionToolParam = {
    "function": {
        "name": "apply_discount_code",
        "description": "Applies the discount code to a given order.",
        "parameters": {
            "type": "object",
            "properties": {
                "order_id": {
                    "type": "integer",
                    "description": "The id of the order to apply the discount code to.",
                },
                "discount_code": {"type": "string", "description": "The discount code to apply"},
            },
            # One entry per required property. A single "order_id, discount_code"
            # string would name a property that does not exist.
            "required": ["order_id", "discount_code"],
        },
    },
    "type": "function",
}
# Tool schemas offered to the model, and serialized into the evaluation prompt.
# Only three of the six schemas defined above are enabled; track_package,
# product_details and apply_discount_code are commented out.
# NOTE(review): presumably deliberate, so that questions expecting a disabled
# tool exercise the "tool unavailable" case - confirm.
tools = [
    product_comparison_tool,
    product_search_tool,
    customer_support_tool,
    # track_package_tool,
    # product_details_tool,
    # apply_discount_code_tool,
]

In [None]:
from openinference.instrumentation import using_metadata

from phoenix.trace import using_project

# Send every generated question to the model with the tool schemas attached,
# one request per question, inside the "customer-support-tools" project
# context. Each request runs under `using_metadata` carrying that question's
# expected tool calls. All raw responses are collected in `responses`.
# NOTE(review): the stop sequence looks like a placeholder that is not meant
# to ever match - confirm.
sampling_options = {
    "temperature": 0.1,
    "max_tokens": 2048,
    "frequency_penalty": 0.1,
    "presence_penalty": 0.2,
    "stop": ["supercalafragilisticexpialidocious"],
    "top_p": 0.9,
    "seed": 12,
}

with using_project("customer-support-tools"):
    responses = []
    for question_text, expected in zip(df["question"], df["expected_tool_calls"]):
        request_metadata = {"expected_tool_calls": expected}
        with using_metadata(metadata=request_metadata):
            response = llm_client.chat.completions.create(
                model="gpt-4o",
                messages=[{"content": question_text, "role": "user"}],
                tools=tools,
                **sampling_options,
            )
            responses.append(response)

In [None]:
import phoenix as px

# Upload the generated questions as a dataset: one example per question, with
# the question as the input and its expected tool calls as the output.
phoenix_client = px.Client()

example_inputs = [{"question": text} for text in questions]
example_outputs = [{"expected_tool_calls": calls} for calls in expected_tool_calls]
dataset = phoenix_client.upload_dataset(
    dataset_name="customer-support-dataset",
    inputs=example_inputs,
    outputs=example_outputs,
)

In [None]:
import json
from typing import Any

from openai import AsyncOpenAI

from phoenix.evals.default_templates import TOOL_CALLING_BASE_TEMPLATE

# Async OpenAI client used for the evaluation (LLM-as-judge) request.
async_llm_client = AsyncOpenAI()

# NOTE: the string literal below is a bare expression statement. It is not
# assigned to any name and has no effect at runtime. The evaluator formats the
# imported TOOL_CALLING_BASE_TEMPLATE, not this text.
# NOTE(review): presumably kept as a readable reference copy of that template -
# confirm the two still match.
"""
You are an evaluation assistant evaluating questions and tool calls to
determine whether the tool called would answer the question. The tool
calls have been generated by a separate agent, and chosen from the list of
tools provided below. It is your job to decide whether that agent chose
the right tool to call.

    [BEGIN DATA]
    ************
    [Question]: {question}
    ************
    [Tool Called]: {tool_call}
    [END DATA]

Your response must be single word, either "correct" or "incorrect",
and should not contain any text or characters aside from that word.
"incorrect" means that the chosen tool would not answer the question,
the tool includes information that is not presented in the question,
or that the tool signature includes parameter values that don't match
the formats specified in the tool signatures below.

"correct" means the correct tool call was chosen, the correct parameters
were extracted from the question, the tool call generated is runnable and correct,
and that no outside information not present in the question was used
in the generated question.

    [Tool Definitions]: {tool_definitions}
"""


async def evaluate_tool_call_correctness(input: dict[str, Any], output: dict[str, Any]) -> str:
    """Ask an LLM judge whether the single tool call in ``output`` fits the question.

    The question, the JSON-serialized tool call and the JSON-serialized
    ``tools`` definitions are substituted into ``TOOL_CALLING_BASE_TEMPLATE``
    and sent to ``gpt-4o`` as one user message.

    Args:
        input: Example input; must contain a ``"question"`` key.
        output: Task output. NOTE(review): assumed to look like
            ``{"messages": [{"tool_calls": [...]}]}`` - confirm against the
            task that produced the experiment runs.

    Returns:
        ``"tool_not_called"`` when ``output`` does not hold exactly one message
        carrying exactly one tool call, or when the judge's reply is not one of
        the values of ``TOOL_CALLING_PROMPT_RAILS_MAP``. Otherwise the judge's
        label, whitespace-stripped and lower-cased.

    Raises:
        ValueError: If the judge's reply has no text content.
    """
    # `or []` covers keys that are present but set to None, which would
    # otherwise crash `len()`.
    messages = output.get("messages") or []
    if len(messages) != 1:
        return "tool_not_called"
    tool_calls = messages[0].get("tool_calls") or []
    if len(tool_calls) != 1:
        return "tool_not_called"

    # The rails map is not imported at module level in this file. Import it
    # here, after the cheap structural checks, so it is in scope where used.
    from phoenix.evals.default_templates import TOOL_CALLING_PROMPT_RAILS_MAP

    evaluation_prompt = TOOL_CALLING_BASE_TEMPLATE.format(
        question=input["question"],
        tool_call=json.dumps(tool_calls[0]),
        tool_definitions=json.dumps(tools),
    )
    # Deliberately no `tools=` argument: the definitions are already embedded
    # in the prompt, and offering callable tools would let the judge answer
    # with a tool call (and no text) instead of the one-word label.
    response = await async_llm_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"content": evaluation_prompt, "role": "user"}],
    )
    response_content = response.choices[0].message.content
    if not isinstance(response_content, str):
        raise ValueError("Evaluation model returned no text content.")
    # Tolerate surrounding whitespace and capitalization in the one-word reply.
    label = response_content.strip().lower()
    if label not in TOOL_CALLING_PROMPT_RAILS_MAP.values():
        return "tool_not_called"
    return label

In [None]:
import phoenix as px
from phoenix.experiments import evaluate_experiment

# Run the tool-call evaluator over a fixed set of previously recorded
# experiments. The IDs are hard-coded; each is the base64 encoding of the text
# shown next to it.
phoenix_client = px.Client()

experiment_ids = [
    "RXhwZXJpbWVudDoyMDU=",  # "Experiment:205"
    "RXhwZXJpbWVudDoyMDY=",  # "Experiment:206"
    "RXhwZXJpbWVudDoyMDc=",  # "Experiment:207"
]
for experiment_id in experiment_ids:
    experiment = phoenix_client.get_experiment(experiment_id=experiment_id)
    evaluate_experiment(
        experiment,
        evaluators=[evaluate_tool_call_correctness],
    )