## Measuring Tool Call Choice

You can build many tools. But calling all tools for all queries would be slow (and possibly harmful if they have side effects).

We benchmark tool retrieval with synthetic questions much like how we measured document retrieval in week 1. This section shows how to create the tool retrieval benchmark, and we'll improve retrieval in week 4.


## Load Products

We will test tool recall on a sample of our product inventory. We load these products below.

In [1]:
import asyncio
from typing import List, Union
from pydantic import BaseModel
import instructor
from openai import AsyncOpenAI, OpenAI
import lancedb
import random


# A product as used throughout this notebook: the two text fields that are
# read from the "products" table and later interpolated into the LLM prompts.
class Product(BaseModel):
    title: str  # short product name
    description: str  # free-text description of the product


# Load the product table from the LanceDB directory under
# ../week1_bootstrap_evals and convert every row into a `Product`.
try:
    db = lancedb.connect("../week1_bootstrap_evals/lancedb")
    # Keep the DataFrame under its own name so `products` is only ever the
    # final list of Product objects.
    products_df = db.open_table("products").to_pandas()[["title", "description"]]
    products = [
        Product(title=row["title"], description=row["description"])
        for _, row in products_df.iterrows()
    ]
except Exception as e:
    print(
        "Error loading product data. Run the week1 course notebooks first to create the products DB"
    )
    print(f"Error: {e}")
    # Re-raise after printing the hint: swallowing the error would leave
    # `products` undefined and the next statement would fail with an
    # unrelated, confusing error instead of the real cause.
    raise

# Display up to three random products; the size is capped because
# random.sample raises ValueError when asked for more items than exist.
random.sample(products, min(3, len(products)))

[Product(title='Electric Screwdriver', description='Featuring a built-in LED light, this electric screwdriver allows for visibility in dimly lit areas. It also has an automatic spindle lock for easy bit changes, enhancing productivity during tasks.'),
 Product(title='Belt Sander', description='With a sturdy build, this belt sander can tackle heavy-duty tasks while maintaining precision. Its quick-change belt system allows for rapid tool adjustments.'),
 Product(title='Sprayer', description='This versatile sprayer is ideal for painting, stain, and sealant applications. Its adjustable settings provide control over spray patterns and flow, letting you achieve the desired finish.')]

## Specify Types for Tool Calls

In [2]:
# Argument schema for the "ShippingDateRequest" tool. Only the tool's name is
# used by the code shown in this notebook; judging by the name it presumably
# looks up when a product would ship — confirm against the tool implementation.
class ShippingDateRequest(BaseModel):
    call_name: str = "ShippingDateRequest"  # defaults to the class name
    sku: str  # product identifier (presumably a stock-keeping unit)


# Argument schema for the "ShippingCostRequest" tool. Only the tool's name is
# used by the code shown in this notebook; presumably it prices shipping of a
# product to a destination — confirm against the tool implementation.
class ShippingCostRequest(BaseModel):
    call_name: str = "ShippingCostRequest"  # defaults to the class name
    sku: str  # product identifier (presumably a stock-keeping unit)
    shipping_location: str  # destination; expected format is not shown here


# Argument schema for the "QuestionAboutImageRequest" tool: an image URL plus
# a question about it.
# NOTE(review): the fields are identical to ExtractFromImageRequest; how the
# two tools differ is not visible in this notebook.
class QuestionAboutImageRequest(BaseModel):
    call_name: str = "QuestionAboutImageRequest"  # defaults to the class name
    image_url: str  # location of the image the question refers to
    question: str  # the question to answer about that image


# Argument schema for the "ProductDimensionsRequest" tool. Only the tool's
# name is used by the code shown in this notebook; presumably it looks up a
# product's physical dimensions — confirm against the tool implementation.
class ProductDimensionsRequest(BaseModel):
    call_name: str = "ProductDimensionsRequest"  # defaults to the class name
    sku: str  # product identifier (presumably a stock-keeping unit)


# Argument schema for the "PriceHistoryRequest" tool. Only the tool's name is
# used by the code shown in this notebook; presumably it returns past prices
# for a product — confirm against the tool implementation.
class PriceHistoryRequest(BaseModel):
    call_name: str = "PriceHistoryRequest"  # defaults to the class name
    sku: str  # product identifier (presumably a stock-keeping unit)


# Argument schema for the "ProductComparisonRequest" tool, which takes two
# product identifiers. What is compared is not visible in this notebook.
class ProductComparisonRequest(BaseModel):
    call_name: str = "ProductComparisonRequest"  # defaults to the class name
    sku1: str  # identifier of the first product
    sku2: str  # identifier of the second product


# Argument schema for the "LogDesiredFeatureRequest" tool. Judging by the name
# it presumably records a feature a user wishes a product had, which would
# make it a tool with side effects — confirm against the tool implementation.
class LogDesiredFeatureRequest(BaseModel):
    call_name: str = "LogDesiredFeatureRequest"  # defaults to the class name
    sku: str  # product identifier (presumably a stock-keeping unit)
    user_id: str  # identifier of the user making the request
    desired_feature: str  # free-text description of the wanted feature


# Argument schema for the "ExtractFromImageRequest" tool: an image URL plus a
# question describing what to pull out of it.
# NOTE(review): the fields are identical to QuestionAboutImageRequest; how the
# two tools differ is not visible in this notebook.
class ExtractFromImageRequest(BaseModel):
    call_name: str = "ExtractFromImageRequest"  # defaults to the class name
    image_url: str  # location of the image to inspect
    question: str  # what should be extracted from that image


# Argument schema for the "ProductMaterialsRequest" tool. Only the tool's name
# is used by the code shown in this notebook; presumably it looks up what a
# product is made of — confirm against the tool implementation.
class ProductMaterialsRequest(BaseModel):
    call_name: str = "ProductMaterialsRequest"  # defaults to the class name
    sku: str  # product identifier (presumably a stock-keeping unit)


# Union of every tool request model available to the chatbot. Adding a model
# here automatically adds its name to `all_tool_names` below.
ToolType = Union[
    ShippingDateRequest,
    ShippingCostRequest,
    QuestionAboutImageRequest,
    ProductDimensionsRequest,
    PriceHistoryRequest,
    ProductComparisonRequest,
    LogDesiredFeatureRequest,
    ExtractFromImageRequest,
    ProductMaterialsRequest,
]

# Class names of the Union members, in declaration order. These strings are
# what the prompts list as available tools and what the benchmark compares;
# each one also equals the `call_name` default of its class.
all_tool_names = [cls.__name__ for cls in ToolType.__args__]

## Generate Synthetic Questions

In [3]:
# Async OpenAI client wrapped with instructor, so the `chat.completions.create`
# calls below can pass `response_model` to get typed results back.
async_client = instructor.from_openai(AsyncOpenAI())


# A list of tool names. Used both as the ground-truth label attached to a
# synthetic question and as the structured response requested from the LLM
# when it chooses tools. Nothing here restricts the strings to
# `all_tool_names`.
class ToolNameList(BaseModel):
    tools: List[str]  # tool names; an empty list means "no tool needed"


# One synthetic benchmark example: a generated customer question together
# with the tools it was generated to require and the product it is about.
class QuestionWithTools(BaseModel):
    question: str  # the LLM-generated customer question
    required_tools: ToolNameList  # tools the question was written to need (ground truth)
    product: Product  # product whose title/description seeded the question


def random_tool_selection() -> ToolNameList:
    """Pick zero, one, or two distinct tool names at random.

    The count is drawn uniformly from {0, 1, 2}; the names are then sampled
    without replacement from ``all_tool_names``.

    Returns:
        A ``ToolNameList`` holding the chosen names (possibly empty).
    """
    how_many = random.choice([0, 1, 2])
    return ToolNameList(tools=random.sample(all_tool_names, how_many))


async def generate_synthetic_question(product: Product) -> QuestionWithTools:
    """Generate one synthetic customer question about ``product``.

    A random set of 0-2 tools is chosen first. When the set is non-empty the
    LLM is asked for a question that implicitly requires exactly those tools;
    when it is empty the LLM is asked for a question that needs none of the
    available tools.

    Args:
        product: The product whose title and description seed the question.

    Returns:
        A ``QuestionWithTools`` pairing the generated question with the tools
        it was asked to require (the ground-truth label) and the product.
    """
    tools_to_use = random_tool_selection()
    prompt = f"""
    Create a realistic question that a customer might ask an online chatbot about this product:
    {product.title}: {product.description}

    The customer knows this is just a programmatic chatbot. So they will be terse and lazy (possibly skipping whole/fully formed sentences).
    """
    # Test the inner list, not the model object: a ToolNameList instance is
    # always truthy, so checking it directly would never reach the no-tool
    # branch and zero-tool questions would be asked to "require" an empty list.
    if tools_to_use.tools:
        prompt += f"""The question should require using these tools: {tools_to_use.tools}
    
    Do not explicitly ask for any tool/function.

    For example:
    Instead of asking `how long shipping will take`, say `I need it by Friday. Can you make it?`
    Instead of asking for product dimensions, ask `Does this fit in a 3x7x4 case?`
    Instead of asking for the price history, ask `Is now a good time to buy?`

    Make it tricky to identify the tool(s) that would help an LLM to answer the question.
    Real questions tend to be implicit.

    Also, assume that we will not make a tool call to look something up if it is already in the product description.

    Respond with the question.
    """
    else:
        prompt += f"""Respond with a question that can be answered without calling any of these tools:
        {all_tool_names}
        """

    question = await async_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "You are creating synthetic questions for benchmarking tool retrieval in a retail chatbot.",
            },
            {"role": "user", "content": prompt},
        ],
        response_model=str,
        # High temperature to get varied questions across products.
        temperature=1.0,
    )
    return QuestionWithTools(
        question=question, required_tools=tools_to_use, product=product
    )


async def create_synthetic_dataset(
    products: List[Product],
    questions_per_product: int = 1,
    max_concurrency: int = 40,
) -> List[QuestionWithTools]:
    """Generate synthetic questions for every product concurrently.

    Args:
        products: Products to generate questions for.
        questions_per_product: How many questions to generate per product.
        max_concurrency: Upper bound on simultaneous LLM requests. Without a
            bound every request would be launched at once, which invites
            rate-limit errors on larger product sets; the default matches
            ``run_evaluation``.

    Returns:
        One ``QuestionWithTools`` per (product, repetition) pair, ordered by
        product and then repetition.
    """
    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded_generate(product: Product) -> QuestionWithTools:
        # Hold a semaphore slot for the duration of one generation request.
        async with semaphore:
            return await generate_synthetic_question(product)

    tasks = [
        bounded_generate(product)
        for product in products
        for _ in range(questions_per_product)
    ]
    # gather preserves the order of `tasks` in its result list.
    return await asyncio.gather(*tasks)


# Build the benchmark: two synthetic questions for every loaded product.
# (Top-level `await` works here because this runs as a notebook cell.)
synthetic_questions = await create_synthetic_dataset(products, questions_per_product=2)
print(f"Generated {len(synthetic_questions)} synthetic questions")

# Display three random examples for a quick sanity check.
random.sample(synthetic_questions, 3)

Generated 186 synthetic questions


[QuestionWithTools(question='Can it handle multiple tools at the same time without losing power?', required_tools=ToolNameList(tools=[]), product=Product(title='Air Compressor', description='With a powerful motor and quick recovery time, this air compressor delivers reliable performance for both home and professional use. It features multiple outlets for various tools.')),
 QuestionWithTools(question='I often work in poorly lit places. Can this help?', required_tools=ToolNameList(tools=[]), product=Product(title='Cordless Drill', description='Versatile and efficient, this cordless drill is designed for both drilling and driving screws with ease. It includes an LED light to illuminate dark work areas, ensuring you can work effectively at any time of day.')),
 QuestionWithTools(question='Is this hammer a good buy right now? Also, can you show more details in the image?', required_tools=ToolNameList(tools=['ExtractFromImageRequest', 'PriceHistoryRequest']), product=Product(title='Hammer',

## Testing What We Call

We'll write one function that chooses tools for a question (kept separate so you can reuse it elsewhere), and then another function that evaluates those choices.

In [4]:
async def choose_tools(question: str, product: Product) -> ToolNameList:
    """Ask the LLM which tools would help answer ``question``.

    The system prompt lists ``all_tool_names`` and includes the product's
    title and description so the model can skip lookups for information the
    description already contains. The prompt asks for 0, 1 or 2 names, but
    nothing here validates the returned names or their count.

    Args:
        question: The customer's question.
        product: The product the question refers to.

    Returns:
        The tool names chosen by the model. If the API call fails, the error
        is printed and an empty ``ToolNameList`` is returned, so a failed
        call is scored as "no tools predicted" rather than aborting a whole
        evaluation run.
    """
    try:
        response = await async_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": f"""What tools will help you handle this user question? 
                    Respond with the names of 0, 1 or 2 tools from this list: 
                    {all_tool_names}.

                    For context, this is a description of the product being referred to:
                    {product.title}: {product.description}

                    Don't make function calls to look up information that is already in the product description.
                    """,
                },
                {"role": "user", "content": question},
            ],
            # Temperature 0 keeps tool selection as repeatable as possible.
            temperature=0.0,
            response_model=ToolNameList,
        )
    except Exception as e:
        print(f"Error in API call: {str(e)}")
        # `response` was never assigned on this path; returning it would
        # raise UnboundLocalError and mask the real error printed above.
        return ToolNameList(tools=[])
    return response

# Try the tool chooser on the first synthetic question and display its answer.
example_question = synthetic_questions[0]
print(example_question.question)
await choose_tools(example_question.question, example_question.product)

Is this drill heavy for long use?


ToolNameList(tools=['ProductDimensionsRequest'])

In [5]:
# The outcome of evaluating one question: the tools it was generated to
# require next to the tools the model actually chose.
class ToolCallEvaluation(BaseModel):
    question: str  # the question that was evaluated
    expected_tools: ToolNameList  # ground truth from the synthetic dataset
    predicted_tools: ToolNameList  # tools returned by `choose_tools`


async def get_tool_call_evals(q: QuestionWithTools) -> ToolCallEvaluation:
    """Run tool selection for one synthetic question and record the outcome.

    Args:
        q: The synthetic question together with its ground-truth tools.

    Returns:
        A ``ToolCallEvaluation`` pairing the expected tools from ``q`` with
        the tools predicted by ``choose_tools``.
    """
    prediction = await choose_tools(q.question, q.product)
    evaluation = ToolCallEvaluation(
        question=q.question,
        expected_tools=q.required_tools,
        predicted_tools=prediction,
    )
    return evaluation


async def run_evaluation(
    synthetic_questions: List[QuestionWithTools], max_concurrency: int = 40
) -> List[ToolCallEvaluation]:
    """Evaluate tool selection for every question with bounded concurrency.

    Args:
        synthetic_questions: The benchmark questions to evaluate.
        max_concurrency: Maximum number of evaluations in flight at once.

    Returns:
        One ``ToolCallEvaluation`` per question, in input order.
    """
    limiter = asyncio.Semaphore(max_concurrency)

    async def evaluate_with_limit(item: QuestionWithTools):
        # Each evaluation holds one semaphore slot while it runs.
        async with limiter:
            outcome = await get_tool_call_evals(item)
        return outcome

    return await asyncio.gather(*map(evaluate_with_limit, synthetic_questions))


# Evaluate every synthetic question using the default concurrency limit.
evaluation_results = await run_evaluation(synthetic_questions)

In [6]:
def calculate_precision_recall(evaluation_results: List[ToolCallEvaluation]):
    """Compute micro-averaged precision and recall of tool selection.

    Tool names are compared as sets per question, so duplicates within one
    prediction are counted once. Counts are summed over all questions before
    the ratios are taken.

    Args:
        evaluation_results: Expected and predicted tools for each question.

    Returns:
        A ``(precision, recall)`` tuple. A ratio whose denominator would be
        zero is reported as 0.
    """
    hits = spurious = missed = 0
    for outcome in evaluation_results:
        wanted = set(outcome.expected_tools.tools)
        chosen = set(outcome.predicted_tools.tools)
        hits += len(wanted & chosen)  # true positives
        spurious += len(chosen - wanted)  # false positives
        missed += len(wanted - chosen)  # false negatives

    predicted_total = hits + spurious
    expected_total = hits + missed

    if predicted_total > 0:
        precision = hits / predicted_total
    else:
        precision = 0

    if expected_total > 0:
        recall = hits / expected_total
    else:
        recall = 0

    return precision, recall


# Score the whole evaluation run and report both metrics to two decimals.
precision, recall = calculate_precision_recall(evaluation_results)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.63
Recall: 0.80
