## Measuring Tool Call Choice

You can build many tools. But calling all tools for all queries would be slow (and possibly harmful if they have side effects).

We benchmark tool retrieval with synthetic questions much like how we measured document retrieval in week 1. This section shows how to create the tool retrieval benchmark, and we'll improve retrieval in week 4.


## Load Products

We will test tool recall on a sample of our product inventory. We load these products below.

In [1]:
import asyncio
from typing import List, Union
from pydantic import BaseModel, Field
import instructor
from openai import AsyncOpenAI
import lancedb
import random


# Minimal product record used throughout this notebook. Both fields are
# interpolated into the LLM prompts as "<title>: <description>".
class Product(BaseModel):
    title: str  # short product name, e.g. 'Level'
    description: str  # free-text description of the product


# Load the product table from the LanceDB directory under
# ../week1_bootstrap_evals and turn each row into a Product.
try:
    db = lancedb.connect("../week1_bootstrap_evals/lancedb")
    products_df = db.open_table("products").to_pandas()
    products = [
        Product(title=title, description=description)
        for title, description in zip(
            products_df["title"], products_df["description"]
        )
    ]
except Exception as e:
    print(
        "Error loading product data. Run the week1 course notebooks first to create the products DB"
    )
    print(f"Error: {e}")
    # Re-raise so the cell stops here with the real cause. Swallowing the
    # error would leave `products` undefined (or stale from an earlier run)
    # and the failure would only surface later as a confusing NameError.
    raise

# Show a few random products as a sanity check.
random.sample(products, 3)

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


[Product(title='Level', description='A 9-inch torpedo level that fits easily in your toolbox. The magnetic strip allows for hands-free use on metal surfaces.'),
 Product(title='Nail Gun', description='This cordless nail gun offers the convenience of battery power. The anti-jam mechanism ensures smooth operation.'),
 Product(title='Air Compressor', description='A portable 3-gallon air compressor with a built-in handle for easy transport. The quick-connect coupler allows for fast tool changes.')]

## Specify Types for Tool Calls

In [2]:
# Tool schema: shipping date lookup. The class name and docstring are rendered
# by describe_tools() into the text shown to the LLM, so treat them as prompt
# text rather than ordinary documentation.
class ShippingDateRequest(BaseModel):
    '''Check when a product will be shipped'''
    # NOTE(review): this is the only request model that declares `call_name`,
    # and nothing visible in this notebook reads it — confirm whether it is needed.
    call_name: str = "ShippingDateRequest"
    sku: str = Field(..., description="SKU of the product to check shipping date for")


# Tool schema: shipping cost lookup. The class name and docstring are rendered
# by describe_tools() into the text shown to the LLM.
class ShippingCostRequest(BaseModel):
    '''Check the cost of shipping a product'''
    # Both fields are required (`...` means no default).
    sku: str = Field(..., description="SKU of the product to check shipping cost for")
    shipping_location: str = Field(..., description="Location to ship to")

# Tool schema: product dimensions lookup. The class name and docstring are
# rendered by describe_tools() into the text shown to the LLM.
class ProductDimensionsRequest(BaseModel):
    '''Check the dimensions of a product'''
    sku: str = Field(..., description="SKU of the product to check dimensions for")


# Tool schema: price history lookup. The class name and docstring are rendered
# by describe_tools() into the text shown to the LLM.
class PriceHistoryRequest(BaseModel):
    '''Check the price history of a product (e.g. identifying historical price fluctuations)'''
    sku: str = Field(..., description="SKU of the product to check price history for")


# Tool schema: two-product comparison. Unlike the single-`sku` tools, this one
# takes two SKUs. The class name and docstring are rendered by describe_tools()
# into the text shown to the LLM.
class ProductComparisonRequest(BaseModel):
    '''Compare two products'''
    sku1: str = Field(..., description="SKU of the first product to compare")
    sku2: str = Field(..., description="SKU of the second product to compare")


# Tool schema: record a feature request from a user. The class name and
# docstring are rendered by describe_tools() into the text shown to the LLM.
# NOTE(review): judging by its description this tool writes data rather than
# reading it — confirm against the real implementation before calling it freely.
class LogDesiredFeatureRequest(BaseModel):
    '''Record a user's desire for a certain product feature'''
    sku: str = Field(..., description="SKU of the product to log a desired feature for")
    user_id: str = Field(..., description="User ID to log the desired feature for")
    desired_feature: str = Field(..., description="Desired feature to log")


# Tool schema: ask a question about a product image. This is the only tool
# keyed by an image URL instead of a SKU. The class name and docstring are
# rendered by describe_tools() into the text shown to the LLM.
class ExtractDataFromImageRequest(BaseModel):
    '''Use our product images with multimodal llm to extract info about the product'''
    image_url: str = Field(..., description="URL of the image to examine")
    question: str = Field(..., description="Question to answer about the image")


# Tool schema: product materials lookup. The class name and docstring are
# rendered by describe_tools() into the text shown to the LLM.
class ProductMaterialsRequest(BaseModel):
    '''Check what materials a product is made of'''
    sku: str = Field(..., description="SKU of the product to check materials for")


# Union of every tool request model. Besides acting as a type alias, its
# `__args__` tuple serves as the registry of available tool classes: it is
# what gets sampled from and what gets described to the LLM.
FunctionOption = Union[
    ShippingDateRequest,
    ShippingCostRequest,
    ProductDimensionsRequest,
    PriceHistoryRequest,
    ProductComparisonRequest,
    LogDesiredFeatureRequest,
    ExtractDataFromImageRequest,
    ProductMaterialsRequest,
]

## Generate Synthetic Questions

In [3]:
# Async OpenAI client wrapped by instructor. The calls in this notebook pass
# `response_model=` to it to get typed results back.
async_client = instructor.from_openai(AsyncOpenAI())

def describe_tools(tools: List[FunctionOption]) -> str:
    """Render the given tool classes as text for a prompt.

    Each tool becomes one ``<class name>: <class docstring>`` line; lines are
    joined with newlines. An empty collection yields an empty string.
    """
    rendered_lines = []
    for tool_cls in tools:
        rendered_lines.append(f"{tool_cls.__name__}: {tool_cls.__doc__}")
    return "\n".join(rendered_lines)

# A list of tool class names. Used as the expected label on synthetic
# questions and as the structured `response_model` returned by the LLM when
# choosing tools. Documented with comments only, so the model's own docstring
# is left untouched.
class FunctionList(BaseModel):
    func_names: List[str]  # e.g. ['ShippingDateRequest', 'ProductComparisonRequest']

# One benchmark example: a synthetic customer question, the tools it was
# generated to require, and the product it is about.
class QuestionWithTools(BaseModel):
    question: str  # question text produced by the LLM
    required_tools: FunctionList  # names of the tools the question was asked to require
    product: Product  # product the question refers to


def random_tool_selection() -> List[FunctionOption]:
    """Draw a random count of 0, 1 or 2, then sample that many distinct tool classes."""
    how_many = random.choice([0, 1, 2])
    available_tools = FunctionOption.__args__
    return random.sample(available_tools, how_many)

async def generate_synthetic_question(product: Product) -> QuestionWithTools:
    """Generate one synthetic customer question about ``product``.

    A random set of 0-2 tools is drawn first. The LLM is then asked for a
    terse customer question that should implicitly require those tools, or,
    when no tool was drawn, a question that can be answered without any tool.

    Args:
        product: Product whose title and description are embedded in the prompt.

    Returns:
        A QuestionWithTools holding the generated question, the names of the
        drawn tools as the expected label, and the product itself.
    """
    tools_to_use = random_tool_selection()
    # Shared preamble; one of the two branches below appends the task-specific part.
    prompt = f"""
    Create a realistic question a customer might ask a support chatbot about this product:
    {product.title}: {product.description}

    The customer knows this is a programmatic chatbot. So they will be terse and lazy (possibly skipping whole/fully formed sentences).
    """
    if tools_to_use:
        # Ask for a question that needs the drawn tools without naming them.
        prompt += f"""The question should require using these function calls: {describe_tools(tools_to_use)}
    
    Do not explicitly ask for the function. Instead, ask a question that happens to answerable by calling the function.

    For example:
    Instead of asking `how long shipping will take`, say `I need it by Friday. Can you make it?`
    Instead of asking for product dimensions, ask `Would this fit in a 3x7x4 case?`
    Instead of asking for the price history, ask `Is now a good time to buy?`

    Real questions tend to be implicit.
    Ask questions where it is hard to identify what tool(s) would help an LLM to answer the question.
    Assume that we will not make a tool call to look something up if it is already in the product description.

    Respond with the question.
    """
    else:
        # No tools drawn: list every tool so the model can avoid all of them.
        prompt += f"""Respond with a question that can be answered without calling any of these tools:
        {describe_tools(FunctionOption.__args__)}
        """

    # response_model=str asks for the bare question text as the result.
    question = await async_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "You are creating synthetic questions for benchmarking tool retrieval in a retail chatbot.",
            },
            {"role": "user", "content": prompt},
        ],
        response_model=str,
        temperature=0.,
    )
    # The label is the set of tools we asked for; nothing here verifies that
    # the generated question actually requires them.
    tools_names = FunctionList(func_names=[tool.__name__ for tool in tools_to_use])
    return QuestionWithTools(
        question=question, required_tools=tools_names, product=product
    )


async def create_synthetic_dataset(
    products: List[Product],
    questions_per_product: int,
    max_concurrency: int = 40,
) -> List[QuestionWithTools]:
    """Generate ``questions_per_product`` synthetic questions for every product.

    Args:
        products: Products to generate questions about.
        questions_per_product: Number of questions to generate per product.
        max_concurrency: Upper bound on the number of question-generation
            calls in flight at once. Without a bound, every request would be
            issued simultaneously.

    Returns:
        One QuestionWithTools per (product, repetition) pair, in the same
        order as the products were given.
    """
    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded_generate(product: Product) -> QuestionWithTools:
        # Hold a semaphore slot for the duration of one LLM request.
        async with semaphore:
            return await generate_synthetic_question(product)

    tasks = [
        bounded_generate(product)
        for product in products
        for _ in range(questions_per_product)
    ]
    # gather() returns results in the order the awaitables were passed in.
    return await asyncio.gather(*tasks)


# Build the benchmark: two questions per product. Top-level `await` is used
# because this runs as a notebook cell.
synthetic_questions = await create_synthetic_dataset(products, questions_per_product=2)
print(f"Generated {len(synthetic_questions)} synthetic questions")

# Display a few random examples for a manual sanity check.
random.sample(synthetic_questions, 3)

Generated 180 synthetic questions


[QuestionWithTools(question='Has the price of this pliers set been stable? Also, what are they made of?', required_tools=FunctionList(func_names=['PriceHistoryRequest', 'ProductMaterialsRequest']), product=Product(title='Pliers Set', description='This 3-piece pliers set is perfect for household repairs. The cushioned grips provide comfort and control.')),
 QuestionWithTools(question="Will this fit in my toolbox? It's about 15 inches long and 5 inches wide.", required_tools=FunctionList(func_names=['ProductDimensionsRequest', 'ExtractDataFromImageRequest']), product=Product(title='Caulking Gun', description='A lightweight caulking gun with a built-in cutter and seal punch. The ladder hook provides convenient storage.')),
 QuestionWithTools(question='If I order the tape measure today, how much will shipping cost and when will it arrive?', required_tools=FunctionList(func_names=['ShippingDateRequest', 'ShippingCostRequest']), product=Product(title='Tape Measure', description='A 25-foot ta

## Test Whether Calling Correct Functions

We'll write one function that retrieves tools (kept separate so you can reuse it broadly), and then a second function that evaluates its choices.

In [4]:
async def choose_tools(question: str, product: Product) -> FunctionList:
    """Ask the LLM which tools would help answer ``question`` about ``product``.

    Args:
        question: The user's question.
        product: Product the question refers to; its title and description
            are included in the system prompt as context.

    Returns:
        The tool names chosen by the model. If the API call fails, the error
        is printed and an empty FunctionList is returned, so a single failed
        request does not abort a whole batch of evaluations.
    """
    try:
        response = await async_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": f"""Identify the tools that will help you answer the user's question? 
                    Respond with the names of 0, 1 or 2 tools to use. The available tools are
                    {describe_tools(FunctionOption.__args__)}.

                    For context, the user's question is about the following product:
                    {product.title}: {product.description}

                    Don't make function calls to look up information that is already in the product description.
                    """,
                },
                {"role": "user", "content": question},
            ],
            temperature=0.0,
            response_model=FunctionList,
        )
    except Exception as e:
        print(f"Error in API call: {e}")
        # `response` was never bound on this path, so falling through to
        # `return response` would raise UnboundLocalError. Return an explicit
        # "no tools" answer instead.
        return FunctionList(func_names=[])
    return response

# Sanity check: run tool selection on the first synthetic question and
# display the returned tool names.
example_question = synthetic_questions[0]
print(example_question.question)
await choose_tools(example_question.question, example_question.product)

I need a hammer soon. If I order today, when will it arrive? Also, how does this hammer compare to the 20 oz claw hammer you have?


FunctionList(func_names=['ShippingDateRequest', 'ProductComparisonRequest'])

In [5]:
# Result of evaluating one question: the tools it was generated to require
# next to the tools the model actually chose.
class ToolCallEvaluation(BaseModel):
    question: str  # the synthetic question text
    expected: FunctionList  # tool names the question was generated to require
    predicted: FunctionList  # tool names returned by choose_tools


async def get_tool_call_evals(q: QuestionWithTools) -> ToolCallEvaluation:
    """Run tool selection for one synthetic question and pair it with its label."""
    question_text = q.question
    chosen_tools = await choose_tools(question_text, q.product)
    evaluation = ToolCallEvaluation(
        question=question_text,
        expected=q.required_tools,
        predicted=chosen_tools,
    )
    return evaluation


async def run_evaluation(
    synthetic_questions: List[QuestionWithTools], max_concurrency: int = 40
) -> List[ToolCallEvaluation]:
    """Evaluate tool selection on every question with bounded concurrency.

    At most ``max_concurrency`` evaluations run at the same time. Results are
    returned in the same order as ``synthetic_questions``.
    """
    limiter = asyncio.Semaphore(max_concurrency)

    async def evaluate_one(item: QuestionWithTools):
        # Explicit acquire/release, equivalent to `async with limiter`.
        await limiter.acquire()
        try:
            return await get_tool_call_evals(item)
        finally:
            limiter.release()

    pending = []
    for item in synthetic_questions:
        pending.append(evaluate_one(item))
    results = await asyncio.gather(*pending)
    return results


# Evaluate every synthetic question using run_evaluation's default concurrency limit.
evaluation_results = await run_evaluation(synthetic_questions)

In [6]:
def calculate_precision_recall(evaluation_results: List[ToolCallEvaluation]):
    """Compute precision and recall of predicted tool names, pooled over all results.

    Tool names are compared as sets per question, so duplicates within one
    list count once. Counts are summed across questions before dividing.

    Returns:
        A ``(precision, recall)`` tuple. A metric whose denominator is zero
        is reported as 0.
    """
    hits = 0
    spurious = 0
    missed = 0
    for item in evaluation_results:
        wanted = set(item.expected.func_names)
        chosen = set(item.predicted.func_names)
        hits += len(wanted & chosen)  # expected and predicted
        spurious += len(chosen - wanted)  # predicted but not expected
        missed += len(wanted - chosen)  # expected but not predicted

    predicted_total = hits + spurious
    expected_total = hits + missed

    if predicted_total > 0:
        precision = hits / predicted_total
    else:
        precision = 0

    if expected_total > 0:
        recall = hits / expected_total
    else:
        recall = 0

    return precision, recall


# Aggregate the metrics over all evaluated questions and print them to two decimals.
precision, recall = calculate_precision_recall(evaluation_results)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.71
Recall: 0.97
