# Week 10: Agent Benchmarking

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Digital-AI-Finance/agentic-artificial-intelligence/blob/main/L10_Agent_Evaluation/L10_Benchmarking.ipynb)

This notebook builds a simple agent evaluation framework that reports several metrics: success rate, latency, token usage, and step count.

In [None]:
import sys
# Colab-only setup: this branch runs only when the google.colab module is
# already loaded in the current interpreter.
if 'google.colab' in sys.modules:
    # IPython shell escape (not plain Python): quietly installs the packages
    # that the following cell imports.
    !pip install -q langchain-openai python-dotenv
    from google.colab import userdata
    import os
    # Copy the value returned by userdata.get('OPENAI_API_KEY') into the
    # process environment under the same name.
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [None]:
import time
from typing import List, Dict, Callable
from dataclasses import dataclass
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) before the chat
# client is created.
load_dotenv()

# Chat model shared by the rest of the notebook, created with temperature 0.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
)
print("Ready")

## 1. Define Evaluation Metrics

In [None]:
@dataclass
class EvalResult:
    """Outcome of evaluating the agent on a single benchmark task."""

    # Identifier of the evaluated task (taken from the task's "id" entry).
    task_id: str
    # True when the judge's reply marked the answer as correct.
    success: bool
    # Time spent inside agent.solve(), in milliseconds.
    latency_ms: float
    # Value of the agent's `tokens` counter after solving the task.
    token_count: int
    # Value of the agent's `steps` counter after solving the task.
    steps: int
    # The answer text returned by the agent.
    answer: str

@dataclass
class BenchmarkResult:
    """Aggregate metrics over a full benchmark run plus the per-task results."""

    # Fraction of tasks whose result was marked successful (0.0 to 1.0).
    success_rate: float
    # Mean of the per-task latency_ms values.
    avg_latency_ms: float
    # Mean of the per-task token_count values.
    avg_tokens: float
    # Mean of the per-task steps values.
    avg_steps: float
    # Individual EvalResult records, in the order the tasks were evaluated.
    results: List[EvalResult]

## 2. Simple Agent Implementation

In [None]:
class SimpleAgent:
    """Three-step LLM agent: understand the task, plan, then execute the plan.

    After each ``solve`` call, ``steps`` holds the number of LLM calls made
    and ``tokens`` holds the token usage accumulated over those calls.

    Args:
        llm: Chat model exposing ``invoke(prompt)`` whose result has a
            string ``content`` attribute (for example ``ChatOpenAI``).
    """

    def __init__(self, llm):
        self.llm = llm
        self.steps = 0
        self.tokens = 0

    @staticmethod
    def _count_tokens(response, text: str) -> int:
        """Return the token usage attributable to one model response.

        Uses the provider-reported ``usage_metadata["total_tokens"]``
        (prompt plus completion) when the response carries it. Otherwise
        falls back to the number of whitespace-separated words in ``text``,
        which is only a rough approximation of real token usage.
        """
        usage = getattr(response, "usage_metadata", None)
        if isinstance(usage, dict):
            total = usage.get("total_tokens")
            if isinstance(total, int):
                return total
        return len(text.split())

    def _ask(self, prompt: str) -> str:
        """Send one prompt to the model, update the counters, return its text."""
        # Count the step before the call so a failed call is still recorded.
        self.steps += 1
        response = self.llm.invoke(prompt)
        text = response.content
        self.tokens += self._count_tokens(response, text)
        return text

    def solve(self, task: str) -> str:
        """Solve ``task`` with three sequential LLM calls and return the answer.

        The ``steps`` and ``tokens`` counters are reset at the start of every
        call, so they always describe the most recent task only.

        Args:
            task: Natural-language description of the task or question.

        Returns:
            The text of the model's final (execution) response.
        """
        self.steps = 0
        self.tokens = 0

        # Step 1: Understand task.
        # NOTE: the explanation text is not fed into the later prompts; the
        # call is kept so step and token counts reflect the three-step flow.
        self._ask(f"Briefly explain what this task requires: {task}")

        # Step 2: Plan
        plan = self._ask(f"Create a step-by-step plan to solve: {task}")

        # Step 3: Execute
        return self._ask(
            f"Execute this plan and provide the answer:\nTask: {task}\nPlan: {plan}"
        )

# Build the agent around the chat model created in the setup cell.
agent = SimpleAgent(llm=llm)
print("Agent ready")

## 3. Evaluation Framework

In [None]:
def _judged_correct(judgment: str) -> bool:
    """Return True only when the first YES/NO word in the judge reply is YES.

    Words are compared case-insensitively after stripping surrounding
    punctuation, so replies such as "Yes." or "**YES**" count as YES, while
    words that merely contain the letters (e.g. "YESTERDAY") do not. A reply
    with no standalone YES or NO is treated as a failure.
    """
    for word in judgment.upper().split():
        verdict = word.strip(".,:;!?*\"'`()[]")
        if verdict in ("YES", "NO"):
            return verdict == "YES"
    return False


def evaluate_task(agent: SimpleAgent, task: Dict, judge_llm) -> EvalResult:
    """Evaluate agent on a single task.

    Args:
        agent: Agent whose ``solve`` method is called with the task question
            and whose ``tokens`` / ``steps`` counters are read afterwards.
        task: Mapping with the keys ``"id"``, ``"question"`` and
            ``"expected"``.
        judge_llm: Chat model used as the judge; it must expose
            ``invoke(prompt)`` returning an object with a string ``content``.

    Returns:
        An ``EvalResult`` with the judged success flag, the time spent in
        ``agent.solve`` in milliseconds, the agent's counters and its answer.

    Raises:
        KeyError: If ``task`` lacks one of the required keys.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    answer = agent.solve(task["question"])
    latency = (time.perf_counter() - start) * 1000

    # Judge correctness
    judge_prompt = f"""Is this answer correct for the question?
Question: {task['question']}
Expected: {task['expected']}
Actual: {answer}

Reply YES or NO only."""

    judgment = judge_llm.invoke(judge_prompt).content
    success = _judged_correct(judgment)

    return EvalResult(
        task_id=task["id"],
        success=success,
        latency_ms=latency,
        token_count=agent.tokens,
        steps=agent.steps,
        answer=answer,
    )

def run_benchmark(agent: SimpleAgent, tasks: List[Dict], judge_llm=None) -> BenchmarkResult:
    """Run full benchmark.

    Evaluates ``agent`` on every task in order, printing progress and a
    PASS/FAIL line per task, then aggregates the per-task results.

    Args:
        agent: Agent to evaluate.
        tasks: Task mappings, each with ``"id"``, ``"question"`` and
            ``"expected"`` keys.
        judge_llm: Optional chat model used to judge answers. When omitted,
            a ``ChatOpenAI`` "gpt-4o-mini" judge with temperature 0 is created.

    Returns:
        A ``BenchmarkResult`` with the success rate, the mean latency, token
        count and step count, and the individual results. For an empty
        ``tasks`` list all aggregates are 0.0 and ``results`` is empty.
    """
    # Nothing to evaluate: return zeroed metrics instead of dividing by zero
    # (and skip building a judge client that would never be used).
    if not tasks:
        return BenchmarkResult(
            success_rate=0.0,
            avg_latency_ms=0.0,
            avg_tokens=0.0,
            avg_steps=0.0,
            results=[],
        )

    if judge_llm is None:
        judge_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    total = len(tasks)
    results = []
    for i, task in enumerate(tasks, start=1):
        print(f"Evaluating task {i}/{total}...")
        result = evaluate_task(agent, task, judge_llm)
        results.append(result)
        print(f"  {'PASS' if result.success else 'FAIL'} ({result.latency_ms:.0f}ms)")

    return BenchmarkResult(
        success_rate=sum(r.success for r in results) / total,
        avg_latency_ms=sum(r.latency_ms for r in results) / total,
        avg_tokens=sum(r.token_count for r in results) / total,
        avg_steps=sum(r.steps for r in results) / total,
        results=results,
    )

## 4. Run Benchmark

In [None]:
# Sample benchmark tasks, written as (id, question, expected answer) rows and
# expanded into the dict shape that the evaluation functions read.
tasks = [
    {"id": task_id, "question": question, "expected": expected}
    for task_id, question, expected in (
        ("math1", "What is 15 * 23?", "345"),
        ("math2", "What is the square root of 144?", "12"),
        (
            "logic1",
            "If all cats are animals and some animals are pets, can we conclude all cats are pets?",
            "No",
        ),
        ("knowledge1", "What is the capital of France?", "Paris"),
        ("knowledge2", "Who wrote Romeo and Juliet?", "Shakespeare"),
    )
]

benchmark_result = run_benchmark(agent, tasks)

# Print the aggregate report, one metric per line.
print(
    "\n=== Benchmark Results ===",
    f"Success Rate: {benchmark_result.success_rate:.1%}",
    f"Avg Latency: {benchmark_result.avg_latency_ms:.0f}ms",
    f"Avg Tokens: {benchmark_result.avg_tokens:.0f}",
    f"Avg Steps: {benchmark_result.avg_steps:.1f}",
    sep="\n",
)

## Summary

Built a simple agent benchmarking framework with:
- **Metrics**: Success rate, latency, token usage, step count
- **LLM-as-Judge**: Automated correctness evaluation
- **Extensible**: Easy to add new task types