# Evaluation Playground

Run agent evaluations and visualize results.
Uses evaluation datasets from `data/eval/`.

In [None]:
import requests
import json

# Connection settings for the agent API that the evaluation cells call.
API_URL = "http://localhost:4050"
API_KEY = ""  # leave empty to send requests without an Authorization header

# Requests always declare a JSON body; the bearer token is included only
# when an API key has been configured above.
headers = {
    "Content-Type": "application/json",
    **({"Authorization": f"Bearer {API_KEY}"} if API_KEY else {}),
}

print("Evaluation Playground Ready")

In [None]:
# Load an evaluation dataset
# The relative path is resolved against the current working directory.
# The encoding is pinned to UTF-8 so that non-ASCII prompts decode the same
# way on every platform instead of depending on the locale's default codec.
with open("../data/eval/ceo-basic.json", encoding="utf-8") as f:
    dataset = json.load(f)

print(f"Dataset: {dataset['name']}")
print(f"Cases: {len(dataset['cases'])}")
for case in dataset['cases']:
    prompt = case['input']['prompt']
    # Preview only: show at most 60 characters and add an ellipsis only when
    # the prompt was actually cut short.
    preview = f"{prompt[:60]}..." if len(prompt) > 60 else prompt
    print(f"  [{case['id']}] {preview}")

In [None]:
# Run evaluation cases via the API
#
# Each case produces exactly one entry in `results`:
#   parsed response -> {"case_id", "status", "response"}
#   failure         -> {"case_id", "error"}, plus "status" when an HTTP
#                      response arrived but its body was not valid JSON
results = []
for case in dataset['cases']:
    print(f"Running {case['id']}...", end=" ")
    try:
        r = requests.post(
            f"{API_URL}/api/agents/{case['agentId']}/run",
            headers=headers,
            json={"input": case['input']},
            timeout=30,
        )
    except Exception as e:
        # Deliberately broad: a single failing case (connection error,
        # timeout, malformed case record) must not abort the whole batch.
        results.append({"case_id": case['id'], "error": str(e)})
        print(f"ERROR: {e}")
        continue

    try:
        body = r.json()
    except ValueError as e:
        # The server answered, but not with JSON. Keep the status code so the
        # failure can still be diagnosed from the collected results.
        results.append({"case_id": case['id'], "status": r.status_code, "error": str(e)})
        print(f"ERROR: HTTP {r.status_code}: {e}")
        continue

    results.append({"case_id": case['id'], "status": r.status_code, "response": body})
    print(f"HTTP {r.status_code}")

# Count only the cases that yielded a parsed response; failed cases are also
# stored in `results`, so len(results) would always equal the number of cases.
succeeded = sum(1 for entry in results if "error" not in entry)
print(f"\nCompleted {succeeded} / {len(dataset['cases'])} cases")