[Reference](https://pub.towardsai.net/toon-vs-json-a-comprehensive-performance-comparison-446a2fb82f20)

# Project Setup

```
openai>=1.0.0
pandas>=2.0.0
matplotlib>=3.7.0
seaborn>=0.12.0
python-dotenv>=1.0.0
git+https://github.com/toon-format/toon-python.git
faker
```

```
# .env file
OPENAI_API_KEY = "YOUR_KEY_HERE"
```

# Dataset

In [3]:
!pip install faker

Collecting faker
  Downloading faker-38.2.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-38.2.0-py3-none-any.whl (2.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 24.0 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-38.2.0


In [4]:
from faker import Faker
import random
import re

# List of departments to assign employees to
DEPARTMENTS = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance']

def slugify(name: str) -> str:
    """Convert a name to a lowercase ASCII alphanumeric slug suitable for email.

    Accented letters are decomposed (NFKD) first so that their base letter is
    kept ("José" -> "jose") instead of being dropped together with the accent.
    Anything that is still not ``a-z`` or ``0-9`` afterwards (spaces,
    punctuation, symbols, letters with no ASCII base) is removed.

    Args:
        name: Arbitrary text, typically one part of a person's name.

    Returns:
        The slug. It is empty when ``name`` contains no letter or digit that
        can be represented in ASCII.
    """
    import unicodedata  # function-scope import: the cell's import list stays untouched

    decomposed = unicodedata.normalize("NFKD", name)
    # Encoding with "ignore" discards the combining marks left by NFKD
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^a-z0-9]+", "", ascii_only.lower())

def generate_employees(count: int, seed: int = None):
    """
    Generate synthetic employee records.

    Args:
        count: Number of employees to generate.
        seed: Optional seed. When given, the Faker instance and the number
            generator used for experience / salary / active flags are both
            seeded with it. When None, the shared ``random`` module state and
            an unseeded Faker instance are used.

    Returns:
        Dictionary with a single 'employees' key holding a list of dicts with
        the keys id, name, email, department, salary, yearsExperience, active.
    """
    faker = Faker()
    if seed is not None:
        faker.seed_instance(seed)
        # A private generator produces the same draws as random.seed(seed)
        # would, without resetting the process-wide random state for callers.
        rng = random.Random(seed)
    else:
        rng = random

    employees = []

    for i in range(count):
        full_name = faker.name()
        parts = full_name.split()

        # Email is built from the first and last whitespace-separated tokens,
        # so middle names are skipped.
        # NOTE(review): if Faker emits an honorific or suffix (e.g. "Dr."),
        # that token presumably ends up in the address — verify if it matters.
        first = slugify(parts[0])
        last = slugify(parts[-1])
        email = f"{first}_{last}@example.com"

        # Draw order (experience, salary, active) determines which value each
        # field gets for a given seed; keep it stable.
        years_exp = rng.randint(1, 25)
        salary = rng.randint(45000, 150000)
        is_active = rng.random() < 0.8  # 80% active

        employees.append({
            "id": i + 1,
            "name": full_name,
            "email": email,
            "department": DEPARTMENTS[i % len(DEPARTMENTS)],  # Round-robin assignment
            "salary": salary,
            "yearsExperience": years_exp,
            "active": is_active,
        })

    return {"employees": employees}


# Generate 100 employees; passing a fixed seed (42) makes the run repeatable

dataset = generate_employees(100, seed=42)
print(dataset["employees"][:5])  # preview first 5

[{'id': 1, 'name': 'Allison Hill', 'email': 'allison_hill@example.com', 'department': 'Engineering', 'salary': 59592, 'yearsExperience': 21, 'active': True}, {'id': 2, 'name': 'Noah Rhodes', 'email': 'noah_rhodes@example.com', 'department': 'Sales', 'salary': 77098, 'yearsExperience': 9, 'active': True}, {'id': 3, 'name': 'Angie Henderson', 'email': 'angie_henderson@example.com', 'department': 'Marketing', 'salary': 58434, 'yearsExperience': 24, 'active': True}, {'id': 4, 'name': 'Daniel Wagner', 'email': 'daniel_wagner@example.com', 'department': 'HR', 'salary': 56395, 'yearsExperience': 18, 'active': True}, {'id': 5, 'name': 'Cristian Santos', 'email': 'cristian_santos@example.com', 'department': 'Operations', 'salary': 48905, 'yearsExperience': 2, 'active': True}]


# Question Generation

In [5]:
# Flat list of employee records that the questions below are derived from
employees = dataset["employees"]

# Collects the field-retrieval and aggregation questions as they are created
questions = []

def q(id, prompt, gt, qtype, ans):
    """Build one question record.

    The arguments are stored under the keys "id", "prompt", "groundTruth",
    "type" and "answerType"; "dataset" is always "tabular".
    """
    record = dict(id=id, prompt=prompt, groundTruth=gt, type=qtype)
    record["dataset"] = "tabular"
    record["answerType"] = ans
    return record

# ---- FIELD RETRIEVAL (30) ----
# Three questions (salary, department, email) for each of the first 10 employees
for i, emp in enumerate(employees[:10]):  # 10 employees * 3 fields = 30
    questions.append(q(f"fr{i*3+1}", f"What is the salary of {emp['name']}?", str(emp["salary"]), "field-retrieval", "integer"))
    questions.append(q(f"fr{i*3+2}", f"What department does {emp['name']} work in?", emp["department"], "field-retrieval", "string"))
    questions.append(q(f"fr{i*3+3}", f"What is the email address of {emp['name']}?", emp["email"], "field-retrieval", "string"))

# ---- AGGREGATION (30) ----
# IDs run sequentially ag1..ag30 across the four groups below.

# 1) Count by department (6 questions: ag1..ag6)
dept_counts = {d: sum(1 for e in employees if e["department"] == d) for d in DEPARTMENTS}
for idx, (d, c) in enumerate(dept_counts.items()):
    questions.append(q(f"ag{idx+1}", f"How many employees work in {d}?", str(c), "aggregation", "integer"))

# 2) Salary thresholds (10 questions: ag7..ag16)
thresholds = [50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000]
for i, t in enumerate(thresholds):
    c = sum(1 for e in employees if e["salary"] > t)
    questions.append(q(f"ag{7+i}", f"How many employees have a salary greater than {t}?", str(c), "aggregation", "integer"))

# 3) General statistics (4 questions: ag17..ag20)
total = len(employees)
avg_sal = round(sum(e["salary"] for e in employees)/total)
active = sum(e["active"] for e in employees)  # booleans sum as 0/1
inactive = total - active

questions.append(q("ag17", "How many employees are in the dataset?", str(total), "aggregation", "integer"))
questions.append(q("ag18", "What is the average salary across all employees?", str(avg_sal), "aggregation", "integer"))
questions.append(q("ag19", "How many employees are active?", str(active), "aggregation", "integer"))
questions.append(q("ag20", "How many employees are inactive?", str(inactive), "aggregation", "integer"))

# 4) Experience thresholds (10 questions: ag21..ag30)
exp_levels = [2, 5, 8, 10, 12, 15, 18, 20, 22, 24]
for i, t in enumerate(exp_levels):
    c = sum(1 for e in employees if e["yearsExperience"] > t)
    # 21+i (arithmetic, not string concatenation) continues the sequence after ag20
    questions.append(q(f"ag{21+i}", f"How many employees have more than {t} years of experience?", str(c), "aggregation", "integer"))

# Select each group by type rather than by position in the list
field_questions = [item for item in questions if item["type"] == "field-retrieval"]
agg_questions = [item for item in questions if item["type"] == "aggregation"][:30]

# ---- FILTERING (30) ----
# IDs run sequentially ft1..ft30 across the four groups below.
filter_questions = []

# 1) Department + salary filter (6 questions: ft1..ft6)
dept = DEPARTMENTS
for i, d in enumerate(dept):
    c = sum(1 for e in employees if e["department"] == d and e["salary"] > 90000)
    filter_questions.append(q(f"ft{i+1}", f"How many employees in {d} have a salary greater than 90000?", str(c), "filtering", "integer"))

# 2) Active + experience filter (10 questions: ft7..ft16)
for idx, t in enumerate(exp_levels):
    c = sum(1 for e in employees if e["active"] and e["yearsExperience"] > t)
    filter_questions.append(q(f"ft{7+idx}", f"How many active employees have more than {t} years of experience?", str(c), "filtering", "integer"))

# 3) Department + active filter (6 questions: ft17..ft22)
for i, d in enumerate(dept):
    c = sum(1 for e in employees if e["department"] == d and e["active"])
    filter_questions.append(q(f"ft{17+i}", f"How many active employees work in {d}?", str(c), "filtering", "integer"))

# 4) Department + experience filter (8 questions: ft23..ft30)
# Departments are cycled so each threshold is paired with one department
for i, t in enumerate(exp_levels[:8]):
    d = DEPARTMENTS[i % len(DEPARTMENTS)]
    c = sum(1 for e in employees if e["yearsExperience"] > t and e["department"] == d)
    filter_questions.append(q(f"ft{23+i}", f"How many employees in {d} have more than {t} years of experience?", str(c), "filtering", "integer"))

# Trim filter to 30
filter_questions = filter_questions[:30]

# Combine all question types: 30 field retrieval + 30 aggregation + 30 filtering = 90 total
final_questions = field_questions[:30] + agg_questions + filter_questions

# printing first 5 questions
len(final_questions), final_questions[:5]

(90,
 [{'id': 'fr1',
   'prompt': 'What is the salary of Allison Hill?',
   'groundTruth': '59592',
   'type': 'field-retrieval',
   'dataset': 'tabular',
   'answerType': 'integer'},
  {'id': 'fr2',
   'prompt': 'What department does Allison Hill work in?',
   'groundTruth': 'Engineering',
   'type': 'field-retrieval',
   'dataset': 'tabular',
   'answerType': 'string'},
  {'id': 'fr3',
   'prompt': 'What is the email address of Allison Hill?',
   'groundTruth': 'allison_hill@example.com',
   'type': 'field-retrieval',
   'dataset': 'tabular',
   'answerType': 'string'},
  {'id': 'fr4',
   'prompt': 'What is the salary of Noah Rhodes?',
   'groundTruth': '77098',
   'type': 'field-retrieval',
   'dataset': 'tabular',
   'answerType': 'integer'},
  {'id': 'fr5',
   'prompt': 'What department does Noah Rhodes work in?',
   'groundTruth': 'Sales',
   'type': 'field-retrieval',
   'dataset': 'tabular',
   'answerType': 'string'}])

# Evaluation

## Importing Required Libraries


In [6]:
import json
import os
from openai import OpenAI
from toon_format import encode, decode
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment, then
# pass the OpenAI key to the client explicitly
load_dotenv()
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

print("Libraries imported successfully!")

## Convert Employee Data to TOON Format

In [7]:
# Serialize the same dataset dict into the two prompt formats being compared.
# Both values must be strings: call_llm() interpolates them directly into the
# prompt text.
# NOTE(review): indent=2 mirrors the preview this cell originally printed;
# pretty-printing affects the JSON token count — confirm the intended baseline.
employee_json = json.dumps(dataset, indent=2)
employee_toon = encode(dataset)

# Display sample of both formats for comparison
print("JSON Format (first 200 chars):")
print(employee_json[:200])
print("\n" + "="*50 + "\n")
print("TOON Format (first 200 chars):")
print(employee_toon[:200])
print(f"\n\nTOON format type: {type(employee_toon)}")

## Defining LLM Call and Evaluation Functions

In [8]:
import time

def call_llm(data_str, question, data_format="JSON", model_name="gpt-5-nano"):
    """
    Query the chat model with employee data and a question, returning response + metadata.

    Uses the module-level ``client``. Any exception raised while calling the
    API or reading the response is caught and reported in the returned dict
    rather than propagated, so one failed call does not abort a batch.

    Args:
        data_str: Employee data as string (JSON or TOON format)
        question: Question to ask about the data
        data_format: "JSON" or "TOON"; inserted into both prompts as a label
        model_name: LLM model name to use

    Returns:
        Dict with:
        {
            "response": stripped model output text ("" if the message has no
                        content), or "ERROR: <message>" on failure,
            "input_tokens": prompt tokens used (None if unavailable),
            "output_tokens": completion tokens used (None if unavailable),
            "total_tokens": total tokens used (None if unavailable),
            "time_taken_seconds": API call duration
        }
    """
    # System prompt guides the model on how to interpret the data
    system_prompt = f"""You are a helpful assistant that answers questions about employee data.
The data is provided in {data_format} format.
Analyze the data carefully and provide concise, accurate answers.
For numeric answers, provide ONLY the number without any additional text or explanation.
For text answers, provide ONLY the requested information."""

    # User prompt contains the data and question
    user_prompt = f"""Employee Data ({data_format} format):
{data_str}

Question: {question}

Answer:"""

    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments during the call
    start_time = time.perf_counter()
    try:
        response_obj = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        elapsed = time.perf_counter() - start_time

        # A message may carry no text content; treat that as an empty answer
        # so the token usage of the call is still recorded.
        content = response_obj.choices[0].message.content
        response_text = (content or "").strip()

        # Usage fields fall back to None when the attribute is absent
        usage = response_obj.usage
        return {
            "response": response_text,
            "input_tokens": getattr(usage, "prompt_tokens", None),
            "output_tokens": getattr(usage, "completion_tokens", None),
            "total_tokens": getattr(usage, "total_tokens", None),
            "time_taken_seconds": elapsed,
        }

    except Exception as e:
        # Deliberately broad: record the failure and let the caller continue
        elapsed = time.perf_counter() - start_time
        return {
            "response": f"ERROR: {str(e)}",
            "input_tokens": None,
            "output_tokens": None,
            "total_tokens": None,
            "time_taken_seconds": elapsed,
        }

In [9]:
def run_all_queries(employee_json, questions, data_format="JSON", model_name="gpt-5-nano"):
    """
    Runs the LLM on all questions using call_llm() and stores model outputs + metadata.

    Args:
        employee_json: Employee data as a string. Despite the name, this is
            the serialized data in whichever format ``data_format`` names
            (JSON or TOON); it is passed to call_llm() unchanged.
        questions: List of question dictionaries with the keys "id",
            "prompt", "type" and "groundTruth"
        data_format: "JSON" or "TOON"
        model_name: Model to use for queries

    Returns:
        List of dictionaries, one per question, holding the question fields,
        the model response, token counts and call duration.
    """
    # The parameter keeps its original name so existing calls still work;
    # the body uses the format-neutral name.
    data_str = employee_json

    print(f"\nüöÄ Running queries on {len(questions)} questions...")
    print("=" * 60)

    outputs = []

    for i, question in enumerate(questions, 1):
        # Call the LLM for this question
        result = call_llm(data_str, question["prompt"], data_format, model_name=model_name)

        # Store results with metadata
        outputs.append({
            "id": question["id"],
            "question": question["prompt"],
            "type": question["type"],
            "ground_truth": question["groundTruth"],
            "model_response": result["response"],
            "input_tokens": result["input_tokens"],
            "output_tokens": result["output_tokens"],
            "total_tokens": result["total_tokens"],
            "time_taken_seconds": result["time_taken_seconds"],
        })

        # Progress indicator every 10 questions
        if i % 10 == 0:
            print(f"Progress: {i}/{len(questions)} completed...")

    print("\n‚úÖ Querying complete!")
    return outputs

In [10]:
def evaluate_results(results, model_name="gpt-5-nano"):
    """
    Score model outputs against ground truth and aggregate token usage.

    A response counts as correct when it equals the ground truth after
    lower-casing and stripping surrounding whitespace. Each dict in
    ``results`` is updated in place with a boolean "correct" key. A short
    summary is printed.

    Args:
        results: List of question results from run_all_queries(); may be empty
        model_name: Name of the model used for querying (echoed in the summary)

    Returns:
        Dict with model_name, overall_accuracy (percent), correct, total,
        type_stats (per question type: correct / total / accuracy),
        total_input_tokens, total_output_tokens and the same ``results`` list.
    """
    correct = 0
    total = len(results)
    total_input_tokens = 0
    total_output_tokens = 0

    # Evaluate each result by comparing response to ground truth
    for r in results:
        r["correct"] = (
            r["model_response"].lower().strip() ==
            r["ground_truth"].lower().strip()
        )
        if r["correct"]:
            correct += 1

        # Token counts are None for failed calls; count those as 0
        total_input_tokens += r.get("input_tokens", 0) or 0
        total_output_tokens += r.get("output_tokens", 0) or 0

    # An empty result list reports 0% instead of dividing by zero,
    # matching how empty question types are reported below
    accuracy = (correct / total) * 100 if total else 0

    # Compute stats by question type
    type_stats = {}
    for qtype in ["field-retrieval", "aggregation", "filtering"]:
        type_subset = [r for r in results if r["type"] == qtype]
        type_correct = sum(1 for r in type_subset if r["correct"])
        type_total = len(type_subset)
        type_stats[qtype] = {
            "correct": type_correct,
            "total": type_total,
            "accuracy": (type_correct / type_total * 100) if type_total else 0
        }

    # Display summary
    print("\nüìä Evaluation Results")
    print("=" * 60)
    print(f"Overall Accuracy: {correct}/{total} ({accuracy:.2f}%)")
    print(f"Total Input Tokens: {total_input_tokens}")
    print(f"Total Output Tokens: {total_output_tokens}")

    return {
        "model_name": model_name,
        "overall_accuracy": accuracy,
        "correct": correct,
        "total": total,
        "type_stats": type_stats,
        "total_input_tokens": total_input_tokens,
        "total_output_tokens": total_output_tokens,
        "results": results
    }

## Run Evaluation for JSON format

In [11]:
# Run all 90 questions against the JSON format
# One API call is made per question, so this may take a few minutes
# NOTE(review): employee_json is expected to come from an earlier cell — verify it holds the serialized dataset
outputs_json = run_all_queries(employee_json, final_questions, data_format="JSON")
outputs_json

In [12]:
# Convert outputs to DataFrame and save as CSV
import pandas as pd

# Named to parallel outputs_df_toon: the comparison, visualization and
# error-analysis cells look the JSON results up as outputs_df_json.
outputs_df_json = pd.DataFrame(outputs_json)
outputs_df = outputs_df_json  # previous name kept as an alias
outputs_df_json.to_csv('json_format_results.csv', index=False)
print(f"‚úÖ Saved JSON format results to 'json_format_results.csv'")
print(f"   Shape: {outputs_df_json.shape}")
print(f"\nPreview:")
print(outputs_df_json.head())

## Run Evaluation for TOON format


In [13]:
# Run all 90 questions against the TOON format
# One more API call is made per question, so this may take a few minutes
# NOTE(review): employee_toon is expected to come from an earlier cell — verify it holds the TOON-encoded dataset
outputs_toon = run_all_queries(employee_toon, final_questions, data_format="TOON")
outputs_toon

## Saving the Data

In [14]:
# Convert the TOON run's outputs to a DataFrame and save as CSV
import pandas as pd
outputs_df_toon = pd.DataFrame(outputs_toon)
outputs_df_toon.to_csv('toon_format_results.csv', index=False)
# The message names the TOON run; it previously said "JSON" for this file
print(f"‚úÖ Saved TOON format results to 'toon_format_results.csv'")
print(f"   Shape: {outputs_df_toon.shape}")
print(f"\nPreview:")
outputs_df_toon.head()

In [15]:
# Evaluate the TOON format results
# evaluate_results() also adds a boolean "correct" key to each dict in outputs_toon, in place
toon_results = evaluate_results(outputs_toon)

## Compare Results

In [16]:
import pandas as pd

# Price in USD per one million input tokens, used for the Cost/Call column.
# NOTE(review): 0.150 is hard-coded — confirm it matches the pricing of the
# model that was actually queried.
INPUT_PRICE_PER_MILLION_USD = 0.150

# Every metric below reads a boolean 'correct' column. That column only exists
# once evaluate_results() has scored the raw outputs, so score both runs here
# and build the comparison DataFrames from the scored records.
json_results = evaluate_results(outputs_json)
toon_results = evaluate_results(outputs_toon)
outputs_df_json = pd.DataFrame(json_results["results"])
outputs_df_toon = pd.DataFrame(toon_results["results"])


def _accuracy_stats(df, qtype=None):
    """Return (correct, total, percent) for df, optionally limited to one question type."""
    subset = df if qtype is None else df[df['type'] == qtype]
    n_correct = subset['correct'].sum()
    n_total = len(subset)
    pct = (n_correct / n_total) * 100 if n_total > 0 else 0
    return n_correct, n_total, pct


# Create comprehensive comparison of JSON vs TOON formats
print("="*70)
print(" "*20 + "üìä COMPREHENSIVE COMPARISON")
print("="*70)

# ===== CALCULATE TOKEN COUNTS FROM DATAFRAMES =====
# Use average input tokens from the raw data (each row has input_tokens)
json_tokens = int(outputs_df_json['input_tokens'].mean())
toon_tokens = int(outputs_df_toon['input_tokens'].mean())
token_reduction = json_tokens - toon_tokens
reduction_percentage = (token_reduction / json_tokens) * 100

# ===== TOKEN COUNT COMPARISON =====
print("\nüî¢ TOKEN COUNT COMPARISON:")
print("-"*70)
print(f"{'Format':<15} {'Tokens':<15} {'Reduction':<20} {'Cost/Call'}")
print("-"*70)
print(f"{'JSON':<15} {json_tokens:>10,}      {'-':<20} ${(json_tokens/1_000_000)*INPUT_PRICE_PER_MILLION_USD:.6f}")
print(f"{'TOON':<15} {toon_tokens:>10,}      {token_reduction:>6,} ({reduction_percentage:>5.2f}%)     ${(toon_tokens/1_000_000)*INPUT_PRICE_PER_MILLION_USD:.6f}")
print("-"*70)

# ===== CALCULATE ACCURACY FROM DATAFRAMES =====
# Overall accuracy
json_overall_acc = _accuracy_stats(outputs_df_json)[2]
toon_overall_acc = _accuracy_stats(outputs_df_toon)[2]

# Accuracy by question type
json_fr_acc = _accuracy_stats(outputs_df_json, 'field-retrieval')[2]
json_ag_acc = _accuracy_stats(outputs_df_json, 'aggregation')[2]
json_ft_acc = _accuracy_stats(outputs_df_json, 'filtering')[2]

toon_fr_acc = _accuracy_stats(outputs_df_toon, 'field-retrieval')[2]
toon_ag_acc = _accuracy_stats(outputs_df_toon, 'aggregation')[2]
toon_ft_acc = _accuracy_stats(outputs_df_toon, 'filtering')[2]

# ===== ACCURACY COMPARISON =====
print("\nüéØ ACCURACY COMPARISON:")
print("-"*70)
print(f"{'Format':<15} {'Overall':<15} {'Field Retrieval':<20} {'Aggregation':<15} {'Filtering'}")
print("-"*70)

# JSON accuracy metrics
print(f"{'JSON':<15} {json_overall_acc:>6.2f}%        {json_fr_acc:>6.2f}%               {json_ag_acc:>6.2f}%          {json_ft_acc:>6.2f}%")

# TOON accuracy metrics
print(f"{'TOON':<15} {toon_overall_acc:>6.2f}%        {toon_fr_acc:>6.2f}%               {toon_ag_acc:>6.2f}%          {toon_ft_acc:>6.2f}%")

print("-"*70)

# ===== DETAILED STATS BY CATEGORY =====
print("\nüìà DETAILED ACCURACY BY QUESTION TYPE:")
print("-"*70)

comparison_data = []
for qtype, qtype_label in [("field-retrieval", "Field Retrieval"),
                            ("aggregation", "Aggregation"),
                            ("filtering", "Filtering")]:
    json_correct, json_total, json_acc = _accuracy_stats(outputs_df_json, qtype)
    toon_correct, toon_total, toon_acc = _accuracy_stats(outputs_df_toon, qtype)

    comparison_data.append({
        "Question Type": qtype_label,
        "JSON Accuracy": f"{json_correct}/{json_total} ({json_acc:.2f}%)",
        "TOON Accuracy": f"{toon_correct}/{toon_total} ({toon_acc:.2f}%)",
        "Difference": f"{toon_acc - json_acc:+.2f}%"
    })

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))

# ===== KEY FINDINGS SUMMARY =====
print("\n" + "="*70)
print("\nüí° KEY FINDINGS:")
print(f"   ‚Ä¢ Token reduction: {reduction_percentage:.2f}% ({token_reduction:,} tokens saved)")
print(f"   ‚Ä¢ JSON accuracy: {json_overall_acc:.2f}%")
print(f"   ‚Ä¢ TOON accuracy: {toon_overall_acc:.2f}%")
print(f"   ‚Ä¢ Accuracy difference: {toon_overall_acc - json_overall_acc:+.2f}%")
print("="*70)

## Visualize Results:

In [17]:
import matplotlib.pyplot as plt
import numpy as np


def _pct_correct(df, qtype=None):
    """Percentage of rows flagged correct, for one question type or (qtype=None) all rows."""
    rows = df if qtype is None else df[df['type'] == qtype]
    return (rows['correct'].sum() / len(rows)) * 100


# Average prompt size per call for each format, and the relative saving
json_tokens = int(outputs_df_json['input_tokens'].mean())
toon_tokens = int(outputs_df_toon['input_tokens'].mean())
token_reduction = json_tokens - toon_tokens
reduction_percentage = (token_reduction / json_tokens) * 100

# One figure, two panels: token counts on the left, accuracy on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
ax1, ax2 = axes

# ===== LEFT PANEL: TOKEN COUNT COMPARISON =====
formats = ['JSON', 'TOON']
tokens = [json_tokens, toon_tokens]
colors = ['#3498db', '#2ecc71']

bars = ax1.bar(formats, tokens, color=colors, alpha=0.7, edgecolor='black')
ax1.set_ylabel('Token Count', fontsize=12, fontweight='bold')
ax1.set_title('Token Count Comparison\n(Lower is Better)', fontsize=14, fontweight='bold')
ax1.grid(axis='y', alpha=0.3, linestyle='--')

# Write each bar's value just above it
for rect in bars:
    ax1.text(rect.get_x() + rect.get_width()/2., rect.get_height(),
             f'{int(rect.get_height()):,}',
             ha='center', va='bottom', fontsize=11, fontweight='bold')

# Arrow from a label near the JSON bar's top to the top of the TOON bar
reduction_text = f'{reduction_percentage:.1f}% reduction'
ax1.annotate(reduction_text, xy=(1, toon_tokens), xytext=(0.5, json_tokens - 200),
             arrowprops=dict(arrowstyle='->', color='red', lw=2),
             fontsize=11, color='red', fontweight='bold',
             bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.7))

# ===== RIGHT PANEL: ACCURACY COMPARISON BY TYPE =====
question_types = ['Field\nRetrieval', 'Aggregation', 'Filtering', 'Overall']

# Same order as question_types; None selects all rows for the 'Overall' group
type_keys = ['field-retrieval', 'aggregation', 'filtering', None]
json_accuracies = [_pct_correct(outputs_df_json, key) for key in type_keys]
toon_accuracies = [_pct_correct(outputs_df_toon, key) for key in type_keys]
json_fr_acc, json_ag_acc, json_ft_acc, json_overall_acc = json_accuracies
toon_fr_acc, toon_ag_acc, toon_ft_acc, toon_overall_acc = toon_accuracies

# Grouped bars: JSON shifted left of each tick, TOON shifted right
x = np.arange(len(question_types))
width = 0.35

bars1 = ax2.bar(x - width/2, json_accuracies, width, label='JSON',
                color='#3498db', alpha=0.7, edgecolor='black')
bars2 = ax2.bar(x + width/2, toon_accuracies, width, label='TOON',
                color='#2ecc71', alpha=0.7, edgecolor='black')

ax2.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
ax2.set_title('Accuracy Comparison by Question Type\n(Higher is Better)',
              fontsize=14, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(question_types, fontsize=10)
ax2.legend(fontsize=11)
ax2.grid(axis='y', alpha=0.3, linestyle='--')
ax2.set_ylim(0, 105)

# Write each bar's percentage just above it
for group in (bars1, bars2):
    for rect in group:
        ax2.text(rect.get_x() + rect.get_width()/2., rect.get_height(),
                 f'{rect.get_height():.1f}%',
                 ha='center', va='bottom', fontsize=9, fontweight='bold')

# Save and display
plt.tight_layout()
plt.savefig('json_vs_toon_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("üìä Visualization saved as 'json_vs_toon_comparison.png'")

## Error Analysis

In [18]:
# Rows flagged incorrect in each run, as plain dicts
json_errors = outputs_df_json[~outputs_df_json['correct']].to_dict('records')
toon_errors = outputs_df_toon[~outputs_df_toon['correct']].to_dict('records')


def _print_error_samples(header, errors, limit=5):
    """Print a header, a rule, then up to `limit` mis-answered questions."""
    print(header)
    print("-"*70)
    for n, err in enumerate(errors[:limit], 1):
        print(f"\n{n}. Question: {err['question']}")
        print(f"   Type: {err['type']}")
        print(f"   Expected: {err['ground_truth']}")
        print(f"   Got: {err['model_response']}")


print(f"üîç ERROR ANALYSIS:")
print("="*70)
print(f"\nJSON Format: {len(json_errors)} errors out of {len(outputs_df_json)} questions")
print(f"TOON Format: {len(toon_errors)} errors out of {len(outputs_df_toon)} questions")

# ===== SAMPLE ERRORS PER FORMAT =====
_print_error_samples("\nüìù Sample Errors from JSON Format (first 5):", json_errors)
_print_error_samples("\n\nüìù Sample Errors from TOON Format (first 5):", toon_errors)

# ===== DIFFERENTIAL ERRORS =====
# Compare the failing question IDs to see which failures are format-specific
json_error_ids = {err['id'] for err in json_errors}
toon_error_ids = {err['id'] for err in toon_errors}

only_json_errors = json_error_ids - toon_error_ids
only_toon_errors = toon_error_ids - json_error_ids
both_errors = json_error_ids & toon_error_ids

print(f"\n\nüîÑ DIFFERENTIAL ERRORS:")
print("="*70)
print(f"Questions that failed ONLY with JSON: {len(only_json_errors)}")
print(f"Questions that failed ONLY with TOON: {len(only_toon_errors)}")
print(f"Questions that failed with BOTH: {len(both_errors)}")