# Model Performance Benchmark

This notebook benchmarks several LLMs as scoring backends for the Italian exercise reward function.

**Models tested:**
- gpt-4o-mini
- gpt-3.5-turbo-0125
- gpt-3.5-turbo
- gpt-4.1-mini

**Metrics:**
- Inference time per model
- Scoring breakdown per test case

In [3]:
# Install dependencies if running in Colab
#
# Environment bootstrap cell. When the `google.colab` module is already loaded
# (i.e. the kernel is a Colab runtime) it installs the Python packages, downloads
# the Italian spaCy model, mounts Google Drive, switches into the project folder
# and exports the OpenAI API key. Outside Colab none of that runs.
import sys

# Colab preloads `google.colab`, so its presence in sys.modules identifies the runtime.
if 'google.colab' in sys.modules:
    print("Running in Google Colab. Installing dependencies...")
    # `!` lines are IPython shell escapes; they only work inside a notebook kernel.
    !pip install -q spacy transformers torch sentence-transformers openai httpx json5
    !python -m spacy download it_core_news_sm

    # Mount Google Drive to access the files
    from google.colab import drive
    drive.mount('/content/drive')

    # Set the working directory to your project
    # chdir makes relative paths resolve against the project folder; the
    # sys.path entry makes modules under that folder importable.
    import os
    os.chdir('/content/drive/My Drive/Colab Notebooks/italian_teacher')
    sys.path.insert(0, '/content/drive/My Drive/Colab Notebooks/italian_teacher')

    # Set OpenAI API key
    # Copies the value stored under 'OPENAI_API_KEY' in Colab user data into the
    # process environment. NOTE(review): presumably the OpenAI client reads it
    # from there - confirm against the scorer implementation.
    from google.colab import userdata
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

# Printed on every run, including outside Colab where no setup was performed.
print("✅ Setup complete")

Running in Google Colab. Installing dependencies...
Collecting it-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.8.0/it_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Setup complete


In [None]:
import asyncio
import json
import time
import random
from pathlib import Path
from typing import Dict, List

# Import the reward function
from src.rl.reward_function.reward_function_modular import ExerciseRewardFunction
from src.rl.reward_function.scorers.base_llm_scorer import BaseLLMScorer

print("✅ Imports successful")

In [None]:
# Load validation requests and sample up to 3 of them
validation_path = Path("src/rl/validation_requests.json")

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform default, which would mis-decode non-ASCII text on some systems.
with open(validation_path, 'r', encoding='utf-8') as f:
    all_requests = json.load(f)

# random.sample raises ValueError when asked for more items than are available,
# so cap the sample size at the number of loaded requests.
sample_size = min(3, len(all_requests))
sample_requests = random.sample(all_requests, sample_size)

print(f"Loaded {len(all_requests)} validation requests")
print(f"\nSelected {len(sample_requests)} sample requests:")
for i, req in enumerate(sample_requests, 1):
    # Each request is expected to carry 'level', 'grammar_focus' and 'topic' keys.
    print(f"  {i}. Level: {req['level']}, Grammar: {req['grammar_focus']}, Topic: {req['topic']}")

In [None]:
# Model identifiers to benchmark, in the order they will be run
models_to_test = [
    "gpt-4o-mini",
    "gpt-3.5-turbo-0125",
    "gpt-3.5-turbo",
    "gpt-4.1-mini",
]

print("Models to test: " + repr(models_to_test))

In [None]:
# Create mock exercises for testing
# In a real scenario, you would generate these using your model
def create_mock_exercises(request: Dict) -> List[Dict]:
    """
    Create mock exercises based on the request.
    Replace this with actual model generation in production.

    Keys read from ``request`` (all optional):
        num_exercises: how many exercises to build (default 3).
        topic: word interpolated into every question (default "general").
        exercise_types: exercise type names, cycled through in order
            (default ["fill_in_blank"]). A missing, ``None`` or empty value
            falls back to the default instead of raising.

    Returns:
        A list of ``num_exercises`` dicts, each with "type", "question" and
        "correct_answer" keys; "multiple_choice" exercises also carry
        "options". Unrecognised type names produce a "fill_in_blank" exercise.
    """
    num_exercises = request.get("num_exercises", 3)
    topic = request.get("topic", "general")
    # `or` covers an explicitly empty/None list, which would otherwise make the
    # modulo below divide by zero (or fail on len(None)).
    exercise_types = request.get("exercise_types") or ["fill_in_blank"]

    exercises = []
    for i in range(num_exercises):
        # Cycle through the requested types so each one is represented.
        ex_type = exercise_types[i % len(exercise_types)]

        if ex_type == "translation":
            exercise = {
                "type": "translation",
                "question": f"Translate to Italian: The {topic} is beautiful.",
                "correct_answer": f"Il {topic} è bello.",
            }
        elif ex_type == "multiple_choice":
            exercise = {
                "type": "multiple_choice",
                "question": f"Quale parola completa la frase? 'Il {topic} ___ interessante.'",
                "correct_answer": "è",
                "options": ["è", "sei", "siamo", "sono"],
            }
        else:
            # "fill_in_blank" and any unrecognised type share this template.
            exercise = {
                "type": "fill_in_blank",
                "question": f"La {topic} ___ molto bella. (essere)",
                "correct_answer": "è",
            }

        exercises.append(exercise)

    return exercises

print("✅ Mock exercise generator ready")

In [None]:
async def benchmark_model(model_name: str, reward_fn: ExerciseRewardFunction, request: Dict, exercises: List[Dict]):
    """
    Benchmark a single model configuration.

    Sets ``model_name`` as the model override on ``BaseLLMScorer``, scores
    ``exercises`` against ``request`` with ``reward_fn.score_exercises`` and
    times the call. The override is always cleared afterwards.

    Args:
        model_name: Model identifier passed to ``BaseLLMScorer.set_model_override``.
        reward_fn: Reward function whose ``score_exercises`` coroutine is awaited.
        request: The exercise request the exercises are scored against.
        exercises: The exercises to score.

    Returns:
        A dict with keys "model", "avg_score", "elapsed_time" (seconds),
        "breakdown", "all_results", "success" and "error". On failure the
        exception is not re-raised: "success" is False, "error" holds the
        exception text, "avg_score" is 0.0 and "all_results" is empty.
    """
    # Set the model override for all scorers
    # NOTE(review): the override is set on the class, so it is presumably shared
    # by every scorer; running benchmarks concurrently would likely interfere - confirm.
    BaseLLMScorer.set_model_override(model_name)

    # Time the scoring with perf_counter: it is monotonic and high-resolution,
    # so the measurement is not skewed by system clock adjustments.
    start_time = time.perf_counter()

    try:
        avg_score, results = await reward_fn.score_exercises(exercises, request)

        elapsed_time = time.perf_counter() - start_time

        # Get the breakdown from the first exercise
        # (assumes each result is a pair whose second item is the breakdown - TODO confirm)
        first_breakdown = results[0][1] if results else None

        return {
            "model": model_name,
            "avg_score": avg_score,
            "elapsed_time": elapsed_time,
            "breakdown": str(first_breakdown) if first_breakdown else "N/A",
            "all_results": results,
            "success": True,
            "error": None
        }
    except Exception as e:
        # Deliberately broad: one failing model must not abort the whole
        # benchmark, so the failure is reported in the result dict instead.
        elapsed_time = time.perf_counter() - start_time
        return {
            "model": model_name,
            "avg_score": 0.0,
            "elapsed_time": elapsed_time,
            "breakdown": "Error",
            "all_results": [],
            "success": False,
            "error": str(e)
        }
    finally:
        # Clear the model override
        BaseLLMScorer.clear_model_override()

print("✅ Benchmark function ready")

In [None]:
# Initialize the reward function
print("Initializing reward function...")
# Collect the constructor settings first so each choice is documented in one place.
_reward_fn_kwargs = {
    "spacy_model": "it_core_news_sm",
    "device": "cpu",              # CPU keeps the notebook runnable on Colab
    "disabled_scorers": [],       # nothing disabled: every scorer takes part
    "fluency_use_llm": False,     # fluency scoring skips the LLM to speed up testing
    "concurrency_limit": 20,
}
reward_fn = ExerciseRewardFunction(**_reward_fn_kwargs)
print("✅ Reward function initialized")

In [None]:
# Run the benchmark
print("="*80)
print("STARTING BENCHMARK")
print("="*80)

all_results = []

for test_idx, request in enumerate(sample_requests, 1):
    print(f"\n{'='*80}")
    print(f"TEST CASE {test_idx}")
    print(f"{'='*80}")
    print(f"Level: {request['level']}")
    print(f"Grammar Focus: {request['grammar_focus']}")
    print(f"Topic: {request['topic']}")
    print(f"Number of Exercises: {request.get('num_exercises', 3)}")
    print(f"Exercise Types: {request.get('exercise_types', [])}")

    # Create mock exercises
    exercises = create_mock_exercises(request)
    print(f"\nGenerated {len(exercises)} mock exercises")

    # Print out the exercises
    print(f"\n{'─'*80}")
    print("EXERCISES:")
    print(f"{'─'*80}")
    for i, ex in enumerate(exercises, 1):
        print(f"\nExercise {i}:")
        print(f"  Type: {ex['type']}")
        print(f"  Question: {ex['question']}")
        print(f"  Correct Answer: {ex['correct_answer']}")
        if 'options' in ex:
            print(f"  Options: {ex['options']}")
    print(f"{'─'*80}\n")

    test_results = {
        "request": request,
        "exercises": exercises,
        "model_results": []
    }

    # Test each model
    for model_name in models_to_test:
        print(f"\n{'-'*80}")
        print(f"Testing model: {model_name}")
        print(f"{'-'*80}")

        result = await benchmark_model(model_name, reward_fn, request, exercises)
        test_results["model_results"].append(result)

        if result["success"]:
            print(f"\n✅ Model: {model_name}")
            print(f"   Elapsed Time: {result['elapsed_time']:.2f}s")
            print(f"   Average Score: {result['avg_score']:.2f}/100")
            print(f"\n   Reward Breakdown (First Exercise):")
            # Indent each line of the breakdown
            for line in result['breakdown'].split('\n'):
                print(f"   {line}")
        else:
            print(f"\n❌ Model: {model_name}")
            print(f"   Elapsed Time: {result['elapsed_time']:.2f}s")
            print(f"   Error: {result['error']}")

    all_results.append(test_results)

print(f"\n{'='*80}")
print("BENCHMARK COMPLETE")
print(f"{'='*80}")

In [None]:
# Summary statistics: per-model averages over the successful runs, plus error counts
print(f"\n{'=' * 80}")
print("SUMMARY STATISTICS")
print("=" * 80)

# Start every model with empty timing/score lists and a zero error count
model_stats = {}
for model in models_to_test:
    model_stats[model] = {"times": [], "scores": [], "errors": 0}

# Fold each per-test, per-model result into its model's bucket
for test_result in all_results:
    for model_result in test_result["model_results"]:
        model = model_result["model"]
        if not model_result["success"]:
            model_stats[model]["errors"] += 1
            continue
        model_stats[model]["times"].append(model_result["elapsed_time"])
        model_stats[model]["scores"].append(model_result["avg_score"])

print("\n{:<25} {:<15} {:<15} {:<10}".format("Model", "Avg Time (s)", "Avg Score", "Errors"))
print("-" * 70)

for model in models_to_test:
    stats = model_stats[model]
    times, scores = stats["times"], stats["scores"]
    # A model with no successful run reports 0 rather than dividing by zero
    avg_time = sum(times) / len(times) if times else 0
    avg_score = sum(scores) / len(scores) if scores else 0
    errors = stats["errors"]

    print("{:<25} {:<15.2f} {:<15.2f} {:<10}".format(model, avg_time, avg_score, errors))

print(f"\n{'=' * 80}")

In [None]:
# Visualize timing comparison
import matplotlib.pyplot as plt
import numpy as np

# Only models with at least one successful run are plotted
model_names = [model for model in models_to_test if model_stats[model]["times"]]
avg_times = [
    sum(model_stats[name]["times"]) / len(model_stats[name]["times"])
    for name in model_names
]
avg_scores = [
    sum(model_stats[name]["scores"]) / len(model_stats[name]["scores"])
    for name in model_names
]


def _draw_model_bars(ax, values, color, ylabel, title, label_gap, label_suffix, ylim=None):
    """Draw one labelled bar chart of per-model values on the given axes."""
    positions = range(len(model_names))
    ax.bar(positions, values, color=color)
    ax.set_xlabel('Model')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(model_names, rotation=45, ha='right')
    if ylim is not None:
        ax.set_ylim(ylim)

    # Value label just above each bar
    for index, value in enumerate(values):
        ax.text(index, value + label_gap, f'{value:.2f}{label_suffix}', ha='center', va='bottom')


# Two side-by-side panels: timing on the left, score on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

_draw_model_bars(
    ax1,
    avg_times,
    color='skyblue',
    ylabel='Average Time (seconds)',
    title='Average Inference Time per Model',
    label_gap=0.1,
    label_suffix='s',
)
_draw_model_bars(
    ax2,
    avg_scores,
    color='lightcoral',
    ylabel='Average Score (out of 100)',
    title='Average Score per Model',
    label_gap=1,
    label_suffix='',
    ylim=[0, 100],
)

plt.tight_layout()
plt.show()

print("✅ Visualization complete")

## Notes

This notebook provides a baseline benchmark for comparing different LLM models.

**Key findings:**
- Inference time varies by model
- Consistent scores across models indicate that the scoring is stable
- Mock exercises are used for testing; replace with actual model generation for production benchmarks

**Configuration:**
- Model configurations are defined in `src/rl/reward_function/scorers/base_llm_scorer.py`
- All scorers now default to gpt-4.1-mini, falling back to gpt-4o-mini and then to the gpt-3.5-turbo models