In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Navigate to project: make the project folder the working directory.
# The stress-test cell further down derives its `src` path from Path.cwd(),
# so it depends on this %cd having been executed first.
%cd /content/drive/MyDrive/Colab\ Notebooks/italian_teacher

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/italian_teacher


In [2]:
# Install dependencies
!pip install -q transformers trl accelerate peft datasets spacy sentence-transformers bitsandbytes json5 openai tqdm nest_asyncio
!python -m spacy download it_core_news_sm

Collecting it-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.8.0/it_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Import the scorer classes.
# The relative form (`from .scorers ...`) only works when this code runs as
# part of a package. A notebook cell has no parent package, so here it raises
# ImportError and the absolute `src.` import below is the one that takes effect.
try:
    from .scorers import (
        CEFRScorer,
        CoherenceScorer,
        ExerciseQualityScorer,
        FluencyScorer,
        GrammarScorer,
        JSONScorer,
        LinguisticScorer,
        TopicScorer,
    )
except ImportError:
    # Fallback: absolute import (presumably resolved relative to the project
    # directory selected by the earlier %cd — confirm).
    # NOTE(review): the stress-test cell below imports the reward function as
    # `rl....` (with src/ on sys.path) while this cell uses `src.rl....`.
    # Python treats those as two distinct modules, so verify that the
    # GrammarScorer patched later is the same class object the reward
    # function actually instantiates.
    from src.rl.reward_function.scorers import (
        CEFRScorer,
        CoherenceScorer,
        ExerciseQualityScorer,
        FluencyScorer,
        GrammarScorer,
        JSONScorer,
        LinguisticScorer,
        TopicScorer,
    )

In [None]:
import os
from getpass import getpass

# You can enable/disable OpenAI here:
USE_OPENAI = True  # Set to False for faster training without OpenAI

if USE_OPENAI:
    # A missing variable and an empty one both mean "no key yet".
    if not os.environ.get("OPENAI_API_KEY"):
        # Prompt without echo so the key never lands in the notebook source
        # or in a saved cell output.
        os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ").strip()
    # Fail here, with a clear message, rather than announcing "enabled" and
    # letting a later cell trip over an empty key.
    if not os.environ["OPENAI_API_KEY"]:
        raise ValueError(
            "OPENAI_API_KEY is empty - provide a key or set USE_OPENAI = False"
        )
    print("✅ OpenAI API enabled - Professional quality with async batching")
    print("   OPTIMIZED: Samples 1 exercise/completion (70% reduction in API calls)")
    print("   Expected training time: ~2-3 hours")
else:
    # Remove any key from the environment when OpenAI is switched off.
    os.environ.pop("OPENAI_API_KEY", None)
    print("✅ OpenAI API disabled - Fast rule-based rewards")
    print("   Expected training time: ~60-90 min")

✅ OpenAI API enabled - Professional quality with async batching
   OPTIMIZED: Samples 1 exercise/completion (70% reduction in API calls)
   Expected training time: ~2-3 hours


In [5]:
# Stress Test Notebook for Async Reward Function

import asyncio
import os
import time
import sys
from pathlib import Path
import torch
import traceback
import httpx
import random

# --- 1. SETUP ---
# The notebook is expected to run from the root of the 'italian_teacher'
# project; its 'src' folder is put at the front of sys.path so the `rl`
# package can be imported. Adjust if the notebook lives somewhere else.
project_root = Path.cwd()
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
print(f"✅ Added '{src_path}' to Python path")

try:
    from rl.multi_reward_async import AsyncMultiReward
    from rl.reward_function.reward_function_modular import ExerciseRewardFunction
except ImportError as e:
    print("\n❌ FAILED TO IMPORT PROJECT MODULES.")
    print("Please ensure that your 'src' directory is accessible from this notebook.")
    print(f"Error: {e}")
    # Re-raise so the rest of the cell does not run without the imports
    raise
else:
    print("✅ Successfully imported reward function components.")

# The stress test needs an OpenAI key; abort when none (or an empty one) is set
if os.environ.get("OPENAI_API_KEY"):
    print("✅ Found OPENAI_API_KEY.")
else:
    print("\n❌ ERROR: OPENAI_API_KEY environment variable not set.")
    print("The stress test cannot run without an API key.")
    raise ValueError("OPENAI_API_KEY not set")


# --- 2. MOCK DATA ---
# Mock data that stands in for the trainer's output.
NUM_SAMPLES = 64  # A good number to stress the API

print(f"\nGenerating {NUM_SAMPLES} mock data samples...")

# Mock completions: the same well-formed JSON exercise list repeated
# NUM_SAMPLES times. Strings are immutable, so sharing one object is safe.
mock_completions = [
    """
[
  {"type": "fill_in_blank", "question": "Ieri, io ___ (andare) al cinema.", "correct_answer": "sono andato"},
  {"type": "translation", "question": "The cat is on the table.", "correct_answer": "Il gatto è sul tavolo."}
]
    """
] * NUM_SAMPLES

# Mock requests that correspond to the completions.
# Built with a comprehension so every sample gets its OWN dict: `[d] * n`
# would alias a single dict n times, and any in-place change made while
# scoring one sample would silently show up in all the others.
mock_requests = [
    {"topic": "Daily Life", "level": "A2", "grammar_focus": "past_tense"}
    for _ in range(NUM_SAMPLES)
]

print("✅ Mock data ready.")


# --- 3. CORE TEST FUNCTION ---

# Thin subclass of the reward function that tallies timeouts instead of only printing them
class StressTestReward(AsyncMultiReward):
    """AsyncMultiReward variant that keeps a running count of timeout errors."""

    def __init__(self, *args, **kwargs):
        """Pass every argument through to AsyncMultiReward and zero the timeout tally."""
        super().__init__(*args, **kwargs)
        self.timeout_errors = 0

    async def _check_coherence_async(self, *args, **kwargs):
        """Delegate to the parent's coherence check.

        When the parent call raises asyncio.TimeoutError or
        httpx.TimeoutException, the timeout counter is incremented and 0.0 is
        returned as a penalty score; any other exception propagates.
        """
        try:
            score = await super()._check_coherence_async(*args, **kwargs)
        except (asyncio.TimeoutError, httpx.TimeoutException):
            self.timeout_errors += 1
            score = 0.0  # penalty score on timeout
        return score

# We also need to modify the grammar scorer to count errors.
# This is a bit more involved due to the class structure, so we patch the
# method on the class.
#
# Re-running this cell must not wrap the wrapper: if the method currently
# installed is already our patch, recover the real original from the marker
# attribute it carries instead of capturing the patch itself.
# NOTE(review): this patches the GrammarScorer class imported in an earlier
# cell; confirm it is the same class object the reward function instantiates.
original_check_grammar_with_llm = getattr(
    GrammarScorer._check_grammar_with_llm,
    "_stress_test_original",
    GrammarScorer._check_grammar_with_llm,
)


async def patched_check_grammar_with_llm(self, *args, **kwargs):
    """Call the original LLM grammar check and count timeouts.

    On asyncio.TimeoutError / httpx.TimeoutException the timeout counter of
    the reward function attached to this scorer (if any) is incremented, a
    warning is printed, and the fallback tuple ``(5.0, [message])`` is
    returned. Any other exception propagates unchanged.
    """
    try:
        return await original_check_grammar_with_llm(self, *args, **kwargs)
    except (asyncio.TimeoutError, httpx.TimeoutException) as e:
        # This is a bit of a hack to access the reward function's error counter.
        # In a real app, you'd use a more robust logging/metrics system.
        if hasattr(self, '_reward_function_instance_for_test'):
            self._reward_function_instance_for_test.timeout_errors += 1
        print(f"  ⚠️ (Patched) LLM grammar check timed out: {e}")
        return 5.0, [f"LLM grammar check timed out: {e}"]


# Marker used above to find the true original when the cell is executed again.
patched_check_grammar_with_llm._stress_test_original = original_check_grammar_with_llm
GrammarScorer._check_grammar_with_llm = patched_check_grammar_with_llm


async def run_stress_test(concurrency_limit: int, client_timeout: int):
    """
    Runs a single stress test with a given concurrency limit and timeout.

    Args:
        concurrency_limit: Passed to the reward function as `openai_batch_size`.
        client_timeout: Passed to the reward function as `openai_timeout`
            (printed as seconds).

    Returns:
        A tuple ``(error_count, elapsed_seconds)``. ``error_count`` is the
        number of timeouts tallied on the reward function; when the batch
        itself raised, it is at least 1 so that callers never mistake a
        crashed run for a stable one.
    """
    # Declared up front: the shared base reward function is created lazily
    # below and reused by later calls.
    global base_reward_fn

    print("-" * 80)
    print(f"🚀 Starting Test: Concurrency = {concurrency_limit}, Client Timeout = {client_timeout}s")
    print("-" * 80)

    # Instantiate the base reward function (loads models, etc.)
    # We only do this once to be efficient
    if 'base_reward_fn' not in globals():
        print("Instantiating ExerciseRewardFunction (one-time setup)...")
        base_reward_fn = ExerciseRewardFunction()
        print("✅ Base reward function ready.")

    # Instantiate our test reward function
    reward_func = StressTestReward(
        reward_fn=base_reward_fn,
        use_openai=True,
        openai_batch_size=concurrency_limit,
        openai_timeout=client_timeout
    )

    # This is the hack to allow the patched grammar scorer to count errors
    base_reward_fn.scorers['grammar']._reward_function_instance_for_test = reward_func

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments during a long run.
    start_time = time.perf_counter()
    try:
        # We call `_process_batch` directly to test the core async logic
        rewards = await reward_func._process_batch(mock_completions, mock_requests)
        elapsed = time.perf_counter() - start_time
        error_count = reward_func.timeout_errors

        print("\n--- Test Results ---")
        print("✅ Test completed successfully.")
        print(f"⏱️  Total Time: {elapsed:.2f} seconds")
        print(f"📦 Samples Processed: {len(rewards)}")
        print(f"❌ Timeout Errors: {error_count}")

        avg_time_per_sample = elapsed / len(rewards) if rewards else 0
        print(f"⚙️  Avg Time/Sample: {avg_time_per_sample:.2f} seconds")

        if error_count > 0:
            print(f"\n⚠️ WARNING: Encountered {error_count} timeout errors. The concurrency limit of {concurrency_limit} is likely too high.")
        else:
            print(f"\n👍 SUCCESS: No timeout errors. Concurrency limit of {concurrency_limit} appears stable.")

    except Exception:
        elapsed = time.perf_counter() - start_time
        print(f"\n❌ TEST FAILED after {elapsed:.2f} seconds.")
        print("An unexpected error occurred during the test:")
        traceback.print_exc()
        # A crashed run must not look like a clean one: report at least one
        # error so the caller does not recommend this concurrency level.
        error_count = max(reward_func.timeout_errors, 1)

    return error_count, elapsed


# --- 4. RUN THE EXPERIMENTS ---

async def main(concurrency_levels=(4, 16, 32), client_timeout=60):
    """Run the stress test at each concurrency level and print a recommendation.

    Args:
        concurrency_levels: Concurrency limits to try. They are de-duplicated
            and tested in ascending order, because the analysis treats the
            first level with errors as the breaking point.
        client_timeout: Client timeout forwarded to every `run_stress_test` call.

    Raises:
        ValueError: If `concurrency_levels` is empty.
    """
    levels = sorted(set(concurrency_levels))
    if not levels:
        raise ValueError("concurrency_levels must contain at least one level")

    print("\n" + "="*80)
    print("🔬 RUNNING STRESS TEST SUITE")
    print("="*80)

    # --- Test 2: Finding the Breaking Point ---
    print("\n\n--- TEST 2: FINDING THE BREAKING POINT ---")
    print("We will now increase concurrency to find your account's stable limit.")
    results = {}
    for level in levels:
        errors, elapsed = await run_stress_test(concurrency_limit=level, client_timeout=client_timeout)
        results[level] = (errors, elapsed)

    # --- Final Analysis ---
    print("\n" + "="*80)
    print("📊 FINAL ANALYSIS & RECOMMENDATION")
    print("="*80)

    print("Concurrency Level | Timeout Errors | Total Time (s)")
    print("------------------|----------------|---------------")
    for level, (errors, elapsed) in results.items():
        print(f"{level:<17} | {errors:<14} | {elapsed:<14.2f}")

    # Highest level reached before the first one that reported errors
    # (dicts preserve insertion order, i.e. ascending levels).
    stable_limit = 0
    for level, (errors, _elapsed) in results.items():
        if errors != 0:
            # The first level with errors is the breaking point
            break
        stable_limit = level

    if stable_limit > 0:
        print(f"\n✅ Recommendation: Your maximum stable concurrency limit appears to be around {stable_limit}.")
        print(f"   Set `openai_batch_size = {stable_limit}` in your training script for the best balance of speed and stability.")
    else:
        # Report the level that was actually tested instead of a hard-coded number.
        print(f"\n⚠️ Warning: Timeouts occurred even at the lowest tested concurrency level ({levels[0]}).")
        print("   This may indicate a very strict rate limit on your OpenAI account or a network issue.")
        print("   Try setting `openai_batch_size = 2` in your training script.")

# Run the main async function
if __name__ == "__main__":
    # nest_asyncio lets asyncio.run() be called while the notebook's own
    # event loop is already running.
    try:
        import nest_asyncio
    except ImportError:
        # Say so explicitly instead of failing later with an opaque RuntimeError.
        print("\n⚠️ nest_asyncio is not installed; asyncio.run() will fail if an event loop is already running (e.g. in a notebook).")
    else:
        nest_asyncio.apply()
        print("\nApplied nest_asyncio patch for notebook compatibility.")

    asyncio.run(main())



✅ Added '/content/drive/MyDrive/Colab Notebooks/italian_teacher/src' to Python path
✅ Successfully imported reward function components.
✅ Found OPENAI_API_KEY.

Generating 64 mock data samples...
✅ Mock data ready.

Applied nest_asyncio patch for notebook compatibility.

🔬 RUNNING STRESS TEST SUITE


--- TEST 2: FINDING THE BREAKING POINT ---
We will now increase concurrency to find your account's stable limit.
--------------------------------------------------------------------------------
🚀 Starting Test: Concurrency = 4, Client Timeout = 60s
--------------------------------------------------------------------------------
Instantiating ExerciseRewardFunction (one-time setup)...
Loading spaCy model: it_core_news_sm...
✅ spaCy model loaded
Reward function will use device: cpu
Initializing scorers...
Pre-loading CEFR vocabulary (16,887 words)...
✅ Loaded 16887 Italian words from vocabulary list
✅ Loaded vocabulary for all CEFR levels
  ✅ OpenAI validation enabled for B2+ levels
  ✅ LLM 

                                                    

.. Populating work queue with OpenAI tasks (coherence, grammar)...
⏳ Step 2/3: Processing 192 OpenAI tasks with 4 parallel workers...




⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...





🎯 Multi-Reward (Async OpenAI, batch=4, 78.9s):
   Grammar   : min=0.583, max=0.583, avg=0.583 (weight=2.0)
   Coherence : min=1.000, max=1.000, avg=1.000 (weight=2.5)
   Topic     : min=0.150, max=0.150, avg=0.150 (weight=1.5)
   Quality   : min=0.870, max=0.870, avg=0.870 (weight=1.0)
   Diversity : min=1.000, max=1.000, avg=1.000 (weight=0.5)
   TOTAL     : min=5.262, max=5.262, avg=5.262

--- Test Results ---
✅ Test completed successfully.
⏱️  Total Time: 78.95 seconds
📦 Samples Processed: 64
❌ Timeout Errors: 0
⚙️  Avg Time/Sample: 1.23 seconds

👍 SUCCESS: No timeout errors. Concurrency limit of 4 appears stable.
--------------------------------------------------------------------------------
🚀 Starting Test: Concurrency = 16, Client Timeout = 60s
--------------------------------------------------------------------------------

⏳ Step 1/3: Parsing 64 JSON completions...




.. Populating work queue with OpenAI tasks (coherence, grammar)...
⏳ Step 2/3: Processing 192 OpenAI tasks with 16 parallel workers...
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...





🎯 Multi-Reward (Async OpenAI, batch=16, 56.9s):
   Grammar   : min=0.583, max=0.583, avg=0.583 (weight=2.0)
   Coherence : min=1.000, max=1.000, avg=1.000 (weight=2.5)
   Topic     : min=0.150, max=0.150, avg=0.150 (weight=1.5)
   Quality   : min=0.870, max=0.870, avg=0.870 (weight=1.0)
   Diversity : min=1.000, max=1.000, avg=1.000 (weight=0.5)
   TOTAL     : min=5.262, max=5.262, avg=5.262

--- Test Results ---
✅ Test completed successfully.
⏱️  Total Time: 56.89 seconds
📦 Samples Processed: 64
❌ Timeout Errors: 0
⚙️  Avg Time/Sample: 0.89 seconds

👍 SUCCESS: No timeout errors. Concurrency limit of 16 appears stable.
--------------------------------------------------------------------------------
🚀 Starting Test: Concurrency = 32, Client Timeout = 60s
--------------------------------------------------------------------------------

⏳ Step 1/3: Parsing 64 JSON completions...




.. Populating work queue with OpenAI tasks (coherence, grammar)...
⏳ Step 2/3: Processing 192 OpenAI tasks with 32 parallel workers...
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                                    


🎯 Multi-Reward (Async OpenAI, batch=32, 58.3s):
   Grammar   : min=0.583, max=0.583, avg=0.583 (weight=2.0)
   Coherence : min=1.000, max=1.000, avg=1.000 (weight=2.5)
   Topic     : min=0.150, max=0.150, avg=0.150 (weight=1.5)
   Quality   : min=0.870, max=0.870, avg=0.870 (weight=1.0)
   Diversity : min=1.000, max=1.000, avg=1.000 (weight=0.5)
   TOTAL     : min=5.262, max=5.262, avg=5.262

--- Test Results ---
✅ Test completed successfully.
⏱️  Total Time: 58.33 seconds
📦 Samples Processed: 64
❌ Timeout Errors: 0
⚙️  Avg Time/Sample: 0.91 seconds

👍 SUCCESS: No timeout errors. Concurrency limit of 32 appears stable.

📊 FINAL ANALYSIS & RECOMMENDATION
Concurrency Level | Timeout Errors | Total Time (s)
------------------|----------------|---------------
4                 | 0              | 78.95         
16                | 0              | 56.89         
32                | 0              | 58.33         

✅ Recommendation: Your maximum stable concurrency limit appears to be around

