In [1]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Navigate to project: make the project folder the working directory.
# A later cell uses Path.cwd() as the project root when it adds `src` to sys.path,
# so this %cd must run before that cell.
%cd /content/drive/MyDrive/Colab\ Notebooks/italian_teacher

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/italian_teacher


In [2]:
# Install dependencies
!pip install -q transformers trl accelerate peft datasets spacy sentence-transformers bitsandbytes json5 openai tqdm nest_asyncio
!python -m spacy download it_core_news_sm

Collecting it-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.8.0/it_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# ==============================================================================
# CELL 1: SETUP AND IMPORTS
# ==============================================================================
import asyncio
import os
import time
import sys
from pathlib import Path
import torch
import traceback
import httpx
import random
import json

print("--- 1. SETTING UP ENVIRONMENT ---")

# --- Make the project's `src` directory importable ---
# The working directory is taken as the root of the 'italian_teacher' project,
# so the notebook must be started (or %cd'ed) there before this cell runs.
try:
    project_root = Path.cwd()
    src_path = project_root / "src"
    # Register `src` at the front of sys.path exactly once, so re-running the
    # cell does not pile up duplicate entries.
    if sys.path.count(str(src_path)) == 0:
        sys.path.insert(0, str(src_path))
    print(f"✅ Added '{src_path}' to Python path")

    # Reward function plus the individual scorers used by the stress test.
    from rl.reward_function.reward_function_modular import ExerciseRewardFunction
    from rl.reward_function.scorers.base_llm_scorer import BaseLLMScorer
    from rl.reward_function.scorers.grammar_scorer import GrammarScorer
    from rl.reward_function.scorers.cefr_scorer import CEFRScorer
    from rl.reward_function.scorers.coherence_scorer import CoherenceScorer
    print("✅ Successfully imported reward function components.")

except ImportError as e:
    # Explain the most likely cause, then re-raise so the cell still fails loudly.
    print(
        "\n❌ FAILED TO IMPORT PROJECT MODULES.",
        "Please ensure that your 'src' directory is accessible from this notebook.",
        f"Error: {e}",
        sep="\n",
    )
    raise

--- 1. SETTING UP ENVIRONMENT ---
✅ Added '/content/drive/MyDrive/Colab Notebooks/italian_teacher/src' to Python path
✅ Successfully imported reward function components.


In [4]:
import os
from getpass import getpass

# You can enable/disable OpenAI here:
USE_OPENAI = True  # Set to False for faster training without OpenAI
# NOTE(review): no visible cell of this notebook reads USE_OPENAI — confirm
# whether the flag is still needed here.

# Read the key without echoing it. Pasted keys often carry stray whitespace or
# a trailing newline, which would otherwise end up in the exported value.
OPENAI_API_KEY = getpass("Enter your OpenAI API key: ").strip()
if not OPENAI_API_KEY:
    # Fail here rather than exporting an empty key and hitting confusing
    # authentication errors in the middle of the stress test.
    raise ValueError("No OpenAI API key was entered; re-run this cell and paste a valid key.")

# Export the key through the environment for the code that runs later.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

Enter your OpenAI API key: ··········


In [5]:


# ==============================================================================
# CELL 2: STRESS TEST CLASSES AND MOCK DATA
# ==============================================================================

print("\n--- 2. DEFINING STRESS TEST COMPONENTS ---")

class TestState:
    """Shared counter for timeout errors observed during one stress-test run.

    Updates go through an ``asyncio.Lock``, so the counter is guarded against
    interleaving between coroutines on one event loop (this is an asyncio
    lock, not a threading lock).
    """

    def __init__(self):
        # Guards every update of `timeout_errors`.
        self.lock = asyncio.Lock()
        # Number of timeouts recorded so far.
        self.timeout_errors = 0

    async def increment_errors(self):
        """Record one more timeout while holding the lock."""
        await self.lock.acquire()
        try:
            self.timeout_errors = self.timeout_errors + 1
        finally:
            self.lock.release()

def create_stress_test_scorer(ScorerClass, test_state):
    """A factory to create StressTest versions of our LLM scorers.

    Args:
        ScorerClass: Scorer class to extend. The generated subclass calls its
            async ``score_batch(exercises, request, semaphore)`` and reads
            its ``name`` attribute.
        test_state: Shared object exposing an awaitable ``increment_errors()``;
            it is called once for every batch that times out.

    Returns:
        The generated ``StressTestScorer`` class (a subclass of
        ``ScorerClass``), not an instance.
    """
    import contextlib  # local import: only needed for the stdout redirection below

    class StressTestScorer(ScorerClass):
        """``ScorerClass`` variant that counts timeouts instead of raising them."""

        def __init__(self, *args, **kwargs):
            # Pass the original args to the parent, but hide its init message.
            # The context managers close the devnull handle and restore
            # sys.stdout even when the parent constructor raises.
            with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
                super().__init__(*args, **kwargs)
            self.test_state = test_state

        async def score_batch(self, exercises, request, semaphore=None):
            """Delegate to the real ``score_batch``; on timeout, count it and
            return one placeholder ``(5.0, [message])`` entry per exercise."""
            try:
                # Call the original, real score_batch method
                return await super().score_batch(exercises, request, semaphore)
            except (httpx.TimeoutException, asyncio.TimeoutError):
                # NOTE(review): if the scorers go through the OpenAI SDK, its
                # timeout error (openai.APITimeoutError) is not an
                # httpx.TimeoutException and would not be counted here —
                # confirm which exception the scorers actually raise.
                await self.test_state.increment_errors()
                print(f"  🔥 TIMEOUT DETECTED in {self.name} scorer.")
                # Build a fresh tuple/list per exercise so that mutating one
                # entry's feedback list cannot leak into the others.
                return [(5.0, [f"{self.name} timed out"]) for _ in range(len(exercises))]

    return StressTestScorer

# --- Mock Data Generation ---
# How many times the mock batch is scored per scorer during a test run.
NUM_SAMPLES = 64  # A good number to stress the API
# Number of exercises contained in the mock batch defined below.
EXERCISES_PER_SAMPLE = 3
print(f"Generating {NUM_SAMPLES} mock data samples ({EXERCISES_PER_SAMPLE} exercises each)...")

# One batch of exercises, one entry per exercise type.
mock_exercises_batch = [
    {
        "type": "fill_in_blank",
        "question": "Ieri, io ___ (andare) al cinema.",
        "correct_answer": "sono andato",
    },
    {
        "type": "translation",
        "question": "The cat is on the table.",
        "correct_answer": "Il gatto è sul tavolo.",
    },
    {
        "type": "multiple_choice",
        "question": "Noi ___ la pizza.",
        "correct_answer": "mangiamo",
        "options": ["mangi", "mangia", "mangiamo", "mangiano"],
    },
]

# The request passed alongside the batch on every scorer call.
mock_request = {
    "topic": "Daily Life",
    "level": "A2",
    "grammar_focus": "past_tense and present_tense",
}

print("✅ Mock data ready.")


--- 2. DEFINING STRESS TEST COMPONENTS ---
Generating 64 mock data samples (3 exercises each)...
✅ Mock data ready.


In [6]:
# ==============================================================================
# CELL 3: CORE TEST FUNCTION (Corrected)
# ==============================================================================

print("\n--- 3. DEFINING CORE TEST LOGIC ---")

async def run_stress_test(concurrency_limit: int, client_timeout: int):
    """
    Runs a single stress test with a given concurrency limit and timeout.

    For each of NUM_SAMPLES samples, one ``score_batch`` call per LLM scorer
    (grammar, cefr, coherence) is scheduled on the mock batch; all calls share
    one semaphore of size ``concurrency_limit`` and are awaited together.

    Args:
        concurrency_limit: Size of the asyncio.Semaphore handed to every
            ``score_batch`` call.
        client_timeout: Timeout in seconds shown in the test banner.
            NOTE(review): this function only prints the value; it is not
            passed to the scorers or to any client here — confirm where the
            real client timeout is configured.

    Returns:
        Tuple ``(timeout_errors, elapsed_seconds)``. ``timeout_errors`` counts
        only the timeouts recorded by the stress-test scorers; other failures
        are printed but not included in this number.
    """
    print("-" * 80)
    print(f"🚀 Starting Test: Concurrency = {concurrency_limit}, Client Timeout = {client_timeout}s")
    print("-" * 80)

    # This object will be shared across all scorer instances to count errors
    test_state = TestState()

    # Create a temporary reward function instance for this test run.
    reward_fn = ExerciseRewardFunction()

    # --- Create and Inject Test Scorers ---
    # Replace the real LLM scorers with test versions that count timeouts.
    llm_scorers = (
        ('grammar', GrammarScorer),
        ('cefr', CEFRScorer),
        ('coherence', CoherenceScorer),
    )
    for scorer_key, scorer_class in llm_scorers:
        reward_fn.scorers[scorer_key] = create_stress_test_scorer(scorer_class, test_state)()

    # The semaphore is passed to every score_batch call to cap concurrency.
    semaphore = asyncio.Semaphore(concurrency_limit)

    # perf_counter is monotonic, so the measurement is unaffected by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    try:
        # One task per (sample, scorer) pair, in the order grammar, cefr,
        # coherence for each sample.
        io_tasks = [
            reward_fn.scorers[scorer_key].score_batch(mock_exercises_batch, mock_request, semaphore)
            for _ in range(NUM_SAMPLES)
            for scorer_key, _scorer_class in llm_scorers
        ]

        # Run all tasks concurrently; failures come back as result values.
        results = await asyncio.gather(*io_tasks, return_exceptions=True)
        elapsed = time.perf_counter() - start_time

        # Anything still an exception here was not a handled timeout.
        # BaseException is used so that cancelled tasks are reported too.
        unexpected_errors = [result for result in results if isinstance(result, BaseException)]
        for error in unexpected_errors:
            print(f"  🚨 Unexpected error during gather: {error}")
            traceback.print_exception(type(error), error, error.__traceback__)

        print("\n--- Test Results ---")
        print("✅ Test completed.")
        print(f"⏱️  Total Time: {elapsed:.2f} seconds")
        print(f"📦 Batches Processed: {NUM_SAMPLES}")
        print(f"🔥 Total API Calls Simulated: {len(io_tasks)}")
        print(f"❌ Timeout Errors Detected: {test_state.timeout_errors}")
        print(f"🚨 Unexpected (Non-Timeout) Errors: {len(unexpected_errors)}")

        avg_time_per_batch = elapsed / NUM_SAMPLES if NUM_SAMPLES else 0
        print(f"⚙️  Avg Time/Sample Batch: {avg_time_per_batch:.2f} seconds")

        if test_state.timeout_errors > 0:
            print(f"\n⚠️ WARNING: Encountered {test_state.timeout_errors} timeout errors. The concurrency limit of {concurrency_limit} is likely too high.")
        elif unexpected_errors:
            # Do not claim success when calls failed for other reasons.
            print(f"\n⚠️ WARNING: No timeouts, but {len(unexpected_errors)} call(s) failed with other errors (see above). Concurrency limit of {concurrency_limit} cannot be considered stable.")
        else:
            print(f"\n👍 SUCCESS: No timeout errors. Concurrency limit of {concurrency_limit} appears stable.")

    except Exception:
        elapsed = time.perf_counter() - start_time
        print(f"\n❌ TEST FAILED after {elapsed:.2f} seconds.")
        print("An unexpected error occurred during the test:")
        traceback.print_exc()

    return test_state.timeout_errors, elapsed



--- 3. DEFINING CORE TEST LOGIC ---


In [7]:


async def main():
    """Run the stress test at several concurrency levels and print a summary.

    Each level is tested in turn with a short pause in between. The summary
    lists timeout count and total time per level, then recommends the highest
    level reached before the first level that reported timeouts.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("🔬 RUNNING STRESS TEST SUITE")
    print(rule)

    # Concurrency levels to probe, in increasing order.
    concurrency_levels = [5, 10, 20, 40]
    results = {}

    for level in concurrency_levels:
        errors, elapsed = await run_stress_test(concurrency_limit=level, client_timeout=60)
        results[level] = (errors, elapsed)
        # Short pause between runs so network/API hiccups can settle.
        await asyncio.sleep(2)

    # --- Final Analysis ---
    print("\n" + rule)
    print("📊 FINAL ANALYSIS & RECOMMENDATION")
    print(rule)

    print("Concurrency Level | Timeout Errors | Total Time (s)")
    print("------------------|----------------|---------------")
    for level, (errors, elapsed) in results.items():
        print(f"{level:<17} | {errors:<14} | {elapsed:<14.2f}")

    # Walk the levels in tested order; stop at the first one with errors.
    stable_limit = 0
    for level, (errors, elapsed) in results.items():
        if errors != 0:
            break
        stable_limit = level

    if stable_limit <= 0:
        print("\n⚠️ Warning: Timeouts occurred even at the lowest tested concurrency level.")
        print("   This may indicate a very strict rate limit on your OpenAI account or a network issue.")
        print("   Try setting `openai_batch_size` to a lower value like 2 or 4 in your training script.")
        return

    print(f"\n✅ Recommendation: Your maximum stable concurrency limit appears to be around {stable_limit}.")
    print(f"   Set `openai_batch_size = {stable_limit}` in your training script for the best balance of speed and stability.")

# --- Run the main async function ---
# Using nest_asyncio to allow running asyncio.run() in a notebook, where an
# event loop is already running.
try:
    import nest_asyncio
except ImportError:
    # Say so explicitly instead of skipping silently: without the patch,
    # asyncio.run() raises RuntimeError when a loop is already running.
    print("\n⚠️ nest_asyncio is not installed; asyncio.run() will fail if an event loop is already running.")
else:
    nest_asyncio.apply()
    print("\nApplied nest_asyncio patch for notebook compatibility.")

# This is the entry point that kicks off the entire test suite.
asyncio.run(main())



Applied nest_asyncio patch for notebook compatibility.

🔬 RUNNING STRESS TEST SUITE
--------------------------------------------------------------------------------
🚀 Starting Test: Concurrency = 5, Client Timeout = 60s
--------------------------------------------------------------------------------
Loading spaCy model: it_core_news_sm...
✅ spaCy model loaded
Reward function will use device: cpu
Initializing scorers...
  ✅ LLM scoring enabled for cefr_alignment (batch size: 10)
  ✅ LLM fluency checking enabled (OpenAI API)
  ✅ LLM scoring enabled for grammar_correctness (batch size: 10)
  ✅ LLM scoring enabled for coherence (batch size: 10)
Loading sentence transformer for topic similarity...
✅ Sentence transformer loaded in cpu
  ✅ LLM topic checking enabled (OpenAI API)
✅ Reward function initialized with 8 professional scorers (including coherence)

--- Test Results ---
✅ Test completed.
⏱️  Total Time: 89.98 seconds
📦 Batches Processed: 64
🔥 Total API Calls Simulated: 192
❌ Timeout