In [None]:
# Mount Google Drive
# The project is read from Drive, so the mount has to happen before the
# directory change below.
from google.colab import drive
drive.mount('/content/drive')

# Navigate to project
# `%cd` is an IPython magic that changes the kernel's working directory. The
# setup cell further down builds its `src` path from Path.cwd(), so this cell
# must run first.
%cd /content/drive/MyDrive/Colab\ Notebooks/italian_teacher

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/italian_teacher


In [None]:
# Install dependencies
!pip install -q transformers trl accelerate peft datasets spacy sentence-transformers bitsandbytes json5 google-generativeai tqdm nest_asyncio
!python -m spacy download it_core_news_sm

Collecting it-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.8.0/it_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# ==============================================================================
# CELL 1: SETUP AND IMPORTS
# ==============================================================================
import asyncio
import os
import time
import sys
from pathlib import Path
import torch
import traceback
import random
import json

print("--- 1. SETTING UP ENVIRONMENT ---")

# --- Make the project's `src` directory importable ---
# The working directory is taken as the project root; `src` is pushed to the
# front of sys.path only when it is not already listed.
project_root = Path.cwd()
src_path = project_root / "src"
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))
print(f"✅ Added '{src_path}' to Python path")

# --- Import the reward-function components used by the stress tests ---
try:
    from rl.reward_function.reward_function_modular import ExerciseRewardFunction
    from rl.reward_function.scorers.base_llm_scorer import BaseLLMScorer
    from rl.reward_function.scorers.grammar_scorer import GrammarScorer
    from rl.reward_function.scorers.cefr_scorer import CEFRScorer
    from rl.reward_function.scorers.coherence_scorer import CoherenceScorer
except ImportError as import_error:
    # Explain the likely cause, then re-raise so the notebook stops here.
    print(
        "\n❌ FAILED TO IMPORT PROJECT MODULES.",
        "Please ensure that your 'src' directory is accessible from this notebook.",
        f"Error: {import_error}",
        sep="\n",
    )
    raise
else:
    print("✅ Successfully imported reward function components.")

--- 1. SETTING UP ENVIRONMENT ---
✅ Added '/content/drive/MyDrive/Colab Notebooks/italian_teacher/src' to Python path
✅ Successfully imported reward function components.


In [None]:
import os
from getpass import getpass

# Configure Gemini API
USE_GEMINI = True

# Resolve the key in order of preference: Colab secret first, then an
# interactive prompt.
GOOGLE_API_KEY = None
try:
    from google.colab import userdata
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
except Exception as secret_error:
    # Best-effort lookup (e.g. not running on Colab). Say why it was skipped
    # instead of failing silently, then fall through to the prompt.
    print(f"ℹ️ Colab secret lookup skipped ({type(secret_error).__name__}); falling back to prompt")

if GOOGLE_API_KEY:
    # Only claim success when the lookup actually produced a value.
    print("✅ Found GOOGLE_API_KEY in Colab secrets")
else:
    GOOGLE_API_KEY = getpass("Enter your Google API key: ")

# Pasted keys often carry stray whitespace; an empty key would only surface
# later as a confusing API error, so stop here instead.
GOOGLE_API_KEY = (GOOGLE_API_KEY or "").strip()
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY is empty; provide a key via Colab secrets or the prompt.")

os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
print("✅ Google API key configured")

Enter your Google API key: ··········
✅ Google API key configured


In [None]:
# ==============================================================================
# CELL 2: STRESS TEST CLASSES AND MOCK DATA
# ==============================================================================

print("\n--- 2. DEFINING STRESS TEST COMPONENTS ---")

class TestState:
    """Outcome counters shared by all scorer calls of one stress-test run.

    Each increment is performed while holding ``self.lock``, an
    ``asyncio.Lock``; it coordinates coroutines on an event loop and is not a
    threading lock.

    Attributes (all start at 0):
        timeout_errors: calls that ended in a timeout.
        error_503: calls classified as 503 / service unavailable.
        error_429: calls classified as rate limited.
        error_other: calls that failed for any other reason.
        success_count: calls that completed normally.
    """

    def __init__(self):
        self.timeout_errors = 0
        self.error_503 = 0
        self.error_429 = 0
        self.error_other = 0
        self.success_count = 0
        self.lock = asyncio.Lock()

    async def _bump(self, counter_name):
        """Add one to the counter attribute called ``counter_name`` under the lock."""
        async with self.lock:
            setattr(self, counter_name, getattr(self, counter_name) + 1)

    async def increment_timeout(self):
        """Record one timed-out call."""
        await self._bump("timeout_errors")

    async def increment_503(self):
        """Record one 503 failure."""
        await self._bump("error_503")

    async def increment_429(self):
        """Record one rate-limit failure."""
        await self._bump("error_429")

    async def increment_other(self):
        """Record one failure of any other kind."""
        await self._bump("error_other")

    async def increment_success(self):
        """Record one successful call."""
        await self._bump("success_count")

def create_stress_test_scorer(ScorerClass, test_state):
    """Build a subclass of ``ScorerClass`` that tallies call outcomes in ``test_state``.

    Args:
        ScorerClass: Scorer class to wrap. The wrapper relies on it providing an
            awaitable ``score_batch(exercises, request, semaphore)`` and a
            ``name`` attribute.
        test_state: Object exposing the awaitable ``increment_success``,
            ``increment_timeout``, ``increment_503``, ``increment_429`` and
            ``increment_other`` methods.

    Returns:
        The ``StressTestScorer`` class (not an instance). Its ``score_batch``
        never raises ``Exception``: on failure it returns one
        ``(5.0, [message])`` placeholder per exercise.
    """
    import contextlib  # local import keeps this definition self-contained

    def _fallback_scores(count, message):
        # Build a fresh issues list per exercise so entries are not aliases of
        # one shared list (which `[...] * count` would produce).
        return [(5.0, [message]) for _ in range(count)]

    class StressTestScorer(ScorerClass):
        def __init__(self, *args, **kwargs):
            # Silence the init message. The context managers close the devnull
            # handle and restore sys.stdout even when the parent constructor
            # raises.
            with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
                super().__init__(*args, **kwargs)
            self.test_state = test_state

        async def score_batch(self, exercises, request, semaphore=None):
            """Delegate to the wrapped scorer and classify the outcome."""
            try:
                result = await super().score_batch(exercises, request, semaphore)
                await self.test_state.increment_success()
                return result
            except asyncio.TimeoutError:
                await self.test_state.increment_timeout()
                print(f"  🔥 TIMEOUT in {self.name} scorer")
                return _fallback_scores(len(exercises), f"{self.name} timed out")
            except Exception as e:
                # Classification is text based: it looks for status codes or
                # phrases in the exception message.
                error_msg = str(e).lower()
                if "503" in error_msg or "service unavailable" in error_msg:
                    await self.test_state.increment_503()
                    print(f"  🔥 503 ERROR in {self.name} scorer: {str(e)[:100]}")
                elif "429" in error_msg or "rate limit" in error_msg or "quota" in error_msg:
                    await self.test_state.increment_429()
                    print(f"  🔥 RATE LIMIT in {self.name} scorer")
                else:
                    await self.test_state.increment_other()
                    print(f"  🔥 ERROR in {self.name} scorer: {str(e)[:100]}")
                return _fallback_scores(len(exercises), f"{self.name} failed")

    return StressTestScorer

# --- Mock Data Generation ---
# NUM_SAMPLES is how many times the same exercise batch is submitted per
# scorer; EXERCISES_PER_SAMPLE is only used in the progress message below.
NUM_SAMPLES = 32  # Start smaller for Gemini testing
EXERCISES_PER_SAMPLE = 3
print(f"Generating {NUM_SAMPLES} mock data samples ({EXERCISES_PER_SAMPLE} exercises each)...")

# One exercise of each type; the same list is reused for every sample.
mock_exercises_batch = [
    {
        "type": "fill_in_blank",
        "question": "Ieri, io ___ (andare) al cinema.",
        "correct_answer": "sono andato",
    },
    {
        "type": "translation",
        "question": "The cat is on the table.",
        "correct_answer": "Il gatto è sul tavolo.",
    },
    {
        "type": "multiple_choice",
        "question": "Noi ___ la pizza.",
        "correct_answer": "mangiamo",
        "options": ["mangi", "mangia", "mangiamo", "mangiano"],
    },
]

# Request metadata passed alongside the exercises to every scorer call.
mock_request = {
    "topic": "Daily Life",
    "level": "A2",
    "grammar_focus": "past_tense and present_tense",
}

print("✅ Mock data ready.")


--- 2. DEFINING STRESS TEST COMPONENTS ---
Generating 32 mock data samples (3 exercises each)...
✅ Mock data ready.


In [None]:
# ==============================================================================
# CELL 3: CORE TEST FUNCTION
# ==============================================================================

print("\n--- 3. DEFINING CORE TEST LOGIC ---")

async def run_stress_test(concurrency_limit: int, timeout: int, test_name: str = ""):
    """
    Runs a single stress test with a given concurrency limit and timeout.

    Args:
        concurrency_limit: Size of the ``asyncio.Semaphore`` handed to every
            ``score_batch`` call.
        timeout: Timeout in seconds as shown in the banner. NOTE(review): this
            value is only printed; it is neither passed to the scorers nor
            enforced here, so any timeout observed comes from the scorers'
            own configuration.
        test_name: Label printed in the banner.

    Returns:
        Tuple ``(test_state, elapsed)``: the ``TestState`` holding the outcome
        counters and the wall-clock duration in seconds.
    """
    print("-" * 80)
    print(f"🚀 Starting Test: {test_name}")
    print(f"   Concurrency = {concurrency_limit}, Timeout = {timeout}s")
    print("-" * 80)

    test_state = TestState()

    # Create a temporary reward function instance
    reward_fn = ExerciseRewardFunction()

    # Create and inject test scorers (dict order fixes the per-sample call order)
    scorer_classes = {
        'grammar': GrammarScorer,
        'cefr': CEFRScorer,
        'coherence': CoherenceScorer,
    }
    for scorer_key, scorer_class in scorer_classes.items():
        reward_fn.scorers[scorer_key] = create_stress_test_scorer(scorer_class, test_state)()

    semaphore = asyncio.Semaphore(concurrency_limit)

    start_time = time.time()
    try:
        # One call per scorer per sample, all sharing the same semaphore.
        io_tasks = [
            reward_fn.scorers[scorer_key].score_batch(mock_exercises_batch, mock_request, semaphore)
            for _ in range(NUM_SAMPLES)
            for scorer_key in scorer_classes
        ]

        results = await asyncio.gather(*io_tasks, return_exceptions=True)
        elapsed = time.time() - start_time

        # Check for unexpected exceptions. BaseException is used so that
        # cancellations returned by gather are reported as well.
        unexpected_errors = [result for result in results if isinstance(result, BaseException)]
        for unexpected in unexpected_errors:
            print(f"  🚨 Unexpected error during gather: {unexpected}")

        print("\n--- Test Results ---")
        print("✅ Test completed.")
        print(f"⏱️  Total Time: {elapsed:.2f} seconds")
        print(f"📦 Batches Processed: {NUM_SAMPLES}")
        print(f"🔥 Total API Calls: {len(io_tasks)}")
        print("\n--- Error Breakdown ---")
        print(f"✅ Successful calls: {test_state.success_count}")
        print(f"⏱️  Timeout errors: {test_state.timeout_errors}")
        print(f"🚫 503 (Service Unavailable): {test_state.error_503}")
        print(f"⚠️  429 (Rate Limit): {test_state.error_429}")
        print(f"❌ Other errors: {test_state.error_other}")

        avg_time_per_batch = elapsed / NUM_SAMPLES if NUM_SAMPLES else 0
        print(f"\n⚙️  Avg Time/Sample Batch: {avg_time_per_batch:.2f} seconds")

        # Errors that escaped the scorers count too; otherwise a run with
        # such failures would still be reported as error free.
        total_errors = (
            test_state.timeout_errors
            + test_state.error_503
            + test_state.error_429
            + test_state.error_other
            + len(unexpected_errors)
        )
        if total_errors > 0:
            print(f"\n⚠️ WARNING: Encountered {total_errors} total errors.")
            if test_state.error_503 > 0:
                print(f"   🚫 {test_state.error_503} 503 errors - Gemini API may be overloaded or having issues")
            if test_state.error_429 > 0:
                print(f"   ⚠️  {test_state.error_429} rate limit errors - reduce concurrency")
            if test_state.timeout_errors > 0:
                print(f"   ⏱️  {test_state.timeout_errors} timeout errors - increase timeout or reduce batch size")
            if unexpected_errors:
                print(f"   🚨 {len(unexpected_errors)} unexpected errors escaped the scorers - see messages above")
        else:
            print("\n👍 SUCCESS: No errors. Configuration appears stable.")

    except Exception:
        elapsed = time.time() - start_time
        print(f"\n❌ TEST FAILED after {elapsed:.2f} seconds.")
        print("An unexpected error occurred during the test:")
        traceback.print_exc()

    return test_state, elapsed


--- 3. DEFINING CORE TEST LOGIC ---


In [None]:
# ==============================================================================
# CELL 4: BASIC GEMINI CONNECTION TEST
# ==============================================================================

print(
    "\n" + "=" * 80,
    "🧪 TEST 1: BASIC GEMINI API CONNECTION",
    "=" * 80,
    "Testing if Gemini API is accessible and working...\n",
    sep="\n",
)

import google.generativeai as genai

# The key is read from the environment variable set by the API-key cell.
genai.configure(api_key=os.environ.get('GOOGLE_API_KEY'))

# Test with a simple prompt (no structured output)
try:
    model = genai.GenerativeModel('gemini-2.0-flash-exp')
    response = model.generate_content("Say 'Hello, Gemini is working!' in Italian.")
    print("✅ Basic API test successful!")
    print(f"Response: {response.text}")
except Exception as api_error:
    # Any failure here is reported, not raised, so later cells can still run.
    print(
        f"❌ Basic API test failed: {api_error}",
        "\nThis suggests a fundamental issue with your API key or Gemini access.",
        "Please verify your API key and billing setup.",
        sep="\n",
    )


🧪 TEST 1: BASIC GEMINI API CONNECTION
Testing if Gemini API is accessible and working...

✅ Basic API test successful!
Response: The most natural way to say "Hello, Gemini is working!" in Italian would be:

**"Ciao, Gemini funziona!"**

Here's a breakdown:

*   **Ciao:** Hello (informal)
*   **Gemini:** Gemini (the AI model)
*   **Funziona:** works, is working

Alternatively, you could say:

*   **"Ciao, Gemini sta funzionando!"** (Ciao, Gemini is functioning!) - This is more literal, but sounds a bit more formal. "Sta funzionando" is the present continuous of the verb "funzionare".



In [None]:
# ==============================================================================
# CELL 5: STRUCTURED OUTPUT TEST
# ==============================================================================

print(
    "\n" + "=" * 80,
    "🧪 TEST 2: GEMINI STRUCTURED OUTPUT (JSON SCHEMA)",
    "=" * 80,
    "Testing if Gemini can handle our JSON schema for scoring...\n",
    sep="\n",
)

import json

# This is similar to what our scorer uses:
# an object with a required "scores" array of {score: number, issue: string}.
test_schema = {
    "type": "object",
    "properties": {
        "scores": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {"score": {"type": "number"}, "issue": {"type": "string"}},
                "required": ["score", "issue"],
            },
        },
    },
    "required": ["scores"],
}

try:
    # temperature=0 together with a JSON mime type and the schema above.
    structured_config = genai.GenerationConfig(
        response_mime_type="application/json",
        response_schema=test_schema,
        temperature=0,
    )
    model = genai.GenerativeModel(
        model_name='gemini-2.0-flash-exp',
        generation_config=structured_config,
    )

    prompt = (
        "Score these 2 Italian exercises (0-10):\n"
        "1. \"Io sono\" - Basic present tense\n"
        "2. \"Loro mangiano pizza\" - Correct verb conjugation\n"
        "\n"
        "Return a JSON with a 'scores' array containing score and issue for each."
    )

    response = model.generate_content(prompt)
    print("✅ Structured output test successful!")
    print(f"Response: {response.text}")

    # Verify it's valid JSON
    parsed = json.loads(response.text)
    print(f"\n✅ Valid JSON with {len(parsed.get('scores', []))} scores")

except Exception as schema_error:
    print(
        f"❌ Structured output test failed: {schema_error}",
        "\nThis suggests an issue with our JSON schema format.",
        sep="\n",
    )
    traceback.print_exc()


🧪 TEST 2: GEMINI STRUCTURED OUTPUT (JSON SCHEMA)
Testing if Gemini can handle our JSON schema for scoring...

✅ Structured output test successful!
Response: {
  "scores": [
    {
      "score": 9,
      "issue": "Basic present tense, but fundamental."
    },
    {
      "score": 8,
      "issue": "Correct verb conjugation and simple sentence structure."
    }
  ]
}

✅ Valid JSON with 2 scores


In [None]:
# ==============================================================================
# CELL 6: BATCH SIZE TEST
# ==============================================================================

print(
    "\n" + "=" * 80,
    "🧪 TEST 3: BATCH SIZE TOLERANCE",
    "=" * 80,
    "Testing different batch sizes to see if large batches cause 503 errors...\n",
    sep="\n",
)

batch_sizes = [1, 3, 5, 10]
# batch size -> (succeeded, seconds taken, number of scores returned)
batch_results = {}

for batch_size in batch_sizes:
    print(f"\nTesting batch size: {batch_size}")

    # Create a batch of exercises and pre-join them for the prompt
    exercises = [f"Exercise {i}: Io ___ (essere) italiano." for i in range(batch_size)]
    exercise_lines = "\n".join(exercises)

    # Rebuilt on every iteration so each request gets its own schema object.
    test_schema = {
        "type": "object",
        "properties": {
            "scores": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {"score": {"type": "number"}, "issue": {"type": "string"}},
                    "required": ["score", "issue"],
                },
            },
        },
        "required": ["scores"],
    }

    try:
        batch_config = genai.GenerationConfig(
            response_mime_type="application/json",
            response_schema=test_schema,
            temperature=0,
        )
        model = genai.GenerativeModel(
            model_name='gemini-2.0-flash-exp',
            generation_config=batch_config,
        )

        prompt = (
            f"Score these {batch_size} Italian exercises (0-10 each):\n"
            f"{exercise_lines}\n"
            "\n"
            f"Return a JSON with exactly {batch_size} scores."
        )

        # Only the API round trip is timed.
        start = time.time()
        response = model.generate_content(prompt)
        elapsed = time.time() - start

        parsed = json.loads(response.text)
        num_scores = len(parsed.get('scores', []))

        print(f"  ✅ Batch size {batch_size}: {elapsed:.2f}s, returned {num_scores} scores")
        batch_results[batch_size] = (True, elapsed, num_scores)

    except Exception as e:
        # 503s get a short fixed label; anything else shows the first 80 chars.
        error_msg = str(e)
        failure_detail = "503 ERROR" if "503" in error_msg else str(e)[:80]
        print(f"  ❌ Batch size {batch_size}: {failure_detail}")
        batch_results[batch_size] = (False, 0, 0)

    # Small delay between tests
    time.sleep(1)

print(
    "\n--- Batch Size Summary ---",
    "Batch Size | Success | Time (s) | Scores Returned",
    "-----------|---------|----------|----------------",
    sep="\n",
)
for size in batch_results:
    success, elapsed, num_scores = batch_results[size]
    if success:
        status = "✅"
    else:
        status = "❌"
    print(f"{size:<10} | {status:<7} | {elapsed:<8.2f} | {num_scores}")


🧪 TEST 3: BATCH SIZE TOLERANCE
Testing different batch sizes to see if large batches cause 503 errors...


Testing batch size: 1
  ✅ Batch size 1: 1.58s, returned 1 scores

Testing batch size: 3
  ✅ Batch size 3: 1.78s, returned 3 scores

Testing batch size: 5
  ✅ Batch size 5: 2.16s, returned 5 scores

Testing batch size: 10
  ✅ Batch size 10: 2.79s, returned 10 scores

--- Batch Size Summary ---
Batch Size | Success | Time (s) | Scores Returned
-----------|---------|----------|----------------
1          | ✅       | 1.58     | 1
3          | ✅       | 1.78     | 3
5          | ✅       | 2.16     | 5
10         | ✅       | 2.79     | 10


In [None]:
# ==============================================================================
# CELL 7: CONCURRENCY STRESS TEST
# ==============================================================================

print("\n" + "="*80)
print("🧪 TEST 4: CONCURRENCY STRESS TEST")
print("="*80)
print("Testing different concurrency levels to find optimal settings...\n")

# Apply nest_asyncio for notebook compatibility
# This cell later calls asyncio.run(), which cannot be used while another event
# loop is already running in the same thread (as is typically the case in a
# notebook kernel) unless nest_asyncio has patched the loop.
try:
    import nest_asyncio
except ImportError:
    # Keep going (plain Python scripts do not need the patch), but say so: a
    # later "event loop is already running" error is then easy to trace back.
    print("⚠️ nest_asyncio is not installed; asyncio.run() may fail inside an already running event loop.\n")
else:
    nest_asyncio.apply()
    print("Applied nest_asyncio patch for notebook compatibility.\n")

async def main():
    """Run each stress-test configuration in turn and print a comparison.

    For every ``(concurrency, timeout, label)`` entry this awaits
    ``run_stress_test``, waits three seconds, and finally prints a summary
    table plus a recommendation. The recommended configuration is the first
    one with the highest success rate (ties keep the earlier entry), and it is
    only recommended when that rate exceeds 80%.
    """
    # Test different concurrency levels: (concurrency, timeout seconds, label)
    test_configs = [
        (5, 30, "Low Concurrency (5, 30s timeout)"),
        (10, 30, "Medium Concurrency (10, 30s timeout)"),
        (20, 30, "High Concurrency (20, 30s timeout)"),
        (10, 60, "Medium Concurrency with Longer Timeout (10, 60s)"),
    ]

    all_results = []

    for concurrency, timeout, test_name in test_configs:
        test_state, elapsed = await run_stress_test(concurrency, timeout, test_name)
        all_results.append((test_name, concurrency, timeout, test_state, elapsed))
        # Delay between tests
        await asyncio.sleep(3)

    # --- Final Analysis ---
    print("\n" + "="*80)
    print("📊 FINAL ANALYSIS & RECOMMENDATION")
    print("="*80)

    # Size the first column to the longest label (never narrower than the
    # original 46 characters) so long test names do not push the row out of line.
    name_width = max([46] + [len(result[0]) for result in all_results])
    print(f"\n{'Test Name':<{name_width}} | Success | 503 | 429 | Timeout | Time")
    print("-" * (name_width + 1) + "|---------|-----|-----|---------|------")
    for test_name, concurrency, timeout, test_state, elapsed in all_results:
        print(f"{test_name:<{name_width}} | {test_state.success_count:<7} | {test_state.error_503:<3} | {test_state.error_429:<3} | {test_state.timeout_errors:<7} | {elapsed:.1f}s")

    # Find best configuration
    best_config = None
    best_success_rate = 0

    for test_name, concurrency, timeout, test_state, elapsed in all_results:
        total_calls = NUM_SAMPLES * 3  # 3 scorers
        success_rate = test_state.success_count / total_calls if total_calls > 0 else 0

        # Strict comparison: on equal rates the earlier configuration wins.
        if success_rate > best_success_rate:
            best_success_rate = success_rate
            best_config = (test_name, concurrency, timeout)

    if best_config and best_success_rate > 0.8:
        print(f"\n✅ Recommendation: {best_config[0]}")
        print(f"   Set concurrency_limit = {best_config[1]} in your training script.")
        print(f"   Success rate: {best_success_rate*100:.1f}%")
    else:
        print("\n⚠️ WARNING: All tests had significant errors.")
        print("   This suggests a fundamental issue with Gemini API access.")
        print("   Possible causes:")
        print("   - API key doesn't have proper permissions")
        print("   - Billing not set up correctly")
        print("   - Gemini service is experiencing issues")
        print("   - Rate limits are too restrictive for your account")

# Run the main test suite
asyncio.run(main())


🧪 TEST 4: CONCURRENCY STRESS TEST
Testing different concurrency levels to find optimal settings...

Applied nest_asyncio patch for notebook compatibility.

--------------------------------------------------------------------------------
🚀 Starting Test: Low Concurrency (5, 30s timeout)
   Concurrency = 5, Timeout = 30s
--------------------------------------------------------------------------------
Loading spaCy model: it_core_news_sm...
✅ spaCy model loaded
Reward function will use device: cpu
Initializing scorers...
  ✅ LLM scoring enabled for cefr_alignment (providers: Gemini, batch size: 10)
  ✅ LLM scoring enabled for grammar_correctness (providers: Gemini, batch size: 10)
  ✅ LLM scoring enabled for coherence (providers: Gemini, batch size: 10)
Loading sentence transformer for topic similarity...
✅ Sentence transformer loaded in cpu
✅ Reward function initialized. Active scorers: ['json', 'quality', 'linguistic', 'cefr', 'fluency', 'grammar', 'coherence', 'topic']
  🔄 Calling gem

KeyboardInterrupt: 