# GRPO Training for Italian Exercise Generator

## Setup

In [1]:
# --- Cell 1: Setup and Imports ---

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Navigate to your project directory
# Make sure this path is correct for your Google Drive setup
import os
project_path = '/content/drive/MyDrive/Colab Notebooks/italian_teacher'
os.chdir(project_path)
print(f"Changed directory to: {os.getcwd()}")

# trl imported from local fork
# !pip install -e /content/drive/MyDrive/Colab\ Notebooks/trl

# Install dependencies (now includes google-generativeai for Gemini API)
!pip install -q trl transformers accelerate peft datasets spacy sentence-transformers bitsandbytes json5 openai google-generativeai tqdm nest_asyncio
!python -m spacy download it_core_news_sm
!pip install anthropic groq

# Standard library imports
import json
import random
from getpass import getpass

# Third-party imports
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# Local module imports
from src.rl.multi_reward_async import create_async_multi_reward
from src.rl.prompt_formatter import format_prompt_with_chat_template
from src.rl.reward_function import ExerciseRewardFunction

# Environment setup: export WANDB_DISABLED before any training object is created.
os.environ.update({"WANDB_DISABLED": "true"})

# Short report of the PyTorch build and the accelerator visible to this runtime.
print("\n--- Environment Setup ---")
print(f"PyTorch version: {torch.__version__}")
_cuda_available = torch.cuda.is_available()
print(f"CUDA available: {_cuda_available}")
if _cuda_available:
    # Only device 0 is reported, as in the rest of this notebook.
    print(f"GPU: {torch.cuda.get_device_name(0)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Changed directory to: /content/drive/MyDrive/Colab Notebooks/italian_teacher
Collecting it-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.8.0/it_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m13.0/13.0 MB[0m [31m135.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

--- Environment Setup ---
PyTorch version:

In [2]:
# --- Cell 2: Configuration ---
# All training parameters are here for easy modification.

BASE_MODEL_PATH = "./models/TeacherPet_italian_grpo"  # Input model for this training run
OUTPUT_DIR = "./models/TeacherPet_italian_grpo_round2"      # Where the new model will be saved
NUM_SAMPLES = 1000                                    # Number of training requests to use
RANDOM_SEED = 42                                     # Seed for reproducibility

# Scorer settings
DISABLED_SCORERS = []          # No scorers disabled
FLUENCY_USE_LLM = False        # Use rule-based checks only (fast, free)
# NOTE(review): the recorded output of the reward-function cell prints
# "LLM scoring enabled for fluency" even though this flag is False — confirm that the
# reward function really honours it.

# Batch geometry, named so the warm-up length below can be derived from it.
PER_DEVICE_TRAIN_BATCH_SIZE = 3     # completions per forward/backward pass
GRADIENT_ACCUMULATION_STEPS = 36    # micro-batches accumulated per optimizer step
NUM_GENERATIONS = 3                 # completions sampled per prompt
SAMPLING_TEMPERATURE = 0.9          # shared by GRPOConfig.temperature and generation_kwargs

# Warm-up sizing.
# One optimizer step consumes PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
# completions, i.e. that many divided by NUM_GENERATIONS prompts (36 with the values
# above, assuming a single training process). With NUM_SAMPLES = 1000 and one epoch that
# is only ~28 optimizer steps, so the previous hard-coded warmup_steps=50 was longer than
# the whole run and the learning rate never reached its configured value.
# Warm-up is now ~10% of the estimated number of optimizer steps (3 with these values).
# The estimate uses NUM_SAMPLES; if fewer requests are available the run is shorter still.
_completions_per_step = PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
_prompts_per_step = max(1, _completions_per_step // NUM_GENERATIONS)
ESTIMATED_OPTIMIZER_STEPS = max(1, -(-NUM_SAMPLES // _prompts_per_step))  # ceiling division
WARMUP_STEPS = max(1, round(0.1 * ESTIMATED_OPTIMIZER_STEPS))

# --- GRPO Configuration ---
grpo_config = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=5e-6,
    warmup_steps=WARMUP_STEPS,
    logging_steps=10,
    save_steps=25,
    save_total_limit=3,
    bf16=True,
    remove_unused_columns=False,   # the dataset carries an extra "request" column next to "prompt"
    report_to="none",
    seed=RANDOM_SEED,              # keep the trainer seed tied to the notebook-wide seed

    # 8-bit paged AdamW to reduce optimizer-state memory
    # (the original note estimated a saving of ~16GB; not measured here).
    optim="paged_adamw_8bit",

    num_generations=NUM_GENERATIONS,
    max_prompt_length=1024,
    max_completion_length=350,
    temperature=SAMPLING_TEMPERATURE,
    generation_batch_size=3,
    beta=0.05,

    generation_kwargs={
        # Token ids below are copied from the original cell; presumably the Llama-3
        # BOS / end-of-turn / end-of-text ids — verify against the tokenizer.
        "bos_token_id": 128000,
        "do_sample": True,
        "eos_token_id": [128009, 128001],
        "temperature": SAMPLING_TEMPERATURE,
        "top_p": 0.9,
        "top_k": 50,
        # NOTE(review): padding side is normally a tokenizer setting rather than a
        # generation parameter; kept as in the original — confirm it is still needed.
        "padding_side": "left",
    }
)

In [3]:
# --- Cell 3: Helper Functions & Main Execution ---

def load_secrets_from_file():
    """
    Load API keys from a .secrets.json file and export them as environment variables.

    Candidate locations are checked in order: the local Google Drive folder under the
    user's home directory, the Colab Drive mount, then the current directory. The first
    file that yields at least one usable key wins; files that are unreadable, are not a
    JSON object, or contain no usable keys are skipped and the next candidate is tried.

    A key is usable when its value is a non-empty string that is not one of the template
    placeholders. Non-string values (lists, numbers, nested objects) cannot be stored in
    ``os.environ`` and are skipped with a notice instead of raising ``TypeError``.

    Returns:
        bool: True if at least one key was exported to ``os.environ``, False otherwise.
    """
    from pathlib import Path

    placeholder_values = {"your-openai-key-here", "your-google-key-here", ""}

    secrets_paths = [
        Path.home() / "Google Drive" / "My Drive" / ".secrets.json",  # Local path
        Path("/content/drive/My Drive/.secrets.json"),                # Colab path
        Path('.secrets.json')                                         # Current directory
    ]

    found_any_file = False

    for path in secrets_paths:
        if not path.exists():
            continue

        found_any_file = True
        print(f"‚úÖ Loading API keys from {path}")
        try:
            with open(path, 'r', encoding='utf-8') as f:
                secrets = json.load(f)
        except (OSError, json.JSONDecodeError) as exc:
            # A broken file must not abort the notebook; fall through to the next candidate.
            print(f"   Could not read {path}: {exc}")
            continue

        if not isinstance(secrets, dict):
            print(f"   Ignoring {path}: expected a JSON object of key/value pairs")
            continue

        loaded_keys = []
        skipped_keys = []
        for key, value in secrets.items():
            if not value:
                # Empty / null entries are silently ignored, as before.
                continue
            if not isinstance(value, str):
                # os.environ only accepts strings; report the key name only, never the value.
                skipped_keys.append(key)
                continue
            if value in placeholder_values:
                continue
            os.environ[key] = value
            loaded_keys.append(key)

        if skipped_keys:
            print(f"   Skipped {len(skipped_keys)} non-string entr(ies): {', '.join(map(str, skipped_keys))}")

        if loaded_keys:
            print(f"   Loaded {len(loaded_keys)} API key(s)")
            return True

    if found_any_file:
        print("   No usable API keys found in any .secrets.json. Make sure API keys are in Colab secrets or environment.")
    else:
        print("‚ö†Ô∏è  No .secrets.json found. Make sure API keys are in Colab secrets or environment.")
    return False


def load_training_data(tokenizer, num_samples: int, seed: int):
    """Load or generate training requests and build the prompt dataset.

    Requests are read from ``src/rl/training_requests.json`` when that file exists;
    otherwise ``generate_training_requests`` is asked to create ``num_samples`` of them
    at that path. If more than ``num_samples`` requests are available, a seeded random
    subset of exactly ``num_samples`` is kept.

    Args:
        tokenizer: Tokenizer handed to ``format_prompt_with_chat_template``.
        num_samples: Maximum number of requests to keep (and number to generate when
            no request file exists).
        seed: Seed passed to ``random.seed`` before sampling.

    Returns:
        A ``datasets.Dataset`` with two aligned columns: ``"prompt"`` (the formatted
        prompt) and ``"request"`` (the request it was built from).
    """
    requests_path = "src/rl/training_requests.json"

    if os.path.exists(requests_path):
        print(f"Loading existing training requests from {requests_path}...")
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(requests_path, "r", encoding="utf-8") as f:
            training_requests = json.load(f)
    else:
        from src.rl.generate_training_requests import generate_training_requests
        print(f"Generating {num_samples} new training requests...")
        training_requests = generate_training_requests(
            num_requests=num_samples,
            output_path=requests_path
        )

    print(f"‚úÖ Loaded {len(training_requests)} training requests.")

    # Sample BEFORE formatting so that prompts are only built for the requests that are
    # actually kept. The selected indices are the same as before, because the index
    # range has the same length as the previously formatted prompt list.
    # NOTE(review): formatting now runs after random.seed(seed); if the formatter draws
    # from the global `random` module its output becomes seed-dependent — confirm.
    if len(training_requests) > num_samples:
        print(f"Sampling {num_samples} requests (seed={seed})...")
        random.seed(seed)
        random_indices = random.sample(range(len(training_requests)), num_samples)
        training_requests = [training_requests[i] for i in random_indices]

    # Format prompts (one per retained request, same order as `training_requests`)
    prompts = [
        format_prompt_with_chat_template(req, tokenizer, add_examples=True)
        for req in training_requests
    ]

    return Dataset.from_dict({
        "prompt": prompts,
        "request": training_requests,
    })


print("=" * 80)
print("üöÄ STARTING GRPO TRAINING")
print("=" * 80)

# Load API keys
# Exports keys from .secrets.json into os.environ (presumably read by the LLM-based
# scorers created further down). The boolean result is intentionally not checked here.
print("\n--- Loading Secrets ---")
load_secrets_from_file()

# Load Model and Tokenizer
print(f"\n--- Loading Model ---")
print(f"Base model: {BASE_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, padding_side='left')
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse EOS so batched generation can pad.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    # `torch_dtype` is deprecated in the installed transformers version (this cell's
    # recorded output warns "`torch_dtype` is deprecated! Use `dtype` instead!").
    dtype=torch.bfloat16,
    device_map="auto",
    # NOTE(review): trust_remote_code executes Python shipped with the checkpoint;
    # confirm it is really required for this local model.
    trust_remote_code=True,
    use_cache=False,
)
model.gradient_checkpointing_enable()
# Mirror the tokenizer's padding settings on the model config.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.padding_side = tokenizer.padding_side
print("‚úÖ Model and tokenizer loaded.")

# Prepare Training Data
print("\n--- Preparing Training Data ---")
train_dataset = load_training_data(tokenizer, num_samples=NUM_SAMPLES, seed=RANDOM_SEED)

# Initialize Reward Function
print("\n--- Initializing Reward Function ---")
reward_fn_instance = ExerciseRewardFunction(
    device="cuda",
    disabled_scorers=DISABLED_SCORERS,
    fluency_use_llm=FLUENCY_USE_LLM,
    concurrency_limit=3  # Max concurrent scoring requests (presumably LLM API calls — verify)
)
# NOTE(review): the recorded output shows LLM scoring enabled for fluency although
# FLUENCY_USE_LLM is False — check whether use_openai=True overrides that flag.
reward_func = create_async_multi_reward(reward_fn_instance, use_openai=True)
print("‚úÖ Reward function ready.")

# Diagnostics: confirm which tokenizer class is in use and that left padding stuck.
print(f'Tokenizer type: {type(tokenizer)}')
print(f'Padding side: {tokenizer.padding_side}')
if hasattr(tokenizer, 'tokenizer'):
    # Wrapper-style processors expose the underlying tokenizer as `.tokenizer`.
    print(f'Has sub-tokenizer: {tokenizer.tokenizer.padding_side}')


üöÄ STARTING GRPO TRAINING

--- Loading Secrets ---
‚úÖ Loading API keys from /content/drive/My Drive/.secrets.json
   Loaded 9 API key(s)

--- Loading Model ---
Base model: ./models/TeacherPet_italian_grpo


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

‚úÖ Model and tokenizer loaded.

--- Preparing Training Data ---
Loading existing training requests from src/rl/training_requests.json...
‚úÖ Loaded 2000 training requests.
Sampling 1000 requests (seed=42)...

--- Initializing Reward Function ---
Loading spaCy model: it_core_news_sm...
‚úÖ spaCy model loaded
Reward function will use device: cuda
     ‚úÖ Gemini: 4 API key(s)
     ‚úÖ OpenAI: configured
     ‚úÖ Anthropic: configured
     ‚úÖ Groq: configured
     ‚úÖ DeepSeek: configured
     ‚úÖ Cerebras: configured
  ‚úÖ LLM API Handler initialized
     Providers: gemini, openai, anthropic, groq, deepseek, cerebras
     Total models: 12
Initializing scorers...
  ‚úÖ LLM scoring enabled for cefr_alignment (batch size: 10)
  ‚úÖ LLM scoring enabled for fluency (batch size: 10)
  ‚úÖ LLM scoring enabled for grammar_correctness (batch size: 10)
  ‚úÖ LLM scoring enabled for coherence (batch size: 10)
Loading sentence transformer for topic similarity...
‚úÖ Sentence transformer loaded in 

In [4]:
# --- Add Validation Tracking ---
from src.rl.validation_callback import ValidationCallback, select_validation_samples
from src.rl.prompt_formatter import format_prompt_with_chat_template

# Pick a small fixed set of requests to re-evaluate during training.
# NOTE(review): the samples are drawn from the same file the training set is sampled
# from, so they may overlap with the training data — confirm this is intended.
validation_samples = select_validation_samples(
    seed=RANDOM_SEED,
    num_samples=10,  # Adjust this number (5-15 recommended)
    training_requests_path="src/rl/training_requests.json",
)

# Build one prompt per validation request, formatted exactly like the training prompts.
validation_prompts = list(
    map(
        lambda request: format_prompt_with_chat_template(request, tokenizer, add_examples=True),
        validation_samples,
    )
)

# Collect the callback arguments first, then construct the callback in one call.
_validation_callback_kwargs = dict(
    validation_samples=validation_samples,
    validation_prompts=validation_prompts,
    reward_function=reward_func,
    tokenizer=tokenizer,
    output_dir=OUTPUT_DIR,
    num_generations=3,  # Generate 3 completions per sample for comparison
)
validation_callback = ValidationCallback(**_validation_callback_kwargs)

print("‚úÖ Validation callback ready.")


üìä Selecting 10 validation samples from src/rl/training_requests.json...
   Total requests available: 2000
   Exercise types: unknown
   ‚úÖ Selected 10 diverse samples
   Distribution: {'unknown': 10}

üìä Validation Callback initialized:
   10 validation samples
   3 generations per sample
   Results will be saved to: models/TeacherPet_italian_grpo_round2/validation_results
‚úÖ Validation callback ready.


In [5]:
# Initialize Trainer
print("\n--- Initializing GRPO Trainer ---")
trainer = GRPOTrainer(
    model=model,
    args=grpo_config,
    reward_funcs=reward_func,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    # Register the validation callback built in the validation-tracking cell; it was
    # previously constructed but never handed to the trainer, so it could not run.
    # Assumes ValidationCallback is a transformers TrainerCallback — TODO confirm.
    callbacks=[validation_callback],
)


# Report the generation settings the trainer will actually use.
print(f"\nüîç GRPO Generation Settings:")
print(f"   num_generations (config): {grpo_config.num_generations}")
print(f"   generation_batch_size: {grpo_config.generation_batch_size}")
if hasattr(trainer, 'generation_config'):
    # Not every trl version exposes this attribute, hence the guard.
    print(f"   trainer.generation_config: {trainer.generation_config}")
print("‚úÖ GRPO Trainer initialized.")


# Start Training
print("\n" + "=" * 80)
print("üî• TRAINING BEGINS")
print("=" * 80)
trainer.train()
print("\n" + "=" * 80)
print("üéâ TRAINING COMPLETE")
print("=" * 80)

# Save Final Model
# The tokenizer is saved next to the weights so OUTPUT_DIR can be loaded on its own.
print(f"\n--- Saving Model ---")
print(f"Output directory: {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("‚úÖ Model saved successfully.")

The model is already on multiple devices. Skipping the move to device specified in `args`.



--- Initializing GRPO Trainer ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


üîç GRPO Generation Settings:
   num_generations (config): 3
   generation_batch_size: 3
   trainer.generation_config: GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128009,
    128001
  ],
  "max_new_tokens": 350,
  "pad_token_id": 128009,
  "padding_side": "left",
  "temperature": 0.9,
  "top_p": 0.9
}

‚úÖ GRPO Trainer initialized.

üî• TRAINING BEGINS


`generation_config` default values have been modified to match model-specific defaults: {'max_length': 8192}. If this is not desired, please set these values explicitly.


   [Call #1 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 2/3 valid JSON (66.7%), 0 empty, 1 failed ‚Üí 2 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (6 total requests):
      üîµ Gemini: 2/6 (33.3%)
      üü¢ Openai: 2/6 (33.3%)
      ‚ö° Groq: 1/6 (16.7%)
      ‚ùì Deepseek: 1/6 (16.7%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (24.0s):
   Grammar   : min=40.0, max=100.0, avg=70.0
   Coherence : min=76.7, max=88.0, avg=82.3
   Topic     : min=93.3, max=94.0, avg=93.7
   Quality   : min=83.3, max=90.0, avg=86.7
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=0.000, max=72.222, avg=47.213




   [Call #2 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (21.7s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=55.0, max=65.0, avg=61.7
   Topic     : min=0.0, max=10.0, avg=6.7
   Quality   : min=75.0, max=100.0, avg=91.7
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=49.167, max=54.583, avg=52.569




   [Call #3 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 2/9 (22.2%)
      ‚ö° Groq: 1/9 (11.1%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (28.7s):
   Grammar   : min=87.5, max=100.0, avg=95.8
   Coherence : min=77.5, max=85.0, avg=80.8
   Topic     : min=97.5, max=100.0, avg=99.2
   Quality   : min=87.5, max=100.0, avg=95.8
   Diversity : min=86.7, max=100.0, avg=91.1
   TOTAL     : min=73.229, max=76.597, avg=74.468




   [Call #4 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (31.3s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=35.0, max=55.0, avg=44.2
   Topic     : min=20.0, max=55.0, avg=31.7
   Quality   : min=50.0, max=50.0, avg=50.0
   Diversity : min=85.0, max=100.0, avg=95.0
   TOTAL     : min=45.938, max=53.021, avg=50.139




   [Call #5 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (26.2s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=43.3, max=53.3, avg=48.9
   Topic     : min=30.0, max=40.0, avg=36.7
   Quality   : min=50.0, max=50.0, avg=50.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=51.806, max=52.361, avg=52.176




   [Call #6 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      ‚ö° Groq: 4/9 (44.4%)
      üü¢ Openai: 3/9 (33.3%)
      üîµ Gemini: 1/9 (11.1%)
      ‚ùì Deepseek: 1/9 (11.1%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (38.1s):
   Grammar   : min=0.0, max=60.0, avg=31.3
   Coherence : min=62.0, max=82.0, avg=72.0
   Topic     : min=28.0, max=82.0, avg=56.0
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=61.583, max=70.333, avg=66.778




   [Call #7 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      ‚ö° Groq: 4/9 (44.4%)
      üü¢ Openai: 3/9 (33.3%)
      üîµ Gemini: 1/9 (11.1%)
      ‚ùì Deepseek: 1/9 (11.1%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (37.3s):
   Grammar   : min=0.0, max=100.0, avg=58.3
   Coherence : min=50.0, max=95.0, avg=76.7
   Topic     : min=97.5, max=100.0, avg=98.3
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=69.271, max=77.708, avg=73.299




   [Call #8 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      ‚ö° Groq: 6/9 (66.7%)
      üü¢ Openai: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (39.0s):
   Grammar   : min=20.0, max=65.0, avg=45.0
   Coherence : min=75.0, max=85.0, avg=80.0
   Topic     : min=80.0, max=100.0, avg=92.5
   Quality   : min=87.5, max=100.0, avg=91.7
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=67.708, max=78.438, avg=73.056




   [Call #9 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (28.4s):
   Grammar   : min=16.7, max=100.0, avg=57.8
   Coherence : min=73.3, max=80.0, avg=77.8
   Topic     : min=0.0, max=6.7, avg=4.4
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=86.7, max=86.7, avg=86.7
   TOTAL     : min=59.444, max=66.389, avg=62.685




   [Call #10 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (28.0s):
   Grammar   : min=66.7, max=80.0, avg=71.1
   Coherence : min=60.0, max=80.0, avg=71.1
   Topic     : min=70.0, max=100.0, avg=90.0
   Quality   : min=83.3, max=100.0, avg=94.4
   Diversity : min=86.7, max=100.0, avg=91.1
   TOTAL     : min=68.333, max=71.389, avg=70.000




   [Call #11 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ö° Groq: 2/9 (22.2%)
      ‚ùì Deepseek: 1/9 (11.1%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (31.7s):
   Grammar   : min=0.0, max=28.0, avg=9.3
   Coherence : min=34.0, max=44.0, avg=39.3
   Topic     : min=84.0, max=92.0, avg=86.7
   Quality   : min=50.0, max=50.0, avg=50.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=51.833, max=53.083, avg=52.528




   [Call #12 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (26.7s):
   Grammar   : min=35.0, max=50.0, avg=41.7
   Coherence : min=50.0, max=60.0, avg=56.7
   Topic     : min=0.0, max=50.0, avg=20.0
   Quality   : min=75.0, max=75.0, avg=75.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=55.625, max=58.542, avg=57.292




   [Call #13 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (22.3s):
   Grammar   : min=50.0, max=100.0, avg=83.3
   Coherence : min=50.0, max=65.0, avg=55.0
   Topic     : min=20.0, max=20.0, avg=20.0
   Quality   : min=75.0, max=75.0, avg=75.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=56.042, max=62.917, avg=60.625




   [Call #14 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (26.9s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=60.0, max=65.0, avg=61.7
   Topic     : min=70.0, max=100.0, avg=86.7
   Quality   : min=75.0, max=75.0, avg=75.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=56.042, max=62.500, avg=59.792




   [Call #15 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (22.8s):
   Grammar   : min=65.0, max=100.0, avg=88.3
   Coherence : min=30.0, max=75.0, avg=46.7
   Topic     : min=35.0, max=100.0, avg=78.3
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=80.0, max=100.0, avg=93.3
   TOTAL     : min=58.542, max=72.083, avg=67.569




   [Call #16 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (25.1s):
   Grammar   : min=0.0, max=50.0, avg=31.7
   Coherence : min=35.0, max=65.0, avg=50.0
   Topic     : min=35.0, max=80.0, avg=50.0
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=52.917, max=65.000, avg=59.722




   [Call #17 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (17.6s):
   Grammar   : min=100.0, max=100.0, avg=100.0
   Coherence : min=40.0, max=90.0, avg=70.0
   Topic     : min=20.0, max=100.0, avg=70.0
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=70.0, max=70.0, avg=70.0
   TOTAL     : min=75.833, max=82.500, avg=78.889




   [Call #18 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 2/3 valid JSON (66.7%), 0 empty, 1 failed ‚Üí 2 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (6 total requests):
      üîµ Gemini: 2/6 (33.3%)
      üü¢ Openai: 2/6 (33.3%)
      ‚ùì Deepseek: 2/6 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (22.0s):
   Grammar   : min=0.0, max=25.0, avg=12.5
   Coherence : min=62.5, max=77.5, avg=70.0
   Topic     : min=37.5, max=55.0, avg=46.2
   Quality   : min=75.0, max=77.5, avg=76.2
   Diversity : min=86.7, max=100.0, avg=93.3
   TOTAL     : min=0.000, max=58.438, avg=38.900




   [Call #19 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (6 total requests):
      üü¢ Openai: 3/6 (50.0%)
      ‚ùì Deepseek: 3/6 (50.0%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (16.0s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=35.0, max=65.0, avg=51.7
   Topic     : min=85.0, max=95.0, avg=91.7
   Quality   : min=50.0, max=50.0, avg=50.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=51.875, max=58.333, avg=55.417




   [Call #20 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (6 total requests):
      üü¢ Openai: 3/6 (50.0%)
      ‚ùì Deepseek: 3/6 (50.0%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (20.4s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=36.7, max=43.3, avg=40.0
   Topic     : min=30.0, max=93.3, avg=65.6
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=59.167, max=64.861, avg=61.250




   [Call #21 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (20.0s):
   Grammar   : min=5.0, max=100.0, avg=56.7
   Coherence : min=60.0, max=65.0, avg=61.7
   Topic     : min=85.0, max=90.0, avg=88.3
   Quality   : min=75.0, max=75.0, avg=75.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=62.292, max=66.250, avg=64.583




   [Call #22 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (6 total requests):
      üü¢ Openai: 3/6 (50.0%)
      ‚ùì Deepseek: 3/6 (50.0%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (21.9s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=70.0, max=86.7, avg=80.0
   Topic     : min=56.7, max=90.0, avg=71.1
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=62.778, max=67.500, avg=65.370




   [Call #23 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (21.6s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=35.0, max=100.0, avg=63.3
   Topic     : min=70.0, max=100.0, avg=85.0
   Quality   : min=50.0, max=83.3, avg=69.4
   Diversity : min=80.0, max=100.0, avg=93.3
   TOTAL     : min=56.875, max=62.639, avg=59.005




   [Call #24 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (26.4s):
   Grammar   : min=50.0, max=100.0, avg=70.0
   Coherence : min=60.0, max=85.0, avg=68.3
   Topic     : min=95.0, max=100.0, avg=98.3
   Quality   : min=50.0, max=75.0, avg=66.7
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=62.500, max=71.250, avg=65.486




   [Call #25 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (23.2s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=35.0, max=70.0, avg=53.3
   Topic     : min=50.0, max=100.0, avg=78.3
   Quality   : min=75.0, max=75.0, avg=75.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=53.750, max=60.208, avg=57.569




   [Call #26 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (26.1s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=53.3, max=70.0, avg=60.0
   Topic     : min=46.7, max=70.0, avg=62.2
   Quality   : min=83.3, max=83.3, avg=83.3
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=58.333, max=60.556, avg=59.722




   [Call #27 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ö° Groq: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (34.4s):
   Grammar   : min=62.5, max=75.0, avg=66.7
   Coherence : min=60.0, max=85.0, avg=70.0
   Topic     : min=52.5, max=82.5, avg=68.3
   Quality   : min=75.0, max=100.0, avg=83.3
   Diversity : min=86.7, max=100.0, avg=95.6
   TOTAL     : min=67.812, max=78.472, avg=72.025




   [Call #28 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (28.9s):
   Grammar   : min=0.0, max=32.5, avg=10.8
   Coherence : min=70.0, max=85.0, avg=76.1
   Topic     : min=92.5, max=97.5, avg=94.4
   Quality   : min=77.5, max=83.3, avg=79.4
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=63.958, max=68.333, avg=65.856




   [Call #29 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (24.8s):
   Grammar   : min=50.0, max=100.0, avg=71.7
   Coherence : min=60.0, max=70.0, avg=65.0
   Topic     : min=60.0, max=100.0, avg=80.0
   Quality   : min=55.0, max=75.0, avg=68.3
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=63.750, max=68.333, avg=66.111




   [Call #30 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (21.3s):
   Grammar   : min=0.0, max=50.0, avg=16.7
   Coherence : min=65.0, max=70.0, avg=66.7
   Topic     : min=10.0, max=95.0, avg=55.0
   Quality   : min=80.0, max=100.0, avg=93.3
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=59.792, max=67.083, avg=62.986




   [Call #31 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 2/3 valid JSON (66.7%), 0 empty, 1 failed ‚Üí 2 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (6 total requests):
      üîµ Gemini: 2/6 (33.3%)
      üü¢ Openai: 2/6 (33.3%)
      ‚ö° Groq: 1/6 (16.7%)
      ‚ùì Deepseek: 1/6 (16.7%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (26.3s):
   Grammar   : min=60.0, max=100.0, avg=80.0
   Coherence : min=53.3, max=80.0, avg=66.7
   Topic     : min=63.3, max=75.0, avg=69.2
   Quality   : min=83.3, max=87.5, avg=85.4
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=0.000, max=78.646, avg=46.863




   [Call #32 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ö° Groq: 2/9 (22.2%)
      ‚ùì Deepseek: 1/9 (11.1%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (29.8s):
   Grammar   : min=0.0, max=100.0, avg=33.3
   Coherence : min=47.5, max=75.0, avg=62.5
   Topic     : min=5.0, max=30.0, avg=15.0
   Quality   : min=10.0, max=100.0, avg=65.8
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=40.208, max=66.354, avg=55.312




   [Call #33 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (23.6s):
   Grammar   : min=0.0, max=66.7, avg=22.2
   Coherence : min=36.7, max=50.0, avg=43.3
   Topic     : min=70.0, max=90.0, avg=77.8
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=58.889, max=68.889, avg=64.120




   [Call #34 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...




KeyboardInterrupt: 

In [None]:
# --- Auto-shutdown cell ---
# Purpose: after a grace period, release the Colab runtime so an unattended
# session does not keep running. Steps:
#   1. wait 180 seconds,
#   2. ask Colab to release the runtime,
#   3. as a last resort, SIGKILL this kernel process.
import time, os, signal
from google.colab import runtime

print("‚è≥ Waiting 3 minutes (180 seconds) before disconnecting...")
time.sleep(180)

print("üîå Attempting clean disconnect...")
try:
    # The public google.colab.runtime call for releasing the VM is
    # `unassign()`. The original cell called `runtime.disconnect()`, which
    # raises AttributeError when that attribute is absent; the error was then
    # swallowed below and the runtime was never released. Prefer `unassign`,
    # and fall back to `disconnect` only if this Colab build provides it.
    # If neither exists, the AttributeError is reported by the handler below.
    release_runtime = getattr(runtime, "unassign", None)
    if release_runtime is None:
        release_runtime = getattr(runtime, "disconnect")
    release_runtime()
    print("‚úÖ Clean disconnect attempted. Waiting 5 seconds to verify...")
    # Short pause to give the release request time to take effect before the
    # hard kill below.
    time.sleep(5)
except Exception as e:
    # Best-effort: report the failure and continue to the forced shutdown.
    print(f"‚ö†Ô∏è Clean disconnect failed: {e}")

# Final guarantee: forcefully kill the process
# NOTE(review): SIGKILL only terminates this kernel process; whether that also
# frees the backing VM depends on Colab -- confirm.
print("üíÄ Forcing runtime shutdown now...")
os.kill(os.getpid(), signal.SIGKILL)
