# GRPO Training for Italian Exercise Generator

## Setup

In [None]:
# --- Cell 1: Setup and Imports ---

# Mount Google Drive so the project checkout and model directories are reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Make the project directory the working directory. Every relative path used
# later in this notebook (./models/..., src/rl/...) is resolved against it.
# Make sure this path is correct for your Google Drive setup.
import os
project_path = '/content/drive/MyDrive/Colab Notebooks/italian_teacher'
os.chdir(project_path)
print(f"Changed directory to: {os.getcwd()}")

# NOTE(review): the original note said "trl imported from local fork", but the
# editable install of the fork below is commented out and the next pip line
# installs trl from the package index -- confirm which trl build is intended.
# !pip install -e /content/drive/MyDrive/Colab\ Notebooks/trl

# Install dependencies (now includes google-generativeai for Gemini API).
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones that produced the saved outputs in this notebook.
!pip install -q trl transformers accelerate peft datasets spacy sentence-transformers bitsandbytes json5 openai google-generativeai tqdm nest_asyncio
!python -m spacy download it_core_news_sm
!pip install anthropic groq
!pip install flash-attn --no-build-isolation


# Standard library imports
import json
import random
from getpass import getpass

# Third-party imports
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# Local module imports
from src.rl.multi_reward_async import create_async_multi_reward
from src.rl.prompt_formatter import format_prompt_with_chat_template
from src.rl.reward_function.subjects.italian import ItalianRewardFunction

# Environment setup: set the WANDB_DISABLED switch before any trainer is built.
os.environ.update({"WANDB_DISABLED": "true"})

# Collect the runtime report first, then emit it in one go; the GPU line is
# only included when a CUDA device is actually visible.
_env_report = [
    "\n--- Environment Setup ---",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
]
if torch.cuda.is_available():
    _env_report.append(f"GPU: {torch.cuda.get_device_name(0)}")
print("\n".join(_env_report))

In [None]:
# --- Cell 2: Configuration ---
# All training parameters are here for easy modification.

# Relative paths below are resolved against the project directory that Cell 1
# chdir'd into.
BASE_MODEL_PATH = "./models/TeacherPet_italian_grpo"  # Input model for this training run
OUTPUT_DIR = "./models/TeacherPet_italian_grpo_round2"      # Where the new model will be saved
NUM_SAMPLES = 1000                                    # Cap on training requests; larger request files are subsampled to this size
RANDOM_SEED = 42                                     # Seed for request sampling and validation-sample selection

# Scorer settings
DISABLED_SCORERS = ["fluency"]         # Scorers to skip: the fluency scorer IS disabled for this run
FLUENCY_USE_LLM = False        # Rule-based fluency checks only; presumably moot while "fluency" is disabled -- verify

# --- GRPO Configuration ---
# NOTE(review): the saved output of the trainer cell reports num_generations=3,
# generation_batch_size=3 and max_new_tokens=320, which do not match the values
# below -- that output was produced with a different configuration.
grpo_config = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,     # 4 per device per forward/backward pass
    gradient_accumulation_steps=8,     # 4 x 8 = 32 per device per optimizer step
    learning_rate=5e-6,
    warmup_steps=50,
    logging_steps=10,
    save_steps=50,                     # checkpoint every 50 steps...
    save_total_limit=3,                # ...keeping at most 3 checkpoints on disk
    bf16=True,
    remove_unused_columns=False,       # the dataset carries a "request" column besides "prompt"; presumably the reward function reads it -- verify
    report_to="none",

    optim="adamw_torch",
    num_generations=2,                 # completions sampled per prompt; kept low to save VRAM
    generation_batch_size=2,           # kept equal to num_generations here
    max_prompt_length=768,             # cap on prompt length in tokens
    max_completion_length=256,         # cap on generated tokens per completion
    temperature=0.9,
    beta=0.05,

    # Extra generation settings. Temperature is given here as well as above;
    # both are 0.9 and should be changed together.
    generation_kwargs={
        "bos_token_id": 128000,            # presumably the base model's BOS id -- verify against the tokenizer
        "do_sample": True,
        "eos_token_id": [128009, 128001],  # generation stops on either id; presumably end-of-turn / end-of-text -- verify
        "temperature": 0.9,
        "top_p": 0.9,
        "top_k": 50,
        "padding_side": "left",            # NOTE(review): not a sampling parameter; confirm that generate() actually uses it
    }
)


In [None]:
# --- Cell 3: Helper Functions & Main Execution ---

def load_secrets_from_file():
    """
    Load API keys from the first usable .secrets.json file into ``os.environ``.

    Candidate locations are tried in order: the local Google Drive folder under
    the home directory, the two Colab mount spellings of the Drive root
    ("My Drive" and "MyDrive"), and finally the current working directory.

    A candidate is skipped (and the next one tried) when it cannot be read,
    does not contain valid JSON, is not a JSON object, or yields no usable
    keys. A value is usable when it is a non-empty string that is not one of
    the known placeholder values; anything else is ignored rather than being
    passed to ``os.environ``, which only accepts strings.

    Returns:
        bool: True as soon as at least one key was exported from a file,
        False if no candidate produced any key.
    """
    from pathlib import Path

    placeholder_values = {"your-openai-key-here", "your-google-key-here", ""}

    secrets_paths = [
        Path.home() / "Google Drive" / "My Drive" / ".secrets.json",  # Local path
        Path("/content/drive/My Drive/.secrets.json"),                # Colab path
        Path("/content/drive/MyDrive/.secrets.json"),                 # Colab path (mount spelling used in Cell 1)
        Path('.secrets.json')                                         # Current directory
    ]

    for path in secrets_paths:
        # is_file() also rejects a directory that happens to carry this name.
        if not path.is_file():
            continue

        print(f"‚úÖ Loading API keys from {path}")
        try:
            with open(path, 'r', encoding='utf-8') as f:
                secrets = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"   Could not read {path}: {exc}")
            continue

        if not isinstance(secrets, dict):
            print(f"   Ignoring {path}: expected a JSON object of key/value pairs")
            continue

        loaded_keys = []
        for key, value in secrets.items():
            # os.environ raises TypeError for non-string values, so skip them.
            if not isinstance(value, str) or value in placeholder_values:
                continue
            os.environ[key] = value
            loaded_keys.append(key)

        if loaded_keys:
            print(f"   Loaded {len(loaded_keys)} API key(s)")
            return True

    print("‚ö†Ô∏è  No .secrets.json found. Make sure API keys are in Colab secrets or environment.")
    return False


def load_training_data(tokenizer, num_samples: int, seed: int):
    """Load or generate training requests and prepare the training dataset.

    Requests are read from ``src/rl/training_requests.json`` when that file
    exists; otherwise ``generate_training_requests`` is asked for
    ``num_samples`` requests and given the same path as its output location.
    When more than ``num_samples`` requests are available, a seeded random
    subset of exactly ``num_samples`` is kept. Subsampling happens before
    prompt formatting so no prompt is built for a request that is discarded.

    Args:
        tokenizer: Tokenizer handed to ``format_prompt_with_chat_template`` and
            also called directly to measure prompt token lengths.
        num_samples: Maximum number of requests to keep.
        seed: Seed for the subsampling step. Note that this reseeds the global
            ``random`` module.

    Returns:
        A ``Dataset`` with two aligned columns: ``"prompt"`` (formatted prompt)
        and ``"request"`` (the request it was built from).
    """
    requests_path = "src/rl/training_requests.json"

    if os.path.exists(requests_path):
        print(f"Loading existing training requests from {requests_path}...")
        # Explicit encoding: the requests contain Italian (non-ASCII) text.
        with open(requests_path, "r", encoding="utf-8") as f:
            training_requests = json.load(f)
    else:
        from src.rl.generate_training_requests import generate_training_requests
        print(f"Generating {num_samples} new training requests...")
        training_requests = generate_training_requests(
            num_requests=num_samples,
            output_path=requests_path
        )

    print(f"‚úÖ Loaded {len(training_requests)} training requests.")

    # Sample if needed (same seeded index draw as before, just done up front)
    if len(training_requests) > num_samples:
        print(f"Sampling {num_samples} requests (seed={seed})...")
        random.seed(seed)
        random_indices = random.sample(range(len(training_requests)), num_samples)
        training_requests = [training_requests[i] for i in random_indices]

    # Format prompts only for the requests that are actually kept
    prompts = [
        format_prompt_with_chat_template(req, tokenizer, add_examples=True)
        for req in training_requests
    ]

    dataset = Dataset.from_dict({
        "prompt": prompts,
        "request": training_requests,
    })

    # --- Print token length stats ---
    print("\nüìä Analyzing prompt token lengths...")
    token_lengths = sorted(len(tokenizer(p)["input_ids"]) for p in prompts)
    print(f"   ‚Ä¢ Total samples: {len(token_lengths)}")
    if token_lengths:
        print(f"   ‚Ä¢ Avg length: {sum(token_lengths)/len(token_lengths):.1f}")
        print(f"   ‚Ä¢ Max length: {token_lengths[-1]}")
        print(f"   ‚Ä¢ 95th percentile: {token_lengths[int(0.95*len(token_lengths))]}")
    else:
        # Averages and percentiles are undefined for an empty prompt list.
        print("   No prompts available; skipping length statistics.")

    return dataset


print("=" * 80)
print("üöÄ STARTING GRPO TRAINING")
print("=" * 80)

# Load API keys
print("\n--- Loading Secrets ---")
load_secrets_from_file()

# The allocator setting has to be in the environment BEFORE the model weights
# are placed on the GPU; setting it after the load (as this cell used to do)
# comes too late for the allocator to pick it up.
# NOTE(review): if it still appears to have no effect, move this assignment to
# Cell 1, above `import torch`.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("‚úÖ Enabled expandable_segments for better VRAM handling.")

# Load Model and Tokenizer
print("\n--- Loading Model ---")
print(f"Base model: {BASE_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, padding_side='left')
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse EOS so batched generation can pad.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,   # executes code shipped with the checkpoint; only use with trusted model directories
    attn_implementation="flash_attention_2",
    use_cache=False,          # KV cache is turned off alongside gradient checkpointing below
)
model.gradient_checkpointing_enable()
torch.cuda.empty_cache()

# Keep the model config in sync with the tokenizer's padding setup.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.padding_side = tokenizer.padding_side
print("‚úÖ Model and tokenizer loaded.")

# Prepare Training Data
print("\n--- Preparing Training Data ---")
train_dataset = load_training_data(tokenizer, num_samples=NUM_SAMPLES, seed=RANDOM_SEED)

# Initialize Reward Function
print("\n--- Initializing Reward Function ---")
reward_fn_instance = ItalianRewardFunction(
    device="cuda",
    disabled_scorers=DISABLED_SCORERS,
    fluency_use_llm=FLUENCY_USE_LLM,
    concurrency_limit=3  # Concurrency cap passed to the reward function
)
reward_func = create_async_multi_reward(reward_fn_instance, use_openai=True)
print("‚úÖ Reward function ready.")

# Diagnostics: confirm which padding side the tokenizer ended up with.
print(f'Tokenizer type: {type(tokenizer)}')
print(f'Padding side: {tokenizer.padding_side}')
if hasattr(tokenizer, 'tokenizer'):
    print(f'Has sub-tokenizer: {tokenizer.tokenizer.padding_side}')

In [None]:
# --- Add Validation Tracking ---
from src.rl.validation_callback import ValidationCallback, select_validation_samples
from src.rl.prompt_formatter import format_prompt_with_chat_template

# Draw a small, seeded set of requests to track during training.
# num_samples can be adjusted (5-15 recommended).
# NOTE(review): the samples come from the same file the training set is read
# from, so they may overlap with the training requests -- confirm this is intended.
validation_samples = select_validation_samples(
    training_requests_path="src/rl/training_requests.json",
    num_samples=10,
    seed=RANDOM_SEED,
)

# Render every validation request into a prompt, keeping the sample order.
validation_prompts = []
for validation_request in validation_samples:
    validation_prompts.append(
        format_prompt_with_chat_template(validation_request, tokenizer, add_examples=True)
    )

# Assemble the callback arguments; 3 completions are generated per sample
# for comparison.
_validation_callback_kwargs = {
    "validation_samples": validation_samples,
    "validation_prompts": validation_prompts,
    "reward_function": reward_func,
    "tokenizer": tokenizer,
    "output_dir": OUTPUT_DIR,
    "num_generations": 3,
}
validation_callback = ValidationCallback(**_validation_callback_kwargs)

print("‚úÖ Validation callback ready.")


üìä Selecting 10 validation samples from src/rl/training_requests.json...
   Total requests available: 2000
   Exercise types: unknown
   ‚úÖ Selected 10 diverse samples
   Distribution: {'unknown': 10}

üìä Validation Callback initialized:
   10 validation samples
   3 generations per sample
   Results will be saved to: models/TeacherPet_italian_grpo_round2/validation_results
‚úÖ Validation callback ready.


In [None]:
# Initialize Trainer
print("\n--- Initializing GRPO Trainer ---")
trainer = GRPOTrainer(
    model=model,
    args=grpo_config,
    reward_funcs=reward_func,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    # The validation cell builds this callback; without registering it here the
    # trainer never invokes it and no validation results are produced.
    callbacks=[validation_callback],
)

# Echo the generation settings actually in effect, for the run log.
print("\nüîç GRPO Generation Settings:")
print(f"   num_generations (config): {grpo_config.num_generations}")
print(f"   generation_batch_size: {grpo_config.generation_batch_size}")
if hasattr(trainer, 'generation_config'):
    print(f"   trainer.generation_config: {trainer.generation_config}")
print("‚úÖ GRPO Trainer initialized.")


# Start Training
print("\n" + "=" * 80)
print("üî• TRAINING BEGINS")
print("=" * 80)
trainer.train()
print("\n" + "=" * 80)
print("üéâ TRAINING COMPLETE")
print("=" * 80)

# Save Final Model (weights via the trainer, tokenizer files alongside them)
print("\n--- Saving Model ---")
print(f"Output directory: {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("‚úÖ Model saved successfully.")

The model is already on multiple devices. Skipping the move to device specified in `args`.



--- Initializing GRPO Trainer ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


üîç GRPO Generation Settings:
   num_generations (config): 3
   generation_batch_size: 3
   trainer.generation_config: GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128009,
    128001
  ],
  "max_new_tokens": 320,
  "pad_token_id": 128009,
  "padding_side": "left",
  "temperature": 0.9,
  "top_p": 0.9
}

‚úÖ GRPO Trainer initialized.

üî• TRAINING BEGINS


`generation_config` default values have been modified to match model-specific defaults: {'max_length': 8192}. If this is not desired, please set these values explicitly.


   [Call #1 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (31.8s):
   Grammar   : min=56.7, max=100.0, avg=78.9
   Coherence : min=60.0, max=78.0, avg=71.6
   Topic     : min=92.0, max=93.3, avg=92.9
   Quality   : min=83.3, max=90.0, avg=85.6
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=74.091, max=79.182, avg=77.354




   [Call #2 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (19.0s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=60.0, max=80.0, avg=68.3
   Topic     : min=10.0, max=30.0, avg=20.0
   Quality   : min=100.0, max=100.0, avg=100.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=55.909, max=63.182, avg=60.076




   [Call #3 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (30.1s):
   Grammar   : min=50.0, max=77.5, avg=67.5
   Coherence : min=75.0, max=85.0, avg=79.2
   Topic     : min=90.0, max=92.5, avg=91.7
   Quality   : min=77.5, max=100.0, avg=92.5
   Diversity : min=86.7, max=100.0, avg=91.1
   TOTAL     : min=75.152, max=80.492, avg=77.563




   [Call #4 for step 0: scoring 3 completions]

‚è≥ Step 1/3: Parsing 3 JSON completions...


                                                   

   Parse stats: 3/3 valid JSON (100.0%), 0 empty, 0 failed ‚Üí 3 scorable
‚è≥ Step 2/3: Scoring 3 completions with batched reward function...





   üìä Model Usage Distribution (9 total requests):
      üîµ Gemini: 3/9 (33.3%)
      üü¢ Openai: 3/9 (33.3%)
      ‚ùì Deepseek: 3/9 (33.3%)
‚è≥ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


üéØ Reward calculation complete (28.0s):
   Grammar   : min=0.0, max=0.0, avg=0.0
   Coherence : min=47.5, max=65.0, avg=54.2
   Topic     : min=15.0, max=50.0, avg=35.8
   Quality   : min=50.0, max=50.0, avg=50.0
   Diversity : min=70.0, max=100.0, avg=90.0
   TOTAL     : min=57.273, max=58.864, avg=57.955




KeyboardInterrupt: 

In [None]:
import time, os, signal
from google.colab import runtime

# Grace period before the runtime is released (presumably to let pending
# Drive writes finish -- verify). Keep in sync with the message below.
DISCONNECT_DELAY_SECONDS = 180

print("‚è≥ Waiting 3 minutes (180 seconds) before disconnecting...")
time.sleep(DISCONNECT_DELAY_SECONDS)

print("üîå Attempting clean disconnect...")
try:
    # Prefer `unassign()`, the call for releasing a Colab runtime. Fall back to
    # `disconnect()` only if this Colab build provides it; if neither exists,
    # the AttributeError is reported by the handler below.
    release_runtime = getattr(runtime, "unassign", None) or runtime.disconnect
    release_runtime()
    print("‚úÖ Clean disconnect attempted. Waiting 5 seconds to verify...")
    time.sleep(5)
except Exception as e:
    print(f"‚ö†Ô∏è Clean disconnect failed: {e}")

# Final guarantee: forcefully kill the kernel process. This is only reached if
# the runtime is still alive after the release attempt above.
print("üíÄ Forcing runtime shutdown now...")
os.kill(os.getpid(), signal.SIGKILL)
