# GRPO Training for Italian Exercise Generator

## Setup

In [1]:
# Mount Google Drive at /content/drive (Colab only; requires interactive
# authorization in the browser).
# NOTE(review): the recorded run of this cell ended with
# "MessageError: Error: credential propagation was unsuccessful", i.e. the
# mount did not complete. Re-run and finish the authorization prompt before
# continuing.
from google.colab import drive
drive.mount('/content/drive')

# Navigate to project
# The cells that follow use relative paths (src/rl/..., ./models/...), so they
# presumably depend on this directory change having succeeded.
%cd /content/drive/MyDrive/Colab\ Notebooks/italian_teacher

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Install dependencies
!pip install -q transformers trl accelerate peft datasets spacy sentence-transformers bitsandbytes json5 openai tqdm nest_asyncio
!python -m spacy download it_core_news_sm

In [None]:
import json
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset

# Keep Weights & Biases logging switched off for this run.
os.environ["WANDB_DISABLED"] = "true"

# Report the runtime up front so a CPU-only session is noticed before any
# model is loaded.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
import os
from getpass import getpass

# Toggle the OpenAI-backed reward here.
USE_OPENAI = True  # Set to False for faster training without OpenAI

if USE_OPENAI:
    # Never hardcode the key. Reuse a non-empty value already present in the
    # environment; otherwise prompt for it without echoing. An empty value is
    # treated as missing so that a previously blank key is not silently kept.
    if not os.environ.get("OPENAI_API_KEY"):
        OPENAI_API_KEY = getpass("OpenAI API key: ").strip()
        if not OPENAI_API_KEY:
            # Fail here rather than later, in the middle of training, on the
            # first OpenAI request.
            raise ValueError(
                "USE_OPENAI is True but no OpenAI API key was provided. "
                "Set OPENAI_API_KEY in the environment or set USE_OPENAI = False."
            )
        os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    print("✅ OpenAI API enabled - Professional quality with async batching")
    print("   OPTIMIZED: Samples 1 exercise/completion (70% reduction in API calls)")
    print("   Expected training time: ~2-3 hours")
else:
    # Remove any key left over from an earlier run so that nothing downstream
    # can pick it up while OpenAI is disabled.
    os.environ.pop("OPENAI_API_KEY", None)
    print("✅ OpenAI API disabled - Fast rule-based rewards")
    print("   Expected training time: ~60-90 min")

## Load Reward Function

In [None]:
# Project-local reward components (package `src.rl`).
# NOTE(review): presumably these resolve only when the kernel's working
# directory is the project root - confirm the Setup cell ran successfully.
from src.rl.reward_function import ExerciseRewardFunction
from src.rl.prompt_formatter import format_prompt_with_chat_template  # ROUND 3: Enhanced V1 (not V3!)
from src.rl.multi_reward_async import create_async_multi_reward
import os

# Instantiate the reward function on the GPU.
# NOTE(review): the device is hard-coded to "cuda"; whether the class accepts
# "cpu" as a fallback is not visible from this notebook - confirm before
# running on a runtime without a GPU.
reward_fn = ExerciseRewardFunction(device="cuda")


## Load Training Requests

In [None]:
import os

# Load the pre-generated training requests, generating them on first use.
# The path is defined once so the existence check, the read and the generator's
# output location can never point at different files.
TRAINING_REQUESTS_PATH = "src/rl/training_requests.json"

if os.path.exists(TRAINING_REQUESTS_PATH):
    print("Loading existing training requests...")
    # Explicit UTF-8: JSON is UTF-8 by specification, and the platform default
    # encoding is not guaranteed to be UTF-8 on every runtime.
    with open(TRAINING_REQUESTS_PATH, "r", encoding="utf-8") as f:
        training_requests = json.load(f)
else:
    # Imported lazily: the generator module is only needed when no cached
    # request file exists yet.
    from src.rl.generate_training_requests import generate_training_requests
    print("Generating new training requests...")
    training_requests = generate_training_requests(
        num_requests=2000,
        output_path=TRAINING_REQUESTS_PATH,
    )

print(f"✅ Loaded {len(training_requests)} training requests")

In [None]:
import random
from datasets import Dataset
from transformers import AutoTokenizer

# ROUND 3: continue from the Round 2 GRPO checkpoint (86.5/100 baseline, best so far).
MODEL_PATH = "./models/italian_v8_grpo_round2"
# Tokenizer used here only to apply the chat template while building prompts.
temp_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

banner = "=" * 80
print(banner)
print("📋 ROUND 3 DATASET PREPARATION")
print(banner)

# Upper bound on the number of prompts used for Round 3.
ROUND3_SIZE = 2000

# One chat-template prompt per training request (add_examples=True).
prompts = [
    format_prompt_with_chat_template(request, temp_tokenizer, add_examples=True)
    for request in training_requests
]

if len(prompts) <= ROUND3_SIZE:
    # Nothing to trim: keep every request in its original order.
    training_requests_subset = training_requests
else:
    # Seeded draw (seed 44) so the same subset is selected on every run.
    random.seed(44)
    random_indices = random.sample(range(len(prompts)), ROUND3_SIZE)
    training_requests_subset = [training_requests[idx] for idx in random_indices]
    prompts = [prompts[idx] for idx in random_indices]

# Prompts and their originating requests stay aligned row by row.
dataset_columns = {
    "prompt": prompts,
    "request": training_requests_subset,
}
train_dataset = Dataset.from_dict(dataset_columns)


In [None]:
# Options for the async multi-reward wrapper built around `reward_fn`.
reward_options = {
    "use_openai": USE_OPENAI,     # follows the toggle chosen in the OpenAI setup cell
    "openai_batch_size": 20,
    "soft_penalties": False,
}
reward_func = create_async_multi_reward(reward_fn, **reward_options)


In [None]:
# Training tokenizer; fall back to the EOS token for padding when the
# checkpoint does not define a pad token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Memory-oriented load options.
model_load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "trust_remote_code": True,
    "use_cache": False,  # ⚠️ KV cache disabled during training (saves memory)
}
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **model_load_kwargs)

# Gradient checkpointing trades extra compute for lower memory use.
model.gradient_checkpointing_enable()


In [None]:
# GRPO configuration - ROUND 3 (target: 90+ score)

# Values that are passed both as GRPOConfig fields and inside generation_kwargs
# are defined once here so the two copies cannot drift apart.
MAX_COMPLETION_LENGTH = 350   # reduced from 1000 to prevent rambling
SAMPLING_TEMPERATURE = 0.7

grpo_config = GRPOConfig(
    output_dir="./models/italian_grpo_v4",  # Round 3 output

    # Training schedule
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,

    # Learning rate
    # NOTE(review): this was previously annotated "lower than Round 2 (5e-6)",
    # but 9e-6 is HIGHER than 5e-6. The value is left unchanged here; confirm
    # whether 9e-6 is intended or a smaller rate was meant.
    learning_rate=9e-6,
    warmup_steps=50,

    # Logging & checkpoints
    logging_steps=5,     # frequent logging to catch issues early
    save_steps=100,
    save_total_limit=3,

    # Precision
    bf16=True,
    remove_unused_columns=False,

    # Disable wandb
    report_to="none",

    # GRPO-specific
    num_generations=4,
    max_completion_length=MAX_COMPLETION_LENGTH,
    temperature=SAMPLING_TEMPERATURE,
    generation_batch_size=32,       # equals 8 (batch) * 4 (accumulation)

    # Generation settings / stop tokens.
    # NOTE(review): the token ids are hard-coded and presumably match the
    # special tokens of the checkpoint's tokenizer - verify against
    # tokenizer.eos_token_id / tokenizer.pad_token_id.
    generation_kwargs={
        "bos_token_id": 128000,
        "do_sample": True,
        # 128009 was listed twice in the original list; duplicates have no
        # effect on stopping, so each id appears once.
        "eos_token_id": [128009, 128001],
        "max_new_tokens": MAX_COMPLETION_LENGTH,
        "pad_token_id": 128009,
        "temperature": SAMPLING_TEMPERATURE,
        "top_p": 0.9,
    },
)

## Initialize GRPO Trainer

In [None]:
# Wire the model, configuration, reward function, dataset and tokenizer into
# the GRPO trainer.
trainer_kwargs = {
    "model": model,
    "args": grpo_config,
    "reward_funcs": reward_func,
    "train_dataset": train_dataset,
    "processing_class": tokenizer,
}
trainer = GRPOTrainer(**trainer_kwargs)

print("\n✅ GRPO Trainer initialized", "   Ready to start training!", sep="\n")

## Start Training

In [None]:
# Run GRPO training.
trainer.train()


# Save the final model and tokenizer.
# The destination is read from the trainer's own arguments instead of repeating
# the path literal, so the final save always lands in the directory the
# trainer was configured with.
output_dir = trainer.args.output_dir
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
