In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Navigate to project
%cd /content/drive/MyDrive/Colab\ Notebooks/italian_teacher

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/italian_teacher


In [2]:
# Install dependencies
!pip install -q transformers trl accelerate peft datasets spacy sentence-transformers bitsandbytes json5 openai tqdm nest_asyncio
!python -m spacy download it_core_news_sm

Collecting it-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.8.0/it_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m145.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import json
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: NVIDIA L4


In [None]:
import os
from getpass import getpass

# You can enable/disable OpenAI here:
USE_OPENAI = True  # Set to False for faster training without OpenAI

OPENAI_API_KEY = getpass("Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

✅ OpenAI API enabled - Professional quality with async batching
   OPTIMIZED: Samples 1 exercise/completion (70% reduction in API calls)
   Expected training time: ~2-3 hours


In [5]:
from src.rl.reward_function import ExerciseRewardFunction
from src.rl.prompt_formatter import format_prompt_with_chat_template  # ← ROUND 3: Enhanced V1 (not V3!)
from src.rl.multi_reward_async import create_async_multi_reward
import os



reward_fn = ExerciseRewardFunction(device="cuda")
print("✅ Reward function ready (running on GPU)")




Loading spaCy model: it_core_news_sm...
✅ spaCy model loaded
Reward function will use device: cuda
Initializing scorers...
Pre-loading CEFR vocabulary (16,887 words)...
✅ Loaded 16887 Italian words from vocabulary list
✅ Loaded vocabulary for all CEFR levels
  ✅ OpenAI validation enabled for B2+ levels
  ✅ LLM fluency checking enabled (OpenAI API)
  ✅ LLM grammar checking enabled (OpenAI API)
Loading sentence transformer for topic similarity...
✅ Sentence transformer loaded in cuda
  ✅ LLM topic checking enabled (OpenAI API)
  ✅ LLM coherence checking enabled (OpenAI API)
✅ Reward function initialized with 8 professional scorers
✅ Reward function ready (running on GPU)


In [6]:
training_requests = [  {
    "level": "A2",
    "grammar_focus": "past_tense",
    "topic": "abbigliamento",
    "num_exercises": 4,
    "exercise_types": [
      "multiple_choice",
      "fill_in_blank",
      "translation"
    ]
  },
  {
    "level": "B2",
    "grammar_focus": "conditional",
    "topic": "casa",
    "num_exercises": 3,
    "exercise_types": [
      "fill_in_blank"
    ]
  },
  {
    "level": "A2",
    "grammar_focus": "subjunctive",
    "topic": "casa",
    "num_exercises": 5,
    "exercise_types": [
      "translation",
      "multiple_choice",
      "fill_in_blank"
    ]
  },
  {
    "level": "B2",
    "grammar_focus": "conditional",
    "topic": "inganno",
    "num_exercises": 5,
    "exercise_types": [
      "translation"
    ]
  }]

In [7]:
import random
from datasets import Dataset
from transformers import AutoTokenizer

# ROUND 3: Start from Round 2 model (86.5/100 baseline)
# models/italian_v8_grpo_round2
MODEL_PATH = "./models/italian_v8_grpo_round2"  # ← Round 2 GRPO model (best so far)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

print("=" * 80)
print("📋 ROUND 3 DATASET PREPARATION")
print("=" * 80)

# Use V3 prompt formatter with ENHANCED guidance!
prompts = [
    format_prompt_with_chat_template(req, tokenizer, add_examples=True)
    for req in training_requests
]

reward_func = create_async_multi_reward(
    reward_fn,
    use_openai=USE_OPENAI,
    openai_batch_size=4,
    soft_penalties=False
)

📋 ROUND 3 DATASET PREPARATION


In [8]:
# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model with MEMORY OPTIMIZATIONS
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    use_cache=False,  # ⚠️ Disable KV cache during training (saves memory)
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# 1. Tokenize all prompts at once
#    - padding=True ensures all sequences in the batch have the same length.
#    - truncation=True prevents prompts from being longer than the model's max length.
print("Tokenizing prompts in a batch...")
inputs = tokenizer(
    prompts,
    return_tensors="pt",
    padding=True,
    truncation=True
).to(model.device)

# 2. Generate all outputs in a single, parallelized batch call
print(f"Generating {len(prompts)} completions in a single batch...")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=350,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
print("Generation complete.")

# 3. Decode all generated texts at once
#    Use batch_decode for efficiency.
generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print generated exercises
for i, gen in enumerate(generated_texts):
    # The generated text will include the original prompt. We can clean it up.
    clean_gen = gen[len(prompts[i]):].strip()
    print(f"\n--- Generated Exercise {i+1} ---\n{clean_gen}\n")

# Now you can pass the full lists to your reward function
# rewards = reward_func(prompts=prompts, completions=generated_texts, request=training_requests)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Tokenizing prompts in a batch...
Generating 4 completions in a single batch...
Generation complete.

--- Generated Exercise 1 ---
n": "past_tense of 'comprare'" },
  {"type": "fill_in_blank", "question": "Ieri, ___ ho portato ___ camicia preferita.", "correct_answer": "ho", "options": null, "explanation": "past_tense of 'portare'" },
  {"type": "fill_in_blank", "question": "La settimana scorsa, ___ ho cambiato ___ vestito.", "correct_answer": "ho", "options": null, "explanation": "past_tense of 'cambiare'" },
  {"type": "fill_in_blank", "question": "L'anno scorso, ___ non ho indossato ___ scarpa con la giacca.", "correct_answer": "non ho", "options": null, "explanation": "past_tense of 'indossare'" }
]


--- Generated Exercise 2 ---
ion": "Subject-verb agreement in conditional" },
  {"type": "fill_in_blank", "question": "Se ___ avessi una casa, avrei potuto organizzare una festa.", "correct_answer": "tu", "options": null, "explanation": "Subject-verb agreement in conditional" },
  {"ty

In [10]:
# Apply reward function to each generated output
# The reward_func is a synchronous function that handles async internally.
# Just call it directly.
print("\nEvaluating rewards...")
rewards = reward_func(prompts=prompts, completions=generated_texts, request=training_requests) # Assuming 'requests' is available

# Print rewards
print("\n--- Rewards ---")
for i, r in enumerate(rewards):
    print(f"Reward for Exercise {i+1}: {r:.4f}")



Evaluating rewards...

⏳ Step 1/3: Parsing 4 JSON completions...




   Checking 4 exercises (1 per completion, ~70% reduction)...




⏳ Step 3/3: Computing rule-based and grammar rewards (in parallel)...


Computing Rewards:   0%|          | 0/4 [00:00<?, ?it/s]

LLM coherence SUCCESS
LLM coherence SUCCESS
LLM coherence SUCCESS
LLM coherence SUCCESS
LLM coherence SUCCESS
LLM coherence SUCCESS
LLM coherence SUCCESS
LLM coherence SUCCESS
LLM coherence SUCCESS
LLM coherence SUCCESS
LLM grammer SUCCESS
LLM grammer SUCCESS
LLM grammer SUCCESS
LLM grammer SUCCESS
LLM grammer SUCCESS
LLM grammer SUCCESS
LLM grammer SUCCESS
LLM coherence SUCCESS


Computing Rewards:  25%|██▌       | 1/4 [00:03<00:10,  3.56s/it]

LLM coherence SUCCESS
LLM grammer SUCCESS
LLM grammer SUCCESS
LLM grammer SUCCESS


Computing Rewards:  50%|█████     | 2/4 [00:03<00:03,  1.59s/it]

LLM grammer SUCCESS


                                                                

LLM grammer SUCCESS

🎯 Multi-Reward (Async OpenAI, batch=4, 5.4s):
   Grammar   : min=0.500, max=0.500, avg=0.500 (weight=2.0)
   Coherence : min=1.000, max=1.000, avg=1.000 (weight=2.5)
   Topic     : min=0.067, max=0.467, avg=0.300 (weight=1.5)
   Quality   : min=0.792, max=0.843, avg=0.812 (weight=1.0)
   Diversity : min=1.000, max=1.000, avg=1.000 (weight=0.5)
   TOTAL     : min=4.892, max=5.523, avg=5.263

--- Rewards ---
Reward for Exercise 1: 5.1433
Reward for Exercise 2: 5.4917
Reward for Exercise 3: 5.5233
Reward for Exercise 4: 4.8917


