# GRPO Training for Italian Exercise Generator

## Setup

In [1]:
# --- Cell 1: Setup and Imports ---

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Navigate to your project directory
# Make sure this path is correct for your Google Drive setup
import os
project_path = '/content/drive/MyDrive/Colab Notebooks/italian_teacher'
os.chdir(project_path)
print(f"Changed directory to: {os.getcwd()}")

# Install dependencies (now includes google-generativeai for Gemini API)
!pip install -q transformers trl accelerate peft datasets spacy sentence-transformers bitsandbytes json5 openai google-generativeai tqdm nest_asyncio
!python -m spacy download it_core_news_sm
!pip install anthropic groq

# Standard library imports
import json
import random
from getpass import getpass

# Third-party imports
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# Local module imports
from src.rl.multi_reward_async import create_async_multi_reward
from src.rl.prompt_formatter import format_prompt_with_chat_template
from src.rl.reward_function import ExerciseRewardFunction

# Environment setup
os.environ["WANDB_DISABLED"] = "true"

print("\n--- Environment Setup ---")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Changed directory to: /content/drive/MyDrive/Colab Notebooks/italian_teacher
Collecting it-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.8.0/it_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

--- Environment Setup ---
PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB


In [2]:
# --- Cell 2: Configuration ---
# All training parameters are here for easy modification.

BASE_MODEL_PATH = "./models/italian_v8_grpo_round2"  # Input model for this training run
OUTPUT_DIR = "./models/TeacherPet_italian_grpo"      # Where the new model will be saved
NUM_SAMPLES = 2000                                    # Number of training requests to use
RANDOM_SEED = 44                                      # Seed for reproducibility

# Scorer settings
DISABLED_SCORERS = []          # No scorers disabled
FLUENCY_USE_LLM = False        # Use rule-based checks only (fast, free)

# --- GRPO Configuration ---
grpo_config = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=12,
    gradient_accumulation_steps=4,
    learning_rate=5e-6,
    warmup_steps=50,
    logging_steps=5,
    save_steps=100,
    save_total_limit=3,
    bf16=True,
    remove_unused_columns=False,  # the dataset carries an extra "request" column alongside "prompt"
    report_to="none",

    # GRPO-specific generation settings
    num_generations=3,
    max_completion_length=350,
    temperature=0.7,
    generation_batch_size=24,

    # Generation kwargs
    # NOTE(review): the token ids below are hard-coded; presumably they match
    # the tokenizer shipped with BASE_MODEL_PATH — confirm before swapping models.
    # NOTE(review): "padding_side" is normally a tokenizer setting rather than a
    # generation option; it may have no effect here — confirm against TRL docs.
    generation_kwargs={
        "bos_token_id": 128000,
        "do_sample": True,
        "eos_token_id": [128009, 128001],
        "pad_token_id": 128009,
        "top_p": 0.9,
        "padding_side": "left",
    },
)

# Report the fluency scorer's actual state. The status is tied to the fluency
# scorer specifically (not to "any scorer disabled") and the mode follows
# FLUENCY_USE_LLM instead of being hard-coded.
# Assumes the scorer is identified as "fluency" in DISABLED_SCORERS — TODO confirm.
if "fluency" in DISABLED_SCORERS:
    fluency_status = "Disabled"
else:
    fluency_status = f"Enabled ({'LLM-based' if FLUENCY_USE_LLM else 'rule-based'})"

print("✅ Configuration loaded.")
print(f"   Base model: {BASE_MODEL_PATH}")
print(f"   Output directory: {OUTPUT_DIR}")
print(f"   Training samples: {NUM_SAMPLES}")
print(f"   Fluency scorer: {fluency_status}")

✅ Configuration loaded.
   Base model: ./models/italian_v8_grpo_round2
   Output directory: ./models/TeacherPet_italian_grpo
   Training samples: 2000
   Fluency scorer: Enabled (rule-based)


In [3]:
# --- Cell 3: Helper Functions & Main Execution ---

def load_secrets_from_file():
    """
    Load API keys from a `.secrets.json` file into `os.environ`.

    Candidate locations are tried in order: a "Google Drive/My Drive" folder
    under the home directory, the Colab-mounted Drive root, then the current
    working directory. The first file that yields at least one usable key wins;
    files that are unreadable, malformed, or contain no usable keys are reported
    and skipped so the next location can be tried.

    A value is usable when it is a non-empty string that is not one of the
    template placeholders. Non-string values are skipped with a warning because
    environment variables can only hold strings.

    Returns:
        bool: True if at least one key was exported to the environment,
        False otherwise.
    """
    from pathlib import Path

    placeholders = {"your-openai-key-here", "your-google-key-here", ""}

    secrets_paths = [
        Path.home() / "Google Drive" / "My Drive" / ".secrets.json",  # Local path
        Path("/content/drive/My Drive/.secrets.json"),                # Colab path
        Path('.secrets.json')                                         # Current directory
    ]

    found_file = False
    for path in secrets_paths:
        if not path.exists():
            continue

        found_file = True
        print(f"✅ Loading API keys from {path}")
        try:
            with open(path, 'r', encoding='utf-8') as f:
                secrets = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both malformed JSON and undecodable bytes.
            print(f"⚠️  Could not read {path}: {exc}")
            continue

        if not isinstance(secrets, dict):
            print(f"⚠️  {path} does not contain a JSON object of key/value pairs; skipping.")
            continue

        loaded_keys = []
        for key, value in secrets.items():
            if not value:
                continue
            if not isinstance(value, str):
                # Only the key name is printed, never the value.
                print(f"⚠️  Skipping {key}: value is not a string.")
                continue
            if value in placeholders:
                continue
            os.environ[key] = value
            loaded_keys.append(key)

        if loaded_keys:
            print(f"   Loaded {len(loaded_keys)} API key(s)")
            return True

    if found_file:
        print("⚠️  .secrets.json found but it contained no usable API keys. "
              "Make sure API keys are in Colab secrets or environment.")
    else:
        print("⚠️  No .secrets.json found. Make sure API keys are in Colab secrets or environment.")
    return False


def load_training_data(tokenizer, num_samples: int, seed: int) -> "Dataset":
    """
    Load (or generate) training requests and build the training dataset.

    Requests are read from `src/rl/training_requests.json` when that file
    exists; otherwise `generate_training_requests` is called to create
    `num_samples` requests at that path. If more than `num_samples` requests are
    available, a seeded random subset of exactly `num_samples` is kept.

    Args:
        tokenizer: Tokenizer passed through to `format_prompt_with_chat_template`.
        num_samples: Maximum number of requests to keep.
        seed: Seed applied to the `random` module before sub-sampling.

    Returns:
        A `Dataset` with two aligned columns: "prompt" (formatted prompt per
        request) and "request" (the request it was built from).
    """
    requests_path = "src/rl/training_requests.json"

    if os.path.exists(requests_path):
        print(f"Loading existing training requests from {requests_path}...")
        # Explicit encoding so the result does not depend on the platform default.
        with open(requests_path, "r", encoding="utf-8") as f:
            training_requests = json.load(f)
    else:
        from src.rl.generate_training_requests import generate_training_requests
        print(f"Generating {num_samples} new training requests...")
        training_requests = generate_training_requests(
            num_requests=num_samples,
            output_path=requests_path
        )

    print(f"✅ Loaded {len(training_requests)} training requests.")

    # Sub-sample BEFORE formatting so prompts are only built for the requests
    # that are actually kept. The selected indices are the same as when
    # sampling after formatting, because they depend only on the count and seed.
    if len(training_requests) > num_samples:
        print(f"Sampling {num_samples} requests (seed={seed})...")
        random.seed(seed)
        random_indices = random.sample(range(len(training_requests)), num_samples)
        training_requests = [training_requests[i] for i in random_indices]

    # Format prompts (one per retained request, same order)
    prompts = [
        format_prompt_with_chat_template(req, tokenizer, add_examples=True)
        for req in training_requests
    ]

    return Dataset.from_dict({
        "prompt": prompts,
        "request": training_requests,
    })


print("=" * 80)
print("🚀 STARTING GRPO TRAINING")
print("=" * 80)

# Load API keys (exported into os.environ by the helper defined above)
print("\n--- Loading Secrets ---")
load_secrets_from_file()

# Load Model and Tokenizer
print("\n--- Loading Model ---")
print(f"Base model: {BASE_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, padding_side='left')
if tokenizer.pad_token is None:
    # Fall back to EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    # `torch_dtype` is deprecated in the installed transformers release (the
    # recorded run warns "`torch_dtype` is deprecated! Use `dtype` instead!").
    dtype=torch.bfloat16,
    device_map="auto",
    # NOTE(review): trust_remote_code executes Python shipped with the
    # checkpoint; confirm it is actually required for this local model.
    trust_remote_code=True,
    use_cache=False,
)
model.gradient_checkpointing_enable()
# Keep the model config in sync with the tokenizer's padding settings.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.padding_side = tokenizer.padding_side
print("✅ Model and tokenizer loaded.")

# Prepare Training Data
print("\n--- Preparing Training Data ---")
train_dataset = load_training_data(tokenizer, num_samples=NUM_SAMPLES, seed=RANDOM_SEED)

# Initialize Reward Function
print("\n--- Initializing Reward Function ---")
reward_fn_instance = ExerciseRewardFunction(
    device="cuda",
    disabled_scorers=DISABLED_SCORERS,
    fluency_use_llm=FLUENCY_USE_LLM,
    concurrency_limit=10  # High concurrency for speed
)
reward_func = create_async_multi_reward(reward_fn_instance, use_openai=True)
print("✅ Reward function ready.")


🚀 STARTING GRPO TRAINING

--- Loading Secrets ---
✅ Loading API keys from /content/drive/My Drive/.secrets.json
   Loaded 9 API key(s)

--- Loading Model ---
Base model: ./models/italian_v8_grpo_round2


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Model and tokenizer loaded.

--- Preparing Training Data ---
Loading existing training requests from src/rl/training_requests.json...
✅ Loaded 2000 training requests.

--- Initializing Reward Function ---
Loading spaCy model: it_core_news_sm...
✅ spaCy model loaded
Reward function will use device: cuda
     ✅ Gemini: 4 API key(s)
     ✅ OpenAI: configured
     ✅ Anthropic: configured
     ✅ Groq: configured
     ✅ DeepSeek: configured
     ✅ Cerebras: configured
  ✅ LLM API Handler initialized
     Providers: gemini, openai, anthropic, groq, deepseek, cerebras
     Total models: 12
Initializing scorers...
  ✅ LLM scoring enabled for cefr_alignment (batch size: 10)
  ✅ LLM scoring enabled for fluency (batch size: 10)
  ✅ LLM scoring enabled for grammar_correctness (batch size: 10)
  ✅ LLM scoring enabled for coherence (batch size: 10)
Loading sentence transformer for topic similarity...
✅ Sentence transformer loaded in cuda
  ✅ LLM topic checking enabled (OpenAI API)
✅ Reward function

In [None]:
# Build the GRPO trainer from the model, config, reward function, dataset and
# tokenizer prepared in the previous cells.
_RULE = "=" * 80  # horizontal rule used by the start/finish banners

print("\n--- Initializing GRPO Trainer ---")
trainer = GRPOTrainer(
    model=model,
    args=grpo_config,
    reward_funcs=reward_func,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
print("✅ GRPO Trainer initialized.")

# Run training, framed by a banner before and after.
print("\n" + _RULE, "🔥 TRAINING BEGINS", _RULE, sep="\n")
trainer.train()
print("\n" + _RULE, "🎉 TRAINING COMPLETE", _RULE, sep="\n")

# Persist the trained model and its tokenizer to the configured output directory.
print("\n--- Saving Model ---", f"Output directory: {OUTPUT_DIR}", sep="\n")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✅ Model saved successfully.")

The model is already on multiple devices. Skipping the move to device specified in `args`.



--- Initializing GRPO Trainer ---
✅ GRPO Trainer initialized.

🔥 TRAINING BEGINS


`generation_config` default values have been modified to match model-specific defaults: {'max_length': 8192}. If this is not desired, please set these values explicitly.



⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (54 total requests):
      🔵 Gemini: 18/54 (33.3%)
      🟢 Openai: 18/54 (33.3%)
      ⚡ Groq: 18/54 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.4s):
   Grammar   : min=0.0, max=60.0, avg=5.0
   Coherence : min=0.0, max=100.0, avg=53.5
   Topic     : min=6.7, max=100.0, avg=60.5
   Quality   : min=10.0, max=66.7, avg=40.8
   Diversity : min=20.0, max=100.0, avg=76.9
   TOTAL     : min=0.000, max=64.697, avg=24.538





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (45 total requests):
      🔵 Gemini: 15/45 (33.3%)
      🟢 Openai: 15/45 (33.3%)
      ⚡ Groq: 15/45 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.6s):
   Grammar   : min=0.0, max=100.0, avg=53.5
   Coherence : min=0.0, max=100.0, avg=60.8
   Topic     : min=12.0, max=100.0, avg=61.3
   Quality   : min=10.0, max=50.0, avg=25.5
   Diversity : min=12.0, max=80.0, avg=46.5
   TOTAL     : min=0.000, max=73.030, avg=19.037




Step,Training Loss
5,-0.013
10,-0.0302
15,0.0042



⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (45 total requests):
      🔵 Gemini: 15/45 (33.3%)
      🟢 Openai: 15/45 (33.3%)
      ⚡ Groq: 15/45 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.4s):
   Grammar   : min=0.0, max=90.0, avg=21.8
   Coherence : min=0.0, max=100.0, avg=40.3
   Topic     : min=0.0, max=100.0, avg=62.2
   Quality   : min=10.0, max=50.0, avg=34.3
   Diversity : min=28.3, max=100.0, avg=64.3
   TOTAL     : min=0.000, max=65.568, avg=17.582


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (45 total requests):
      🔵 Gemini: 15/45 (33.3%)
      🟢 Openai: 15/45 (33.3%)
      ⚡ Groq: 15/45 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.8s):
   Grammar   : min=0.0, max=80.0, avg=19.2
   Coherence : min=0.0, max=83.3, avg=39.2
   Topic     : min=0.0, max=100.0, avg=73.0
   Quality   : min=10.0, max=90.0, avg=36.0
   Diversity : min=28.3, max=80.0, avg=57.0
   TOTAL     : min=0.000, max=66.364, avg=20.813





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (42 total requests):
      🔵 Gemini: 14/42 (33.3%)
      🟢 Openai: 14/42 (33.3%)
      ⚡ Groq: 14/42 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (32.8s):
   Grammar   : min=0.0, max=100.0, avg=41.0
   Coherence : min=0.0, max=100.0, avg=43.0
   Topic     : min=6.7, max=100.0, avg=59.3
   Quality   : min=9.0, max=50.0, avg=28.0
   Diversity : min=30.0, max=80.0, avg=45.3
   TOTAL     : min=0.000, max=61.212, avg=13.270





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (18 total requests):
      🔵 Gemini: 6/18 (33.3%)
      🟢 Openai: 6/18 (33.3%)
      ⚡ Groq: 6/18 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (20.9s):
   Grammar   : min=0.0, max=84.0, avg=40.3
   Coherence : min=10.0, max=80.0, avg=40.0
   Topic     : min=54.0, max=94.0, avg=77.0
   Quality   : min=10.0, max=50.0, avg=27.3
   Diversity : min=48.0, max=76.0, avg=65.3
   TOTAL     : min=0.000, max=71.909, avg=7.465





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (54 total requests):
      🔵 Gemini: 18/54 (33.3%)
      🟢 Openai: 18/54 (33.3%)
      ⚡ Groq: 18/54 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.9s):
   Grammar   : min=0.0, max=90.0, avg=29.3
   Coherence : min=0.0, max=100.0, avg=42.6
   Topic     : min=20.0, max=100.0, avg=63.4
   Quality   : min=5.0, max=50.0, avg=27.0
   Diversity : min=15.0, max=100.0, avg=52.2
   TOTAL     : min=0.000, max=67.727, avg=19.753





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (54 total requests):
      🔵 Gemini: 18/54 (33.3%)
      🟢 Openai: 18/54 (33.3%)
      ⚡ Groq: 18/54 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.5s):
   Grammar   : min=0.0, max=90.0, avg=46.6
   Coherence : min=16.7, max=87.5, avg=49.7
   Topic     : min=20.0, max=100.0, avg=76.9
   Quality   : min=10.0, max=50.0, avg=38.0
   Diversity : min=24.0, max=100.0, avg=67.7
   TOTAL     : min=0.000, max=66.667, avg=25.272


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (45 total requests):
      🔵 Gemini: 15/45 (33.3%)
      🟢 Openai: 15/45 (33.3%)
      ⚡ Groq: 15/45 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.9s):
   Grammar   : min=0.0, max=90.0, avg=18.3
   Coherence : min=0.0, max=100.0, avg=59.2
   Topic     : min=10.0, max=100.0, avg=61.6
   Quality   : min=5.0, max=51.2, avg=34.8
   Diversity : min=20.0, max=100.0, avg=50.7
   TOTAL     : min=0.000, max=65.152, avg=12.851





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (45 total requests):
      🔵 Gemini: 15/45 (33.3%)
      🟢 Openai: 15/45 (33.3%)
      ⚡ Groq: 15/45 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.6s):
   Grammar   : min=0.0, max=92.0, avg=30.3
   Coherence : min=0.0, max=100.0, avg=35.7
   Topic     : min=0.0, max=100.0, avg=70.8
   Quality   : min=10.0, max=50.0, avg=31.5
   Diversity : min=12.0, max=70.0, avg=55.6
   TOTAL     : min=0.000, max=61.455, avg=18.130





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (45 total requests):
      🔵 Gemini: 15/45 (33.3%)
      🟢 Openai: 15/45 (33.3%)
      ⚡ Groq: 15/45 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.6s):
   Grammar   : min=0.0, max=90.0, avg=49.3
   Coherence : min=0.0, max=100.0, avg=60.9
   Topic     : min=13.3, max=100.0, avg=76.8
   Quality   : min=10.0, max=50.0, avg=31.6
   Diversity : min=24.0, max=80.0, avg=52.7
   TOTAL     : min=0.000, max=76.364, avg=18.359


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (42 total requests):
      🔵 Gemini: 14/42 (33.3%)
      🟢 Openai: 14/42 (33.3%)
      ⚡ Groq: 14/42 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.8s):
   Grammar   : min=0.0, max=57.5, avg=15.2
   Coherence : min=0.0, max=100.0, avg=56.2
   Topic     : min=20.0, max=100.0, avg=76.6
   Quality   : min=10.0, max=50.0, avg=39.3
   Diversity : min=15.0, max=80.0, avg=60.0
   TOTAL     : min=0.000, max=64.432, avg=17.541





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (27 total requests):
      🔵 Gemini: 9/27 (33.3%)
      🟢 Openai: 9/27 (33.3%)
      ⚡ Groq: 9/27 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (22.3s):
   Grammar   : min=0.0, max=90.0, avg=37.2
   Coherence : min=0.0, max=75.0, avg=34.7
   Topic     : min=10.0, max=86.7, avg=44.7
   Quality   : min=10.0, max=50.0, avg=25.6
   Diversity : min=35.0, max=100.0, avg=62.2
   TOTAL     : min=0.000, max=70.606, avg=10.872





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (27 total requests):
      🔵 Gemini: 9/27 (33.3%)
      🟢 Openai: 9/27 (33.3%)
      ⚡ Groq: 9/27 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (24.7s):
   Grammar   : min=0.0, max=90.0, avg=57.0
   Coherence : min=0.0, max=100.0, avg=52.7
   Topic     : min=60.0, max=100.0, avg=92.6
   Quality   : min=10.0, max=50.0, avg=28.6
   Diversity : min=53.3, max=88.0, avg=72.0
   TOTAL     : min=0.000, max=84.091, avg=15.478


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      🔵 Gemini: 11/33 (33.3%)
      🟢 Openai: 11/33 (33.3%)
      ⚡ Groq: 11/33 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (26.4s):
   Grammar   : min=0.0, max=60.0, avg=15.6
   Coherence : min=20.0, max=100.0, avg=48.6
   Topic     : min=10.0, max=93.3, avg=56.5
   Quality   : min=10.0, max=51.7, avg=39.8
   Diversity : min=12.0, max=86.7, avg=51.4
   TOTAL     : min=0.000, max=63.182, avg=9.521





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🔵 Gemini: 12/36 (33.3%)
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (29.4s):
   Grammar   : min=0.0, max=90.0, avg=35.8
   Coherence : min=0.0, max=100.0, avg=59.2
   Topic     : min=20.0, max=100.0, avg=77.2
   Quality   : min=8.3, max=66.7, avg=41.7
   Diversity : min=40.0, max=86.7, avg=61.7
   TOTAL     : min=0.000, max=64.848, avg=15.804





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🔵 Gemini: 12/36 (33.3%)
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (28.0s):
   Grammar   : min=0.0, max=90.0, avg=24.6
   Coherence : min=0.0, max=100.0, avg=38.2
   Topic     : min=13.3, max=100.0, avg=62.6
   Quality   : min=10.0, max=50.0, avg=28.6
   Diversity : min=28.3, max=80.0, avg=50.8
   TOTAL     : min=0.000, max=58.295, avg=12.186





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (45 total requests):
      🟢 Openai: 18/45 (40.0%)
      ⚡ Groq: 18/45 (40.0%)
      🔵 Gemini: 9/45 (20.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (69.7s):
   Grammar   : min=0.0, max=90.0, avg=48.1
   Coherence : min=0.0, max=90.0, avg=43.1
   Topic     : min=46.7, max=100.0, avg=83.3
   Quality   : min=5.0, max=50.0, avg=28.0
   Diversity : min=33.3, max=100.0, avg=64.9
   TOTAL     : min=0.000, max=82.182, avg=27.440





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (24 total requests):
      🟢 Openai: 12/24 (50.0%)
      ⚡ Groq: 12/24 (50.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (86.9s):
   Grammar   : min=50.0, max=50.0, avg=50.0
   Coherence : min=0.0, max=83.3, avg=30.5
   Topic     : min=16.0, max=100.0, avg=59.0
   Quality   : min=10.0, max=50.0, avg=32.1
   Diversity : min=15.0, max=100.0, avg=54.2
   TOTAL     : min=0.000, max=68.939, avg=13.411


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (28 total requests):
      ⚡ Groq: 15/28 (53.6%)
      🟢 Openai: 13/28 (46.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (96.1s):
   Grammar   : min=50.0, max=50.0, avg=50.0
   Coherence : min=0.0, max=87.5, avg=38.3
   Topic     : min=16.0, max=100.0, avg=61.9
   Quality   : min=10.0, max=50.0, avg=30.2
   Diversity : min=30.0, max=85.0, avg=54.1
   TOTAL     : min=0.000, max=66.727, avg=15.853





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (18 total requests):
      🟢 Openai: 9/18 (50.0%)
      ⚡ Groq: 9/18 (50.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (51.9s):
   Grammar   : min=50.0, max=50.0, avg=50.0
   Coherence : min=0.0, max=100.0, avg=64.8
   Topic     : min=36.7, max=96.7, avg=64.1
   Quality   : min=10.0, max=66.7, avg=38.5
   Diversity : min=30.0, max=80.0, avg=50.0
   TOTAL     : min=0.000, max=47.576, avg=7.294





⏳ Step 1/3: Parsing 36 JSON completions...


                                                    

⏳ Step 2/3: Scoring 36 completions with batched reward function...





   📊 Model Usage Distribution (38 total requests):
      🟢 Openai: 19/38 (50.0%)
      ⚡ Groq: 19/38 (50.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (104.8s):
   Grammar   : min=50.0, max=50.0, avg=50.0
   Coherence : min=0.0, max=100.0, avg=56.2
   Topic     : min=20.0, max=100.0, avg=74.6
   Quality   : min=5.0, max=62.5, avg=33.5
   Diversity : min=25.3, max=86.7, avg=50.5
   TOTAL     : min=0.000, max=79.811, avg=25.124


