# GRPO Training for Italian Exercise Generator

## Setup

In [None]:
# --- Cell 1: Setup and Imports ---

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Navigate to your project directory
# Make sure this path is correct for your Google Drive setup
import os
project_path = '/content/drive/MyDrive/Colab Notebooks/italian_teacher'
os.chdir(project_path)
print(f"Changed directory to: {os.getcwd()}")

# trl imported from local fork
# !pip install -e /content/drive/MyDrive/Colab\ Notebooks/trl

# Install dependencies (now includes google-generativeai for Gemini API)
!pip install -q trl transformers accelerate peft datasets spacy sentence-transformers bitsandbytes json5 openai google-generativeai tqdm nest_asyncio
!python -m spacy download it_core_news_sm
!pip install anthropic groq

# Standard library imports
import json
import random
from getpass import getpass

# Third-party imports
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# Local module imports
from src.rl.multi_reward_async import create_async_multi_reward
from src.rl.prompt_formatter import format_prompt_with_chat_template
from src.rl.reward_function import ExerciseRewardFunction

# Environment setup
os.environ["WANDB_DISABLED"] = "true"

print("\n--- Environment Setup ---")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Changed directory to: /content/drive/MyDrive/Colab Notebooks/italian_teacher
Collecting it-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.8.0/it_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

--- Environment Setup ---
PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB


In [None]:
# --- Cell 2: Configuration ---
# All training parameters are here for easy modification.

BASE_MODEL_PATH = "./models/italian_v8_grpo_round2"  # Input model for this training run
OUTPUT_DIR = "./models/TeacherPet_italian_grpo"      # Where the new model will be saved
NUM_SAMPLES = 800                                    # Number of training requests to use
RANDOM_SEED = 44                                      # Seed for reproducibility

# Scorer settings
DISABLED_SCORERS = []          # No scorers disabled
FLUENCY_USE_LLM = False        # Use rule-based checks only (fast, free)

# --- GRPO Configuration ---
grpo_config = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=12,  # Reduce from 12
    gradient_accumulation_steps=6,  # Increase to keep same effective batch size
    learning_rate=5e-6,  # Increase from 5e-6
    warmup_steps=50,
    logging_steps=5,
    save_steps=25,
    save_total_limit=3,
    bf16=True,
    remove_unused_columns=False,
    report_to="none",

    # GRPO-specific generation settings
    num_generations=4,  # Keep this
    max_prompt_length=1024,
    max_completion_length=350,
    temperature=0.9,  # Increase for diversity
    generation_batch_size=12,
    beta=0.05,

    # Generation kwargs - CRITICAL FIX
    generation_kwargs={
        "bos_token_id": 128000,
        "do_sample": True,  # ✅ MUST BE TRUE!
        "eos_token_id": [128009, 128001],
        "temperature": 0.8,
        "top_p": 0.9,
        "top_k": 50,
        "padding_side": "left",

    }
)

print("✅ Configuration loaded.")
print(f"   Base model: {BASE_MODEL_PATH}")
print(f"   Output directory: {OUTPUT_DIR}")
print(f"   Training samples: {NUM_SAMPLES}")
print(f"   Max prompt length: 2048 tokens (prevents truncation)")
print(f"   Fluency scorer: {'Enabled (rule-based)' if not DISABLED_SCORERS else 'Disabled'}")

✅ Configuration loaded.
   Base model: ./models/italian_v8_grpo_round2
   Output directory: ./models/TeacherPet_italian_grpo
   Training samples: 1200
   Max prompt length: 2048 tokens (prevents truncation)
   Fluency scorer: Enabled (rule-based)


In [None]:
# --- Cell 3: Helper Functions & Main Execution ---

def load_secrets_from_file():
    """
    Load API keys from .secrets.json file if it exists.
    Checks multiple locations: Google Drive root, then current directory.
    """
    from pathlib import Path

    secrets_paths = [
        Path.home() / "Google Drive" / "My Drive" / ".secrets.json",  # Local path
        Path("/content/drive/My Drive/.secrets.json"),                # Colab path
        Path('.secrets.json')                                         # Current directory
    ]

    for path in secrets_paths:
        if path.exists():
            print(f"✅ Loading API keys from {path}")
            with open(path, 'r') as f:
                secrets = json.load(f)

            loaded_keys = []
            for key, value in secrets.items():
                if value and value not in ["your-openai-key-here", "your-google-key-here", ""]:
                    os.environ[key] = value
                    loaded_keys.append(key)

            if loaded_keys:
                print(f"   Loaded {len(loaded_keys)} API key(s)")
                return True

    print("⚠️  No .secrets.json found. Make sure API keys are in Colab secrets or environment.")
    return False


def load_training_data(tokenizer, num_samples: int, seed: int):
    """Load or generate training requests and prepare dataset."""
    requests_path = "src/rl/training_requests.json"

    if os.path.exists(requests_path):
        print(f"Loading existing training requests from {requests_path}...")
        with open(requests_path, "r") as f:
            training_requests = json.load(f)
    else:
        from src.rl.generate_training_requests import generate_training_requests
        print(f"Generating {num_samples} new training requests...")
        training_requests = generate_training_requests(
            num_requests=num_samples,
            output_path=requests_path
        )

    print(f"✅ Loaded {len(training_requests)} training requests.")

    # Format prompts
    prompts = [
        format_prompt_with_chat_template(req, tokenizer, add_examples=True)
        for req in training_requests
    ]

    # Sample if needed
    if len(prompts) > num_samples:
        print(f"Sampling {num_samples} requests (seed={seed})...")
        random.seed(seed)
        random_indices = random.sample(range(len(prompts)), num_samples)
        prompts = [prompts[i] for i in random_indices]
        training_requests = [training_requests[i] for i in random_indices]

    return Dataset.from_dict({
        "prompt": prompts,
        "request": training_requests,
    })


print("=" * 80)
print("🚀 STARTING GRPO TRAINING")
print("=" * 80)

# Load API keys
print("\n--- Loading Secrets ---")
load_secrets_from_file()

# Load Model and Tokenizer
print(f"\n--- Loading Model ---")
print(f"Base model: {BASE_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, padding_side='left')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    use_cache=False,
)
model.gradient_checkpointing_enable()
model.config.pad_token_id = tokenizer.pad_token_id
model.config.padding_side = tokenizer.padding_side
print("✅ Model and tokenizer loaded.")

# Prepare Training Data
print("\n--- Preparing Training Data ---")
train_dataset = load_training_data(tokenizer, num_samples=NUM_SAMPLES, seed=RANDOM_SEED)

# Initialize Reward Function
print("\n--- Initializing Reward Function ---")
reward_fn_instance = ExerciseRewardFunction(
    device="cuda",
    disabled_scorers=DISABLED_SCORERS,
    fluency_use_llm=FLUENCY_USE_LLM,
    concurrency_limit=5  # High concurrency for speed
)
reward_func = create_async_multi_reward(reward_fn_instance, use_openai=True)
print("✅ Reward function ready.")

print(f'Tokenizer type: {type(tokenizer)}')
print(f'Padding side: {tokenizer.padding_side}')
if hasattr(tokenizer, 'tokenizer'):
    print(f'Has sub-tokenizer: {tokenizer.tokenizer.padding_side}')


🚀 STARTING GRPO TRAINING

--- Loading Secrets ---
✅ Loading API keys from /content/drive/My Drive/.secrets.json
   Loaded 9 API key(s)

--- Loading Model ---
Base model: ./models/italian_v8_grpo_round2


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Model and tokenizer loaded.

--- Preparing Training Data ---
Loading existing training requests from src/rl/training_requests.json...
✅ Loaded 2000 training requests.
Sampling 1200 requests (seed=44)...

--- Initializing Reward Function ---
Loading spaCy model: it_core_news_sm...
✅ spaCy model loaded
Reward function will use device: cuda
     ✅ Gemini: 4 API key(s)
     ✅ OpenAI: configured
     ✅ Anthropic: configured
     ✅ Groq: configured
     ✅ DeepSeek: configured
     ✅ Cerebras: configured
  ✅ LLM API Handler initialized
     Providers: gemini, openai, anthropic, groq, deepseek, cerebras
     Total models: 12
Initializing scorers...
  ✅ LLM scoring enabled for cefr_alignment (batch size: 10)
  ✅ LLM scoring enabled for fluency (batch size: 10)
  ✅ LLM scoring enabled for grammar_correctness (batch size: 10)
  ✅ LLM scoring enabled for coherence (batch size: 10)
Loading sentence transformer for topic similarity...
✅ Sentence transformer loaded in cuda
  ✅ LLM topic checking en

In [None]:
# --- Add Validation Tracking ---
from src.rl.validation_callback import ValidationCallback, select_validation_samples
from src.rl.prompt_formatter import format_prompt_with_chat_template

# Select 10 diverse validation samples
validation_samples = select_validation_samples(
    training_requests_path="src/rl/training_requests.json",
    num_samples=10,  # Adjust this number (5-15 recommended)
    seed=RANDOM_SEED
)

# Format prompts for validation samples
validation_prompts = [
    format_prompt_with_chat_template(req, tokenizer, add_examples=True)
    for req in validation_samples
]

# Create validation callback
validation_callback = ValidationCallback(
    validation_samples=validation_samples,
    validation_prompts=validation_prompts,
    reward_function=reward_func,
    tokenizer=tokenizer,
    output_dir=OUTPUT_DIR,
    num_generations=3  # Generate 3 completions per sample for comparison
)

print("✅ Validation callback ready.")


📊 Selecting 10 validation samples from src/rl/training_requests.json...
   Total requests available: 2000
   Exercise types: unknown
   ✅ Selected 10 diverse samples
   Distribution: {'unknown': 10}

📊 Validation Callback initialized:
   10 validation samples
   3 generations per sample
   Results will be saved to: models/TeacherPet_italian_grpo/validation_results
✅ Validation callback ready.


In [None]:
import trl
print(f"TRL version: {trl.__version__}")


TRL version: 0.24.0


In [6]:
# Initialize Trainer
print("\n--- Initializing GRPO Trainer ---")
trainer = GRPOTrainer(
    model=model,
    args=grpo_config,
    reward_funcs=reward_func,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    callbacks=[validation_callback],
)

# After: trainer = GRPOTrainer(...)
print(f"\n🔍 GRPO Generation Settings:")
print(f"   num_generations (config): {grpo_config.num_generations}")
print(f"   generation_batch_size: {grpo_config.generation_batch_size}")
if hasattr(trainer, 'generation_config'):
    print(f"   trainer.generation_config: {trainer.generation_config}")
print("✅ GRPO Trainer initialized.")


# Start Training
print("\n" + "=" * 80)
print("🔥 TRAINING BEGINS")
print("=" * 80)
trainer.train()
print("\n" + "=" * 80)
print("🎉 TRAINING COMPLETE")
print("=" * 80)

# Save Final Model
print(f"\n--- Saving Model ---")
print(f"Output directory: {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✅ Model saved successfully.")

The model is already on multiple devices. Skipping the move to device specified in `args`.



--- Initializing GRPO Trainer ---

🔍 GRPO Generation Settings:
   num_generations (config): 3
   generation_batch_size: 12
   trainer.generation_config: GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128009,
    128001
  ],
  "max_new_tokens": 350,
  "pad_token_id": 128009,
  "padding_side": "left",
  "temperature": 0.8,
  "top_k": null,
  "top_p": 0.9
}

✅ GRPO Trainer initialized.

🔥 TRAINING BEGINS


`generation_config` default values have been modified to match model-specific defaults: {'max_length': 8192}. If this is not desired, please set these values explicitly.


   [Call #1 for step 0: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.9s):
   Grammar   : min=16.7, max=90.0, avg=66.9
   Coherence : min=23.3, max=93.3, avg=52.7
   Topic     : min=0.0, max=100.0, avg=74.6
   Quality   : min=10.0, max=59.0, avg=38.2
   Diversity : min=53.3, max=86.7, avg=68.4
   TOTAL     : min=48.788, max=77.788, avg=61.525




   [Call #2 for step 0: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.2s):
   Grammar   : min=0.0, max=90.0, avg=55.8
   Coherence : min=20.0, max=97.5, avg=54.3
   Topic     : min=20.0, max=100.0, avg=50.9
   Quality   : min=10.0, max=62.5, avg=30.0
   Diversity : min=12.0, max=80.0, avg=33.7
   TOTAL     : min=19.205, max=35.818, avg=25.871




   [Call #3 for step 0: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      🟢 Openai: 10/30 (33.3%)
      ⚡ Groq: 10/30 (33.3%)
      ❓ Deepseek: 10/30 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (35.6s):
   Grammar   : min=0.0, max=90.0, avg=55.0
   Coherence : min=23.3, max=100.0, avg=47.2
   Topic     : min=26.7, max=100.0, avg=67.3
   Quality   : min=10.0, max=50.0, avg=37.1
   Diversity : min=33.3, max=100.0, avg=60.8
   TOTAL     : min=0.000, max=61.515, avg=44.244




   [Call #4 for step 0: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.6s):
   Grammar   : min=0.0, max=90.0, avg=55.6
   Coherence : min=25.0, max=95.0, avg=59.8
   Topic     : min=15.0, max=90.0, avg=44.5
   Quality   : min=10.0, max=50.0, avg=41.0
   Diversity : min=36.0, max=100.0, avg=74.2
   TOTAL     : min=24.773, max=68.409, avg=39.027




   [Call #5 for step 0: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.5s):
   Grammar   : min=0.0, max=100.0, avg=48.5
   Coherence : min=15.0, max=90.0, avg=59.0
   Topic     : min=0.0, max=100.0, avg=71.2
   Quality   : min=10.0, max=50.0, avg=37.0
   Diversity : min=53.3, max=100.0, avg=76.9
   TOTAL     : min=41.061, max=76.591, avg=58.968




   [Call #6 for step 0: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.5s):
   Grammar   : min=0.0, max=90.0, avg=66.2
   Coherence : min=20.0, max=80.0, avg=42.8
   Topic     : min=0.0, max=100.0, avg=61.6
   Quality   : min=10.0, max=52.5, avg=33.8
   Diversity : min=15.0, max=86.7, avg=53.5
   TOTAL     : min=30.682, max=73.864, avg=51.612




Step,Training Loss



🔍 Step 0 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 1: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.1s):
   Grammar   : min=0.0, max=90.0, avg=37.7
   Coherence : min=43.3, max=100.0, avg=68.8
   Topic     : min=40.0, max=96.7, avg=73.8
   Quality   : min=35.0, max=62.5, avg=48.2
   Diversity : min=40.0, max=86.7, avg=62.2
   TOTAL     : min=22.727, max=71.856, avg=45.515




   [Call #2 for step 1: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.6s):
   Grammar   : min=0.0, max=90.0, avg=62.1
   Coherence : min=4.0, max=65.0, avg=35.3
   Topic     : min=20.0, max=100.0, avg=45.2
   Quality   : min=10.0, max=50.0, avg=20.1
   Diversity : min=37.3, max=80.0, avg=75.3
   TOTAL     : min=42.045, max=64.667, avg=49.110




   [Call #3 for step 1: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.0s):
   Grammar   : min=0.0, max=100.0, avg=61.6
   Coherence : min=16.7, max=60.0, avg=36.3
   Topic     : min=10.0, max=100.0, avg=65.4
   Quality   : min=10.0, max=50.0, avg=38.9
   Diversity : min=40.0, max=86.7, avg=65.4
   TOTAL     : min=47.273, max=69.636, avg=59.593




   [Call #4 for step 1: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.6s):
   Grammar   : min=0.0, max=90.0, avg=61.9
   Coherence : min=20.0, max=100.0, avg=61.4
   Topic     : min=4.0, max=100.0, avg=65.2
   Quality   : min=10.0, max=60.0, avg=38.9
   Diversity : min=36.0, max=80.0, avg=62.7
   TOTAL     : min=30.091, max=65.152, avg=48.947




   [Call #5 for step 1: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.7s):
   Grammar   : min=45.0, max=90.0, avg=77.4
   Coherence : min=30.0, max=100.0, avg=60.4
   Topic     : min=0.0, max=100.0, avg=58.0
   Quality   : min=8.0, max=52.0, avg=40.1
   Diversity : min=24.0, max=88.0, avg=62.1
   TOTAL     : min=17.000, max=75.341, avg=48.028




   [Call #6 for step 1: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.1s):
   Grammar   : min=0.0, max=90.0, avg=48.0
   Coherence : min=15.0, max=96.0, avg=42.7
   Topic     : min=4.0, max=92.5, avg=51.7
   Quality   : min=26.0, max=62.5, avg=46.5
   Diversity : min=36.0, max=100.0, avg=66.4
   TOTAL     : min=29.727, max=65.152, avg=52.189





🔍 Step 1 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 2: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 19/36 (52.8%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 5/36 (13.9%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.0s):
   Grammar   : min=0.0, max=90.0, avg=56.8
   Coherence : min=22.0, max=86.0, avg=54.3
   Topic     : min=22.0, max=100.0, avg=77.5
   Quality   : min=10.0, max=52.0, avg=39.0
   Diversity : min=12.0, max=80.0, avg=50.2
   TOTAL     : min=11.909, max=75.576, avg=50.596




   [Call #2 for step 2: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (47.4s):
   Grammar   : min=30.0, max=90.0, avg=67.7
   Coherence : min=26.0, max=80.0, avg=55.9
   Topic     : min=13.3, max=88.0, avg=66.6
   Quality   : min=10.0, max=50.0, avg=32.1
   Diversity : min=37.3, max=86.7, avg=56.3
   TOTAL     : min=53.030, max=74.424, avg=59.848




   [Call #3 for step 2: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.0s):
   Grammar   : min=22.5, max=72.0, avg=51.9
   Coherence : min=23.3, max=100.0, avg=59.7
   Topic     : min=36.7, max=100.0, avg=73.5
   Quality   : min=23.3, max=66.7, avg=45.3
   Diversity : min=37.3, max=86.7, avg=59.0
   TOTAL     : min=25.455, max=77.758, avg=53.422




   [Call #4 for step 2: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.0s):
   Grammar   : min=0.0, max=90.0, avg=43.3
   Coherence : min=15.0, max=97.5, avg=61.7
   Topic     : min=10.0, max=100.0, avg=72.0
   Quality   : min=30.0, max=50.0, avg=44.0
   Diversity : min=45.0, max=100.0, avg=68.8
   TOTAL     : min=26.705, max=64.886, avg=49.744




   [Call #5 for step 2: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.2s):
   Grammar   : min=0.0, max=90.0, avg=53.9
   Coherence : min=16.7, max=88.0, avg=50.0
   Topic     : min=6.7, max=96.0, avg=56.6
   Quality   : min=10.0, max=60.0, avg=41.6
   Diversity : min=24.0, max=100.0, avg=70.7
   TOTAL     : min=37.636, max=74.485, avg=53.874




   [Call #6 for step 2: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 16/30 (53.3%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 4/30 (13.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.0s):
   Grammar   : min=0.0, max=90.0, avg=51.3
   Coherence : min=34.0, max=76.7, avg=50.2
   Topic     : min=36.0, max=100.0, avg=77.7
   Quality   : min=18.0, max=50.0, avg=37.9
   Diversity : min=40.0, max=100.0, avg=67.2
   TOTAL     : min=0.000, max=75.091, avg=47.121




Step,Training Loss
5,0.009
10,0.0151
15,0.0045
20,0.0093
25,0.0174
30,-0.024
35,-0.0256
40,-0.0117
45,0.005
50,-0.0242



🔍 Step 2 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 3: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 16/30 (53.3%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 4/30 (13.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.6s):
   Grammar   : min=0.0, max=90.0, avg=39.5
   Coherence : min=28.0, max=100.0, avg=54.8
   Topic     : min=15.0, max=100.0, avg=70.5
   Quality   : min=18.0, max=50.0, avg=38.7
   Diversity : min=24.0, max=85.0, avg=62.4
   TOTAL     : min=0.000, max=77.159, avg=39.485




   [Call #2 for step 3: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.4s):
   Grammar   : min=0.0, max=90.0, avg=56.0
   Coherence : min=10.0, max=80.0, avg=49.7
   Topic     : min=10.0, max=100.0, avg=51.1
   Quality   : min=18.8, max=50.0, avg=41.2
   Diversity : min=15.0, max=80.0, avg=54.1
   TOTAL     : min=16.477, max=60.000, avg=48.455




   [Call #3 for step 3: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.2s):
   Grammar   : min=36.0, max=90.0, avg=64.5
   Coherence : min=20.0, max=90.0, avg=48.8
   Topic     : min=12.0, max=100.0, avg=70.5
   Quality   : min=34.0, max=50.0, avg=47.3
   Diversity : min=24.0, max=88.0, avg=63.8
   TOTAL     : min=25.818, max=75.636, avg=55.753




   [Call #4 for step 3: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.2s):
   Grammar   : min=0.0, max=100.0, avg=63.3
   Coherence : min=25.0, max=92.5, avg=58.9
   Topic     : min=10.0, max=100.0, avg=63.4
   Quality   : min=20.0, max=51.2, avg=38.9
   Diversity : min=28.3, max=100.0, avg=66.8
   TOTAL     : min=25.114, max=72.841, avg=53.585




   [Call #5 for step 3: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.8s):
   Grammar   : min=33.3, max=90.0, avg=66.7
   Coherence : min=20.0, max=92.0, avg=55.7
   Topic     : min=10.0, max=100.0, avg=55.9
   Quality   : min=10.0, max=66.7, avg=38.2
   Diversity : min=53.3, max=100.0, avg=73.4
   TOTAL     : min=32.121, max=72.091, avg=54.670




   [Call #6 for step 3: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.3s):
   Grammar   : min=0.0, max=90.0, avg=46.0
   Coherence : min=30.0, max=93.3, avg=61.9
   Topic     : min=37.5, max=100.0, avg=82.7
   Quality   : min=10.0, max=66.7, avg=43.7
   Diversity : min=30.0, max=100.0, avg=65.4
   TOTAL     : min=23.182, max=71.515, avg=52.004





🔍 Step 3 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 4: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (49.9s):
   Grammar   : min=0.0, max=90.0, avg=49.8
   Coherence : min=23.3, max=100.0, avg=49.4
   Topic     : min=10.0, max=98.0, avg=54.6
   Quality   : min=10.0, max=50.0, avg=29.0
   Diversity : min=25.3, max=100.0, avg=68.2
   TOTAL     : min=43.273, max=69.394, avg=56.421




   [Call #2 for step 4: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (34.4s):
   Grammar   : min=0.0, max=90.0, avg=55.8
   Coherence : min=23.3, max=80.0, avg=39.4
   Topic     : min=10.0, max=100.0, avg=64.3
   Quality   : min=10.0, max=50.0, avg=33.3
   Diversity : min=20.0, max=80.0, avg=52.5
   TOTAL     : min=8.409, max=66.667, avg=31.900




   [Call #3 for step 4: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.8s):
   Grammar   : min=0.0, max=90.0, avg=44.2
   Coherence : min=15.0, max=100.0, avg=49.7
   Topic     : min=20.0, max=95.0, avg=62.3
   Quality   : min=10.0, max=66.7, avg=35.9
   Diversity : min=30.0, max=100.0, avg=62.1
   TOTAL     : min=16.591, max=66.856, avg=41.717




   [Call #4 for step 4: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.0s):
   Grammar   : min=0.0, max=90.0, avg=65.0
   Coherence : min=20.0, max=100.0, avg=64.3
   Topic     : min=16.0, max=96.7, avg=44.2
   Quality   : min=10.0, max=100.0, avg=45.0
   Diversity : min=20.0, max=80.0, avg=55.2
   TOTAL     : min=9.697, max=76.091, avg=44.683




   [Call #5 for step 4: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (50.3s):
   Grammar   : min=0.0, max=90.0, avg=58.8
   Coherence : min=30.0, max=100.0, avg=61.3
   Topic     : min=0.0, max=100.0, avg=62.3
   Quality   : min=10.0, max=62.5, avg=39.9
   Diversity : min=28.3, max=86.7, avg=70.5
   TOTAL     : min=47.121, max=78.152, avg=61.436




   [Call #6 for step 4: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.3s):
   Grammar   : min=25.0, max=90.0, avg=66.7
   Coherence : min=22.0, max=86.0, avg=48.8
   Topic     : min=10.0, max=97.5, avg=55.0
   Quality   : min=8.8, max=50.0, avg=27.1
   Diversity : min=12.0, max=80.0, avg=47.2
   TOTAL     : min=30.727, max=58.788, avg=46.938





🔍 Step 4 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 5: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (51.5s):
   Grammar   : min=0.0, max=90.0, avg=30.5
   Coherence : min=25.0, max=85.0, avg=58.1
   Topic     : min=27.5, max=96.0, avg=67.3
   Quality   : min=34.0, max=50.0, avg=43.3
   Diversity : min=15.0, max=100.0, avg=51.2
   TOTAL     : min=17.273, max=78.273, avg=35.212




   [Call #2 for step 5: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 15/33 (45.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 7/33 (21.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.7s):
   Grammar   : min=0.0, max=100.0, avg=50.4
   Coherence : min=20.0, max=86.0, avg=44.5
   Topic     : min=20.0, max=100.0, avg=62.7
   Quality   : min=5.0, max=50.0, avg=24.3
   Diversity : min=37.3, max=76.0, avg=60.4
   TOTAL     : min=0.000, max=65.455, avg=43.997




   [Call #3 for step 5: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.1s):
   Grammar   : min=0.0, max=90.0, avg=43.3
   Coherence : min=24.0, max=84.0, avg=55.2
   Topic     : min=0.0, max=100.0, avg=61.4
   Quality   : min=10.0, max=60.0, avg=33.9
   Diversity : min=25.3, max=80.0, avg=56.4
   TOTAL     : min=43.864, max=77.424, avg=55.715




   [Call #4 for step 5: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 15/33 (45.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 7/33 (21.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.0s):
   Grammar   : min=18.0, max=90.0, avg=64.0
   Coherence : min=20.0, max=100.0, avg=52.2
   Topic     : min=10.0, max=100.0, avg=61.1
   Quality   : min=18.0, max=50.0, avg=37.9
   Diversity : min=28.3, max=73.3, avg=48.5
   TOTAL     : min=0.000, max=69.273, avg=47.422




   [Call #5 for step 5: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.2s):
   Grammar   : min=0.0, max=100.0, avg=60.1
   Coherence : min=20.0, max=85.0, avg=53.0
   Topic     : min=20.0, max=100.0, avg=67.7
   Quality   : min=18.0, max=100.0, avg=43.4
   Diversity : min=36.0, max=100.0, avg=68.9
   TOTAL     : min=13.273, max=74.545, avg=43.311




   [Call #6 for step 5: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.4s):
   Grammar   : min=0.0, max=90.0, avg=58.9
   Coherence : min=36.7, max=93.3, avg=61.1
   Topic     : min=5.0, max=100.0, avg=61.9
   Quality   : min=10.0, max=66.7, avg=40.9
   Diversity : min=20.0, max=80.0, avg=58.2
   TOTAL     : min=15.455, max=61.742, avg=33.182





🔍 Step 5 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 6: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.6s):
   Grammar   : min=16.7, max=90.0, avg=65.4
   Coherence : min=30.0, max=100.0, avg=56.2
   Topic     : min=13.3, max=100.0, avg=72.9
   Quality   : min=10.0, max=66.7, avg=32.8
   Diversity : min=20.0, max=100.0, avg=62.8
   TOTAL     : min=11.818, max=63.788, avg=42.178




   [Call #2 for step 6: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.2s):
   Grammar   : min=0.0, max=100.0, avg=53.6
   Coherence : min=10.0, max=87.5, avg=38.8
   Topic     : min=20.0, max=100.0, avg=71.2
   Quality   : min=10.0, max=50.0, avg=27.1
   Diversity : min=25.3, max=80.0, avg=53.2
   TOTAL     : min=46.667, max=61.576, avg=53.485




   [Call #3 for step 6: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.4s):
   Grammar   : min=0.0, max=90.0, avg=52.0
   Coherence : min=34.0, max=95.0, avg=57.5
   Topic     : min=20.0, max=100.0, avg=82.7
   Quality   : min=18.0, max=62.5, avg=39.3
   Diversity : min=15.0, max=100.0, avg=69.2
   TOTAL     : min=24.000, max=68.636, avg=45.598




   [Call #4 for step 6: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.7s):
   Grammar   : min=22.5, max=90.0, avg=74.6
   Coherence : min=20.0, max=100.0, avg=63.5
   Topic     : min=0.0, max=100.0, avg=61.0
   Quality   : min=10.0, max=62.5, avg=40.5
   Diversity : min=30.0, max=100.0, avg=57.1
   TOTAL     : min=19.697, max=72.159, avg=36.256




   [Call #5 for step 6: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.9s):
   Grammar   : min=0.0, max=90.0, avg=57.1
   Coherence : min=20.0, max=80.0, avg=53.3
   Topic     : min=0.0, max=100.0, avg=53.9
   Quality   : min=10.0, max=66.7, avg=38.4
   Diversity : min=53.3, max=86.7, avg=70.4
   TOTAL     : min=7.727, max=65.758, avg=46.480




   [Call #6 for step 6: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (35.4s):
   Grammar   : min=0.0, max=90.0, avg=50.5
   Coherence : min=23.3, max=100.0, avg=44.5
   Topic     : min=6.7, max=100.0, avg=51.3
   Quality   : min=10.0, max=100.0, avg=30.9
   Diversity : min=30.0, max=100.0, avg=70.0
   TOTAL     : min=24.545, max=59.394, avg=44.580





🔍 Step 6 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 7: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.4s):
   Grammar   : min=22.5, max=90.0, avg=65.1
   Coherence : min=20.0, max=80.0, avg=47.9
   Topic     : min=40.0, max=100.0, avg=75.6
   Quality   : min=10.0, max=52.5, avg=38.1
   Diversity : min=30.0, max=86.7, avg=49.4
   TOTAL     : min=21.818, max=70.189, avg=44.881




   [Call #2 for step 7: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.4s):
   Grammar   : min=0.0, max=90.0, avg=53.3
   Coherence : min=20.0, max=87.5, avg=55.6
   Topic     : min=20.0, max=100.0, avg=66.3
   Quality   : min=10.0, max=50.0, avg=32.3
   Diversity : min=15.0, max=88.0, avg=60.2
   TOTAL     : min=19.545, max=70.000, avg=45.426




   [Call #3 for step 7: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.2s):
   Grammar   : min=0.0, max=90.0, avg=51.0
   Coherence : min=25.0, max=88.0, avg=58.5
   Topic     : min=8.0, max=100.0, avg=61.6
   Quality   : min=10.0, max=60.0, avg=36.3
   Diversity : min=33.3, max=86.7, avg=59.2
   TOTAL     : min=20.568, max=73.242, avg=52.218




   [Call #4 for step 7: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.3s):
   Grammar   : min=0.0, max=90.0, avg=62.1
   Coherence : min=20.0, max=100.0, avg=62.4
   Topic     : min=40.0, max=97.5, avg=73.1
   Quality   : min=8.8, max=62.5, avg=34.2
   Diversity : min=28.3, max=80.0, avg=60.7
   TOTAL     : min=47.235, max=71.970, avg=59.094




   [Call #5 for step 7: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      🟢 Openai: 11/33 (33.3%)
      ⚡ Groq: 11/33 (33.3%)
      ❓ Deepseek: 11/33 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.3s):
   Grammar   : min=0.0, max=100.0, avg=61.1
   Coherence : min=20.0, max=60.0, avg=34.1
   Topic     : min=10.0, max=86.7, avg=48.6
   Quality   : min=10.0, max=50.0, avg=34.2
   Diversity : min=20.0, max=100.0, avg=49.1
   TOTAL     : min=0.000, max=36.667, avg=23.412




   [Call #6 for step 7: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.7s):
   Grammar   : min=37.5, max=100.0, avg=74.8
   Coherence : min=22.0, max=100.0, avg=54.2
   Topic     : min=0.0, max=94.0, avg=47.1
   Quality   : min=10.0, max=50.0, avg=38.7
   Diversity : min=30.0, max=100.0, avg=56.3
   TOTAL     : min=16.364, max=70.455, avg=42.636





🔍 Step 7 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 8: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 15/33 (45.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 7/33 (21.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.8s):
   Grammar   : min=0.0, max=90.0, avg=57.7
   Coherence : min=20.0, max=88.0, avg=58.3
   Topic     : min=20.0, max=90.0, avg=53.8
   Quality   : min=10.0, max=50.0, avg=44.1
   Diversity : min=50.0, max=80.0, avg=71.7
   TOTAL     : min=0.000, max=79.000, avg=56.816




   [Call #2 for step 8: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.9s):
   Grammar   : min=0.0, max=90.0, avg=61.3
   Coherence : min=20.0, max=100.0, avg=57.9
   Topic     : min=0.0, max=100.0, avg=48.6
   Quality   : min=10.0, max=50.0, avg=35.3
   Diversity : min=20.0, max=100.0, avg=65.8
   TOTAL     : min=23.485, max=72.182, avg=41.596




   [Call #3 for step 8: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.9s):
   Grammar   : min=0.0, max=90.0, avg=52.0
   Coherence : min=20.0, max=94.0, avg=61.9
   Topic     : min=37.5, max=100.0, avg=77.0
   Quality   : min=10.0, max=66.7, avg=43.9
   Diversity : min=40.0, max=100.0, avg=62.9
   TOTAL     : min=17.576, max=66.364, avg=45.411




   [Call #4 for step 8: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (50.6s):
   Grammar   : min=0.0, max=90.0, avg=55.9
   Coherence : min=22.0, max=84.0, avg=53.5
   Topic     : min=0.0, max=100.0, avg=57.7
   Quality   : min=9.0, max=52.0, avg=34.1
   Diversity : min=32.0, max=100.0, avg=64.4
   TOTAL     : min=17.182, max=67.614, avg=43.742




   [Call #5 for step 8: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.6s):
   Grammar   : min=0.0, max=90.0, avg=48.8
   Coherence : min=36.7, max=100.0, avg=67.7
   Topic     : min=8.0, max=100.0, avg=65.0
   Quality   : min=10.0, max=66.7, avg=36.7
   Diversity : min=25.3, max=88.0, avg=70.6
   TOTAL     : min=42.848, max=69.636, avg=60.832




   [Call #6 for step 8: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.1s):
   Grammar   : min=30.0, max=90.0, avg=66.8
   Coherence : min=22.0, max=84.0, avg=46.1
   Topic     : min=12.0, max=100.0, avg=78.7
   Quality   : min=10.0, max=53.3, avg=32.7
   Diversity : min=25.3, max=100.0, avg=55.8
   TOTAL     : min=28.182, max=73.394, avg=48.003





🔍 Step 8 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 9: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 16/33 (48.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 6/33 (18.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.2s):
   Grammar   : min=0.0, max=90.0, avg=66.9
   Coherence : min=18.0, max=97.5, avg=63.5
   Topic     : min=12.0, max=95.0, avg=61.3
   Quality   : min=18.0, max=62.5, avg=48.6
   Diversity : min=12.0, max=100.0, avg=55.2
   TOTAL     : min=0.000, max=77.455, avg=45.254




   [Call #2 for step 9: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.0s):
   Grammar   : min=0.0, max=100.0, avg=52.5
   Coherence : min=13.3, max=92.5, avg=43.5
   Topic     : min=15.0, max=100.0, avg=63.8
   Quality   : min=10.0, max=52.5, avg=32.0
   Diversity : min=15.0, max=80.0, avg=47.1
   TOTAL     : min=10.795, max=60.000, avg=38.384




   [Call #3 for step 9: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.3s):
   Grammar   : min=0.0, max=90.0, avg=46.8
   Coherence : min=24.0, max=80.0, avg=51.1
   Topic     : min=45.0, max=100.0, avg=82.6
   Quality   : min=10.0, max=62.5, avg=39.5
   Diversity : min=15.0, max=80.0, avg=63.1
   TOTAL     : min=15.758, max=70.682, avg=44.922




   [Call #4 for step 9: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 15/33 (45.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 7/33 (21.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.7s):
   Grammar   : min=0.0, max=90.0, avg=60.5
   Coherence : min=34.0, max=94.0, avg=65.3
   Topic     : min=0.0, max=100.0, avg=52.6
   Quality   : min=20.0, max=58.8, avg=41.5
   Diversity : min=12.0, max=86.7, avg=52.3
   TOTAL     : min=0.000, max=68.242, avg=35.985




   [Call #5 for step 9: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.8s):
   Grammar   : min=0.0, max=90.0, avg=61.9
   Coherence : min=25.0, max=90.0, avg=62.7
   Topic     : min=32.5, max=100.0, avg=79.3
   Quality   : min=10.0, max=62.5, avg=35.2
   Diversity : min=30.0, max=100.0, avg=58.9
   TOTAL     : min=23.409, max=75.939, avg=54.281




   [Call #6 for step 9: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 18/36 (50.0%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 6/36 (16.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (49.1s):
   Grammar   : min=45.0, max=90.0, avg=65.7
   Coherence : min=30.0, max=94.0, avg=49.8
   Topic     : min=27.5, max=94.0, avg=62.8
   Quality   : min=18.0, max=62.5, avg=38.0
   Diversity : min=36.0, max=100.0, avg=71.2
   TOTAL     : min=29.273, max=75.606, avg=47.355





🔍 Step 9 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 10: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.7s):
   Grammar   : min=50.0, max=90.0, avg=69.8
   Coherence : min=40.0, max=100.0, avg=68.8
   Topic     : min=0.0, max=100.0, avg=75.1
   Quality   : min=10.0, max=60.0, avg=42.9
   Diversity : min=30.0, max=86.7, avg=62.7
   TOTAL     : min=27.727, max=78.970, avg=51.164




   [Call #2 for step 10: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.5s):
   Grammar   : min=0.0, max=90.0, avg=56.8
   Coherence : min=23.3, max=100.0, avg=48.5
   Topic     : min=53.3, max=100.0, avg=80.7
   Quality   : min=10.0, max=66.7, avg=33.2
   Diversity : min=40.0, max=86.7, avg=64.8
   TOTAL     : min=21.818, max=75.818, avg=51.379




   [Call #3 for step 10: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 15/33 (45.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 7/33 (21.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.1s):
   Grammar   : min=18.0, max=90.0, avg=57.3
   Coherence : min=22.5, max=92.0, avg=57.3
   Topic     : min=15.0, max=90.0, avg=54.0
   Quality   : min=30.0, max=60.0, avg=41.0
   Diversity : min=24.0, max=100.0, avg=66.3
   TOTAL     : min=0.000, max=49.091, avg=28.403




   [Call #4 for step 10: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.4s):
   Grammar   : min=22.5, max=90.0, avg=68.9
   Coherence : min=34.0, max=100.0, avg=71.5
   Topic     : min=35.0, max=100.0, avg=81.5
   Quality   : min=10.0, max=50.0, avg=36.7
   Diversity : min=15.0, max=60.0, avg=40.3
   TOTAL     : min=22.500, max=79.508, avg=43.655




   [Call #5 for step 10: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 15/33 (45.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 7/33 (21.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.9s):
   Grammar   : min=18.0, max=90.0, avg=53.2
   Coherence : min=20.0, max=95.0, avg=61.4
   Topic     : min=20.0, max=100.0, avg=54.8
   Quality   : min=7.0, max=62.5, avg=39.5
   Diversity : min=36.0, max=100.0, avg=62.1
   TOTAL     : min=0.000, max=71.061, avg=50.818




   [Call #6 for step 10: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.7s):
   Grammar   : min=30.0, max=100.0, avg=62.8
   Coherence : min=23.3, max=84.0, avg=54.3
   Topic     : min=13.3, max=100.0, avg=62.7
   Quality   : min=10.0, max=60.0, avg=37.7
   Diversity : min=12.0, max=100.0, avg=62.6
   TOTAL     : min=23.333, max=65.265, avg=44.345





🔍 Step 10 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 11: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (47.1s):
   Grammar   : min=30.0, max=90.0, avg=65.2
   Coherence : min=22.0, max=100.0, avg=61.4
   Topic     : min=20.0, max=100.0, avg=64.7
   Quality   : min=20.0, max=66.7, avg=44.5
   Diversity : min=15.0, max=86.7, avg=48.4
   TOTAL     : min=14.773, max=70.727, avg=44.967




   [Call #2 for step 11: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.0s):
   Grammar   : min=0.0, max=90.0, avg=53.5
   Coherence : min=20.0, max=80.0, avg=43.2
   Topic     : min=66.7, max=100.0, avg=92.6
   Quality   : min=10.0, max=63.3, avg=31.7
   Diversity : min=12.0, max=100.0, avg=61.1
   TOTAL     : min=17.455, max=76.455, avg=51.735




   [Call #3 for step 11: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.0s):
   Grammar   : min=0.0, max=90.0, avg=69.4
   Coherence : min=27.5, max=100.0, avg=75.9
   Topic     : min=66.0, max=100.0, avg=89.3
   Quality   : min=10.0, max=100.0, avg=50.9
   Diversity : min=30.0, max=100.0, avg=62.2
   TOTAL     : min=38.182, max=71.667, avg=55.660




   [Call #4 for step 11: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 13/30 (43.3%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 7/30 (23.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (35.6s):
   Grammar   : min=56.0, max=90.0, avg=79.1
   Coherence : min=20.0, max=84.0, avg=54.0
   Topic     : min=15.0, max=100.0, avg=65.3
   Quality   : min=10.0, max=60.0, avg=44.1
   Diversity : min=30.0, max=86.7, avg=53.8
   TOTAL     : min=0.000, max=76.424, avg=34.836




   [Call #5 for step 11: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 17/33 (51.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 5/33 (15.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.3s):
   Grammar   : min=40.0, max=90.0, avg=64.4
   Coherence : min=36.0, max=96.0, avg=60.3
   Topic     : min=0.0, max=80.0, avg=21.5
   Quality   : min=20.0, max=70.0, avg=45.3
   Diversity : min=32.0, max=100.0, avg=84.2
   TOTAL     : min=0.000, max=79.364, avg=58.033




   [Call #6 for step 11: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.8s):
   Grammar   : min=45.0, max=90.0, avg=76.5
   Coherence : min=45.0, max=100.0, avg=73.3
   Topic     : min=12.0, max=76.7, avg=44.9
   Quality   : min=22.5, max=62.5, avg=38.5
   Diversity : min=40.0, max=100.0, avg=73.5
   TOTAL     : min=53.970, max=71.629, avg=60.792





🔍 Step 11 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 12: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 19/33 (57.6%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 3/33 (9.1%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.2s):
   Grammar   : min=36.0, max=90.0, avg=60.2
   Coherence : min=26.0, max=100.0, avg=73.8
   Topic     : min=48.0, max=100.0, avg=85.3
   Quality   : min=10.0, max=60.0, avg=45.0
   Diversity : min=62.7, max=100.0, avg=80.1
   TOTAL     : min=0.000, max=85.242, avg=66.040




   [Call #2 for step 12: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.5s):
   Grammar   : min=22.5, max=90.0, avg=57.5
   Coherence : min=27.5, max=92.5, avg=51.0
   Topic     : min=32.5, max=100.0, avg=76.1
   Quality   : min=10.0, max=66.7, avg=42.1
   Diversity : min=24.0, max=86.7, avg=65.3
   TOTAL     : min=28.455, max=68.333, avg=50.883




   [Call #3 for step 12: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.7s):
   Grammar   : min=22.5, max=90.0, avg=69.9
   Coherence : min=30.0, max=96.7, avg=63.8
   Topic     : min=30.0, max=100.0, avg=64.2
   Quality   : min=35.0, max=62.5, avg=45.0
   Diversity : min=25.3, max=86.7, avg=57.5
   TOTAL     : min=17.727, max=73.970, avg=54.021




   [Call #4 for step 12: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.2s):
   Grammar   : min=0.0, max=90.0, avg=54.7
   Coherence : min=20.0, max=100.0, avg=59.2
   Topic     : min=70.0, max=100.0, avg=87.5
   Quality   : min=10.0, max=60.0, avg=39.6
   Diversity : min=30.0, max=86.7, avg=64.1
   TOTAL     : min=30.000, max=70.606, avg=54.689




   [Call #5 for step 12: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 13/33 (39.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 9/33 (27.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.5s):
   Grammar   : min=0.0, max=90.0, avg=54.4
   Coherence : min=27.5, max=82.5, avg=59.9
   Topic     : min=43.3, max=100.0, avg=78.5
   Quality   : min=30.0, max=66.7, avg=50.3
   Diversity : min=58.3, max=100.0, avg=80.9
   TOTAL     : min=0.000, max=80.061, avg=58.277




   [Call #6 for step 12: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.1s):
   Grammar   : min=0.0, max=90.0, avg=57.8
   Coherence : min=32.0, max=96.0, avg=61.9
   Topic     : min=8.0, max=100.0, avg=68.3
   Quality   : min=10.0, max=66.7, avg=45.4
   Diversity : min=48.0, max=100.0, avg=72.2
   TOTAL     : min=21.818, max=84.000, avg=51.477





🔍 Step 12 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 13: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.6s):
   Grammar   : min=0.0, max=90.0, avg=41.7
   Coherence : min=22.0, max=85.0, avg=56.3
   Topic     : min=5.0, max=96.7, avg=54.5
   Quality   : min=10.0, max=66.7, avg=49.8
   Diversity : min=12.0, max=100.0, avg=64.2
   TOTAL     : min=13.909, max=66.818, avg=43.136




   [Call #2 for step 13: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.8s):
   Grammar   : min=30.0, max=70.0, avg=49.1
   Coherence : min=35.0, max=93.3, avg=64.6
   Topic     : min=34.0, max=100.0, avg=82.2
   Quality   : min=30.0, max=66.7, avg=50.1
   Diversity : min=60.0, max=100.0, avg=78.7
   TOTAL     : min=36.667, max=70.424, avg=57.455




   [Call #3 for step 13: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      🟢 Openai: 11/33 (33.3%)
      ⚡ Groq: 11/33 (33.3%)
      ❓ Deepseek: 11/33 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.7s):
   Grammar   : min=30.0, max=90.0, avg=56.9
   Coherence : min=23.3, max=100.0, avg=68.8
   Topic     : min=20.0, max=100.0, avg=69.2
   Quality   : min=23.3, max=66.7, avg=49.3
   Diversity : min=60.0, max=86.7, avg=74.1
   TOTAL     : min=0.000, max=69.242, avg=55.980




   [Call #4 for step 13: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 20/36 (55.6%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 4/36 (11.1%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (50.2s):
   Grammar   : min=0.0, max=82.0, avg=53.0
   Coherence : min=50.0, max=80.0, avg=67.3
   Topic     : min=20.0, max=100.0, avg=70.4
   Quality   : min=34.0, max=62.5, avg=53.0
   Diversity : min=24.0, max=100.0, avg=68.6
   TOTAL     : min=18.182, max=81.182, avg=56.913




   [Call #5 for step 13: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.0s):
   Grammar   : min=0.0, max=100.0, avg=44.1
   Coherence : min=15.0, max=100.0, avg=47.9
   Topic     : min=40.0, max=100.0, avg=78.2
   Quality   : min=10.0, max=75.0, avg=43.4
   Diversity : min=30.0, max=100.0, avg=72.0
   TOTAL     : min=17.955, max=73.091, avg=53.470




   [Call #6 for step 13: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 15/33 (45.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 7/33 (21.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.9s):
   Grammar   : min=0.0, max=90.0, avg=47.9
   Coherence : min=28.0, max=100.0, avg=62.9
   Topic     : min=8.0, max=100.0, avg=61.0
   Quality   : min=18.0, max=75.0, avg=49.3
   Diversity : min=30.0, max=100.0, avg=76.8
   TOTAL     : min=0.000, max=81.288, avg=47.544





🔍 Step 13 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 14: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.2s):
   Grammar   : min=46.7, max=100.0, avg=69.0
   Coherence : min=22.0, max=100.0, avg=67.5
   Topic     : min=36.7, max=98.0, avg=76.2
   Quality   : min=10.0, max=70.0, avg=49.9
   Diversity : min=56.0, max=100.0, avg=82.0
   TOTAL     : min=51.636, max=82.545, avg=65.727




   [Call #2 for step 14: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 17/33 (51.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 5/33 (15.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (47.7s):
   Grammar   : min=0.0, max=90.0, avg=55.8
   Coherence : min=30.0, max=96.0, avg=61.0
   Topic     : min=8.0, max=100.0, avg=60.2
   Quality   : min=28.0, max=100.0, avg=52.5
   Diversity : min=48.0, max=100.0, avg=76.4
   TOTAL     : min=0.000, max=66.364, avg=44.707




   [Call #3 for step 14: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 14/30 (46.7%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 6/30 (20.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.0s):
   Grammar   : min=10.0, max=100.0, avg=63.3
   Coherence : min=40.0, max=90.0, avg=65.6
   Topic     : min=20.0, max=95.0, avg=69.0
   Quality   : min=30.0, max=60.0, avg=51.3
   Diversity : min=30.0, max=100.0, avg=79.5
   TOTAL     : min=0.000, max=77.152, avg=47.572




   [Call #4 for step 14: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.8s):
   Grammar   : min=18.0, max=90.0, avg=62.2
   Coherence : min=20.0, max=100.0, avg=61.2
   Topic     : min=0.0, max=100.0, avg=65.3
   Quality   : min=10.0, max=66.7, avg=47.2
   Diversity : min=30.0, max=100.0, avg=69.9
   TOTAL     : min=15.909, max=85.000, avg=51.912




   [Call #5 for step 14: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.7s):
   Grammar   : min=0.0, max=67.5, avg=46.8
   Coherence : min=32.5, max=100.0, avg=56.1
   Topic     : min=20.0, max=100.0, avg=58.2
   Quality   : min=32.5, max=75.0, avg=53.4
   Diversity : min=56.7, max=100.0, avg=84.1
   TOTAL     : min=53.788, max=75.818, avg=62.835




   [Call #6 for step 14: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 17/33 (51.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 5/33 (15.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (47.4s):
   Grammar   : min=36.0, max=90.0, avg=61.5
   Coherence : min=30.0, max=94.0, avg=59.2
   Topic     : min=52.0, max=98.0, avg=84.1
   Quality   : min=26.0, max=62.0, avg=47.4
   Diversity : min=50.7, max=100.0, avg=80.6
   TOTAL     : min=0.000, max=76.455, avg=56.280





🔍 Step 14 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 15: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 16/33 (48.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 6/33 (18.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (47.2s):
   Grammar   : min=0.0, max=82.0, avg=60.1
   Coherence : min=36.7, max=93.3, avg=61.9
   Topic     : min=48.0, max=100.0, avg=85.6
   Quality   : min=32.5, max=66.7, avg=51.5
   Diversity : min=66.7, max=100.0, avg=80.8
   TOTAL     : min=0.000, max=79.818, avg=54.737




   [Call #2 for step 15: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.6s):
   Grammar   : min=0.0, max=100.0, avg=53.9
   Coherence : min=20.0, max=100.0, avg=58.0
   Topic     : min=80.0, max=100.0, avg=95.2
   Quality   : min=30.0, max=60.0, avg=46.7
   Diversity : min=30.0, max=100.0, avg=74.3
   TOTAL     : min=26.591, max=76.545, avg=57.674




   [Call #3 for step 15: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (55.9s):
   Grammar   : min=22.5, max=90.0, avg=62.0
   Coherence : min=35.0, max=94.0, avg=58.3
   Topic     : min=65.0, max=95.0, avg=83.2
   Quality   : min=31.2, max=62.5, avg=47.1
   Diversity : min=45.0, max=100.0, avg=65.7
   TOTAL     : min=27.500, max=82.424, avg=53.821




   [Call #4 for step 15: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.4s):
   Grammar   : min=0.0, max=90.0, avg=41.2
   Coherence : min=20.0, max=80.0, avg=51.5
   Topic     : min=4.0, max=97.5, avg=60.0
   Quality   : min=10.0, max=62.5, avg=43.6
   Diversity : min=30.0, max=100.0, avg=72.8
   TOTAL     : min=25.455, max=71.818, avg=52.510




   [Call #5 for step 15: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.8s):
   Grammar   : min=0.0, max=90.0, avg=54.4
   Coherence : min=27.5, max=100.0, avg=64.2
   Topic     : min=0.0, max=80.0, avg=34.8
   Quality   : min=32.5, max=75.0, avg=58.1
   Diversity : min=30.0, max=100.0, avg=65.6
   TOTAL     : min=26.818, max=74.091, avg=41.802




   [Call #6 for step 15: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.6s):
   Grammar   : min=45.0, max=90.0, avg=59.2
   Coherence : min=20.0, max=82.0, avg=59.5
   Topic     : min=15.0, max=100.0, avg=65.0
   Quality   : min=10.0, max=62.5, avg=48.3
   Diversity : min=45.0, max=100.0, avg=78.9
   TOTAL     : min=29.545, max=68.000, avg=54.142





🔍 Step 15 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 16: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (47.5s):
   Grammar   : min=0.0, max=90.0, avg=46.4
   Coherence : min=35.0, max=92.5, avg=49.9
   Topic     : min=12.0, max=100.0, avg=64.9
   Quality   : min=27.0, max=73.8, avg=51.7
   Diversity : min=60.0, max=100.0, avg=78.4
   TOTAL     : min=21.636, max=68.788, avg=46.721




   [Call #2 for step 16: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (51.2s):
   Grammar   : min=22.5, max=90.0, avg=69.5
   Coherence : min=30.0, max=84.0, avg=57.6
   Topic     : min=58.0, max=100.0, avg=85.8
   Quality   : min=28.0, max=66.7, avg=52.3
   Diversity : min=24.0, max=100.0, avg=79.3
   TOTAL     : min=32.182, max=74.545, avg=56.104




   [Call #3 for step 16: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.9s):
   Grammar   : min=0.0, max=90.0, avg=43.2
   Coherence : min=26.7, max=90.0, avg=54.3
   Topic     : min=0.0, max=100.0, avg=57.9
   Quality   : min=10.0, max=65.0, avg=48.4
   Diversity : min=60.0, max=100.0, avg=89.6
   TOTAL     : min=23.091, max=63.750, avg=50.817




   [Call #4 for step 16: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.3s):
   Grammar   : min=0.0, max=72.0, avg=40.2
   Coherence : min=35.0, max=96.7, avg=69.4
   Topic     : min=0.0, max=100.0, avg=50.9
   Quality   : min=28.7, max=70.0, avg=49.5
   Diversity : min=60.0, max=100.0, avg=83.3
   TOTAL     : min=17.727, max=63.636, avg=45.753




   [Call #5 for step 16: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 18/33 (54.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 4/33 (12.1%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.3s):
   Grammar   : min=30.0, max=80.0, avg=50.9
   Coherence : min=36.0, max=94.0, avg=59.9
   Topic     : min=12.0, max=100.0, avg=56.3
   Quality   : min=36.7, max=75.0, avg=59.0
   Diversity : min=70.0, max=100.0, avg=89.3
   TOTAL     : min=0.000, max=70.455, avg=51.302




   [Call #6 for step 16: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.1s):
   Grammar   : min=0.0, max=90.0, avg=54.1
   Coherence : min=34.0, max=100.0, avg=60.2
   Topic     : min=0.0, max=100.0, avg=65.1
   Quality   : min=38.0, max=75.0, avg=59.5
   Diversity : min=44.0, max=100.0, avg=73.2
   TOTAL     : min=34.886, max=56.000, avg=44.676





🔍 Step 16 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 17: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.8s):
   Grammar   : min=22.5, max=80.0, avg=56.0
   Coherence : min=40.0, max=100.0, avg=69.2
   Topic     : min=6.7, max=100.0, avg=60.4
   Quality   : min=53.3, max=75.0, avg=64.9
   Diversity : min=64.0, max=100.0, avg=93.3
   TOTAL     : min=37.727, max=73.636, avg=53.338




   [Call #2 for step 17: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.5s):
   Grammar   : min=30.0, max=63.3, avg=48.2
   Coherence : min=42.5, max=93.3, avg=59.5
   Topic     : min=10.0, max=100.0, avg=64.3
   Quality   : min=40.0, max=83.3, avg=56.7
   Diversity : min=80.0, max=100.0, avg=94.2
   TOTAL     : min=40.568, max=62.879, avg=55.003




   [Call #3 for step 17: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 14/33 (42.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 8/33 (24.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.5s):
   Grammar   : min=0.0, max=90.0, avg=51.8
   Coherence : min=47.5, max=95.0, avg=70.6
   Topic     : min=63.3, max=100.0, avg=81.2
   Quality   : min=40.0, max=75.0, avg=57.8
   Diversity : min=44.0, max=100.0, avg=82.6
   TOTAL     : min=0.000, max=67.909, avg=46.717




   [Call #4 for step 17: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 16/33 (48.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 6/33 (18.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.8s):
   Grammar   : min=0.0, max=90.0, avg=53.9
   Coherence : min=42.0, max=85.0, avg=56.3
   Topic     : min=52.5, max=100.0, avg=82.6
   Quality   : min=32.5, max=75.0, avg=52.3
   Diversity : min=56.0, max=100.0, avg=76.3
   TOTAL     : min=0.000, max=76.477, avg=55.533




   [Call #5 for step 17: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (35.6s):
   Grammar   : min=30.0, max=90.0, avg=70.9
   Coherence : min=20.0, max=100.0, avg=55.2
   Topic     : min=30.0, max=100.0, avg=79.6
   Quality   : min=10.0, max=75.0, avg=46.5
   Diversity : min=30.0, max=100.0, avg=52.9
   TOTAL     : min=23.182, max=74.545, avg=43.709




   [Call #6 for step 17: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.8s):
   Grammar   : min=0.0, max=90.0, avg=56.7
   Coherence : min=40.0, max=96.0, avg=58.9
   Topic     : min=12.0, max=100.0, avg=56.7
   Quality   : min=28.0, max=66.7, avg=51.3
   Diversity : min=64.0, max=100.0, avg=86.7
   TOTAL     : min=37.636, max=73.333, avg=56.298





🔍 Step 17 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 18: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 12/30 (40.0%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 8/30 (26.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.4s):
   Grammar   : min=0.0, max=70.0, avg=44.9
   Coherence : min=43.3, max=100.0, avg=72.5
   Topic     : min=20.0, max=96.0, avg=60.3
   Quality   : min=50.0, max=75.0, avg=63.3
   Diversity : min=80.0, max=100.0, avg=93.5
   TOTAL     : min=0.000, max=75.091, avg=49.279




   [Call #2 for step 18: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 16/33 (48.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 6/33 (18.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (71.8s):
   Grammar   : min=0.0, max=90.0, avg=51.9
   Coherence : min=45.0, max=100.0, avg=70.5
   Topic     : min=10.0, max=90.0, avg=49.7
   Quality   : min=38.3, max=66.7, avg=52.6
   Diversity : min=74.7, max=100.0, avg=91.8
   TOTAL     : min=0.000, max=83.727, avg=60.230




   [Call #3 for step 18: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.2s):
   Grammar   : min=0.0, max=90.0, avg=56.0
   Coherence : min=25.0, max=100.0, avg=68.1
   Topic     : min=13.3, max=100.0, avg=75.6
   Quality   : min=42.5, max=83.3, avg=64.4
   Diversity : min=70.0, max=100.0, avg=79.7
   TOTAL     : min=34.394, max=76.212, avg=55.306




   [Call #4 for step 18: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (50.6s):
   Grammar   : min=0.0, max=80.0, avg=50.0
   Coherence : min=35.0, max=85.0, avg=58.6
   Topic     : min=45.0, max=100.0, avg=86.5
   Quality   : min=58.8, max=75.0, avg=67.0
   Diversity : min=56.7, max=100.0, avg=89.8
   TOTAL     : min=61.818, max=76.818, avg=67.345




   [Call #5 for step 18: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.1s):
   Grammar   : min=22.5, max=80.0, avg=55.6
   Coherence : min=35.0, max=90.0, avg=67.6
   Topic     : min=70.0, max=97.5, avg=85.0
   Quality   : min=50.0, max=75.0, avg=63.6
   Diversity : min=80.0, max=100.0, avg=91.8
   TOTAL     : min=47.273, max=70.795, avg=57.083




   [Call #6 for step 18: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.0s):
   Grammar   : min=0.0, max=90.0, avg=49.8
   Coherence : min=30.0, max=66.0, avg=47.8
   Topic     : min=0.0, max=100.0, avg=52.5
   Quality   : min=20.0, max=75.0, avg=48.8
   Diversity : min=64.0, max=100.0, avg=83.3
   TOTAL     : min=46.818, max=70.909, avg=56.825





🔍 Step 18 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 19: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.0s):
   Grammar   : min=0.0, max=67.5, avg=43.8
   Coherence : min=43.3, max=100.0, avg=61.8
   Topic     : min=20.0, max=95.0, avg=55.9
   Quality   : min=40.0, max=70.0, avg=57.7
   Diversity : min=66.7, max=100.0, avg=88.8
   TOTAL     : min=30.303, max=74.318, avg=54.473




   [Call #2 for step 19: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 20/36 (55.6%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 4/36 (11.1%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (50.9s):
   Grammar   : min=20.0, max=90.0, avg=66.2
   Coherence : min=16.0, max=100.0, avg=67.0
   Topic     : min=54.0, max=96.0, avg=75.3
   Quality   : min=38.0, max=80.0, avg=60.0
   Diversity : min=64.0, max=100.0, avg=90.0
   TOTAL     : min=42.818, max=75.273, avg=56.220




   [Call #3 for step 19: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.9s):
   Grammar   : min=22.5, max=90.0, avg=57.6
   Coherence : min=50.0, max=100.0, avg=72.9
   Topic     : min=0.0, max=100.0, avg=52.8
   Quality   : min=36.7, max=75.0, avg=56.0
   Diversity : min=60.0, max=100.0, avg=83.6
   TOTAL     : min=25.909, max=79.886, avg=53.374




   [Call #4 for step 19: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.5s):
   Grammar   : min=35.0, max=90.0, avg=71.5
   Coherence : min=47.5, max=100.0, avg=78.5
   Topic     : min=0.0, max=100.0, avg=75.7
   Quality   : min=42.5, max=75.0, avg=60.3
   Diversity : min=70.0, max=100.0, avg=86.7
   TOTAL     : min=56.591, max=76.364, avg=70.467




   [Call #5 for step 19: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...




    ⏸️  deepseek:deepseek-chat → cooldown (2m)

   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 21/33 (63.6%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 1/33 (3.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.2s):
   Grammar   : min=0.0, max=72.0, avg=42.5
   Coherence : min=42.5, max=80.0, avg=61.8
   Topic     : min=30.0, max=92.0, avg=65.9
   Quality   : min=36.0, max=80.0, avg=62.0
   Diversity : min=76.0, max=100.0, avg=89.6
   TOTAL     : min=0.000, max=74.636, avg=49.665




   [Call #6 for step 19: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 24/36 (66.7%)
      🟢 Openai: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (20.4s):
   Grammar   : min=18.0, max=72.0, avg=57.4
   Coherence : min=30.0, max=100.0, avg=55.1
   Topic     : min=10.0, max=100.0, avg=75.1
   Quality   : min=30.0, max=75.0, avg=50.9
   Diversity : min=70.0, max=100.0, avg=85.2
   TOTAL     : min=35.909, max=78.545, avg=58.472





🔍 Step 19 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 20: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 20/30 (66.7%)
      🟢 Openai: 10/30 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (18.3s):
   Grammar   : min=18.0, max=67.5, avg=45.9
   Coherence : min=37.5, max=100.0, avg=68.0
   Topic     : min=0.0, max=100.0, avg=60.8
   Quality   : min=56.0, max=83.3, avg=71.4
   Diversity : min=70.0, max=100.0, avg=91.6
   TOTAL     : min=0.000, max=83.455, avg=55.203




   [Call #2 for step 20: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...




    🔄 deepseek:deepseek-chat → back online

   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 19/36 (52.8%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 5/36 (13.9%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.5s):
   Grammar   : min=0.0, max=90.0, avg=52.2
   Coherence : min=43.3, max=93.3, avg=63.2
   Topic     : min=47.5, max=96.0, avg=77.2
   Quality   : min=62.0, max=83.3, avg=74.6
   Diversity : min=70.0, max=100.0, avg=92.5
   TOTAL     : min=46.455, max=76.000, avg=65.747




   [Call #3 for step 20: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.8s):
   Grammar   : min=33.3, max=90.0, avg=53.9
   Coherence : min=43.3, max=96.0, avg=68.7
   Topic     : min=5.0, max=100.0, avg=73.1
   Quality   : min=45.0, max=83.3, avg=63.8
   Diversity : min=70.0, max=100.0, avg=91.5
   TOTAL     : min=47.273, max=84.318, avg=64.322




   [Call #4 for step 20: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.4s):
   Grammar   : min=0.0, max=80.0, avg=46.4
   Coherence : min=30.0, max=84.0, avg=58.4
   Topic     : min=46.7, max=100.0, avg=90.9
   Quality   : min=50.0, max=83.3, avg=63.4
   Diversity : min=30.0, max=100.0, avg=78.8
   TOTAL     : min=31.364, max=65.303, avg=47.775




   [Call #5 for step 20: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.1s):
   Grammar   : min=0.0, max=67.5, avg=43.5
   Coherence : min=33.3, max=100.0, avg=59.7
   Topic     : min=6.7, max=100.0, avg=69.4
   Quality   : min=50.0, max=75.0, avg=62.2
   Diversity : min=70.0, max=100.0, avg=84.2
   TOTAL     : min=40.909, max=65.606, avg=50.193




   [Call #6 for step 20: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.9s):
   Grammar   : min=45.0, max=90.0, avg=60.8
   Coherence : min=40.0, max=93.3, avg=58.9
   Topic     : min=10.0, max=100.0, avg=77.5
   Quality   : min=55.0, max=83.3, avg=73.5
   Diversity : min=60.0, max=100.0, avg=85.4
   TOTAL     : min=29.091, max=78.182, avg=52.431





🔍 Step 20 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 21: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.2s):
   Grammar   : min=0.0, max=70.0, avg=34.6
   Coherence : min=30.0, max=90.0, avg=59.9
   Topic     : min=0.0, max=100.0, avg=61.1
   Quality   : min=55.0, max=83.3, avg=68.8
   Diversity : min=70.0, max=100.0, avg=95.8
   TOTAL     : min=35.455, max=65.909, avg=51.534




   [Call #2 for step 21: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.3s):
   Grammar   : min=0.0, max=60.0, avg=38.7
   Coherence : min=20.0, max=66.7, avg=49.3
   Topic     : min=20.0, max=80.0, avg=55.2
   Quality   : min=53.3, max=83.3, avg=70.1
   Diversity : min=76.0, max=100.0, avg=95.3
   TOTAL     : min=43.485, max=73.636, avg=59.162




   [Call #3 for step 21: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.2s):
   Grammar   : min=0.0, max=90.0, avg=50.3
   Coherence : min=46.7, max=100.0, avg=77.7
   Topic     : min=13.3, max=96.7, avg=54.7
   Quality   : min=56.7, max=87.5, avg=73.7
   Diversity : min=70.0, max=100.0, avg=90.4
   TOTAL     : min=36.364, max=76.818, avg=60.735




   [Call #4 for step 21: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.6s):
   Grammar   : min=0.0, max=70.0, avg=37.9
   Coherence : min=52.5, max=100.0, avg=73.1
   Topic     : min=12.0, max=100.0, avg=61.0
   Quality   : min=55.0, max=76.2, avg=67.4
   Diversity : min=80.0, max=100.0, avg=95.0
   TOTAL     : min=42.727, max=79.886, avg=64.021




   [Call #5 for step 21: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.0s):
   Grammar   : min=25.0, max=90.0, avg=72.8
   Coherence : min=30.0, max=93.3, avg=63.5
   Topic     : min=33.3, max=100.0, avg=56.5
   Quality   : min=50.0, max=83.3, avg=68.2
   Diversity : min=30.0, max=100.0, avg=81.2
   TOTAL     : min=30.000, max=76.061, avg=57.604




   [Call #6 for step 21: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.1s):
   Grammar   : min=16.7, max=76.7, avg=54.6
   Coherence : min=43.3, max=80.0, avg=57.2
   Topic     : min=20.0, max=100.0, avg=68.1
   Quality   : min=56.7, max=83.3, avg=71.3
   Diversity : min=80.0, max=100.0, avg=87.6
   TOTAL     : min=50.606, max=76.273, avg=63.260





🔍 Step 21 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 22: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.2s):
   Grammar   : min=30.0, max=90.0, avg=58.7
   Coherence : min=38.0, max=95.0, avg=67.8
   Topic     : min=76.0, max=100.0, avg=90.8
   Quality   : min=36.0, max=75.0, avg=58.3
   Diversity : min=64.0, max=100.0, avg=92.2
   TOTAL     : min=46.061, max=69.205, avg=57.575




   [Call #2 for step 22: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.4s):
   Grammar   : min=0.0, max=76.7, avg=52.6
   Coherence : min=37.5, max=100.0, avg=69.6
   Topic     : min=43.3, max=100.0, avg=80.8
   Quality   : min=45.0, max=87.5, avg=70.2
   Diversity : min=80.0, max=100.0, avg=95.3
   TOTAL     : min=60.000, max=84.455, avg=69.757




   [Call #3 for step 22: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.8s):
   Grammar   : min=0.0, max=70.0, avg=39.5
   Coherence : min=55.0, max=100.0, avg=78.5
   Topic     : min=65.0, max=100.0, avg=87.2
   Quality   : min=45.0, max=79.0, avg=69.4
   Diversity : min=70.0, max=100.0, avg=86.6
   TOTAL     : min=0.000, max=88.727, avg=56.693




   [Call #4 for step 22: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (49.6s):
   Grammar   : min=0.0, max=74.0, avg=31.5
   Coherence : min=42.0, max=80.0, avg=60.8
   Topic     : min=20.0, max=100.0, avg=74.5
   Quality   : min=44.0, max=83.3, avg=65.3
   Diversity : min=64.0, max=100.0, avg=92.5
   TOTAL     : min=43.333, max=71.909, avg=54.472




   [Call #5 for step 22: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.9s):
   Grammar   : min=16.7, max=80.0, avg=55.6
   Coherence : min=46.7, max=95.0, avg=63.4
   Topic     : min=20.0, max=100.0, avg=71.6
   Quality   : min=32.5, max=75.0, avg=62.4
   Diversity : min=85.0, max=100.0, avg=96.2
   TOTAL     : min=53.409, max=78.977, avg=62.352




   [Call #6 for step 22: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.3s):
   Grammar   : min=0.0, max=90.0, avg=50.8
   Coherence : min=30.0, max=100.0, avg=69.9
   Topic     : min=70.0, max=100.0, avg=91.5
   Quality   : min=28.0, max=83.3, avg=58.1
   Diversity : min=80.0, max=100.0, avg=95.0
   TOTAL     : min=42.500, max=81.273, avg=61.189





🔍 Step 22 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 23: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 18/36 (50.0%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 6/36 (16.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.8s):
   Grammar   : min=0.0, max=90.0, avg=50.9
   Coherence : min=40.0, max=100.0, avg=67.6
   Topic     : min=34.0, max=92.5, avg=75.5
   Quality   : min=56.7, max=83.3, avg=74.5
   Diversity : min=64.0, max=100.0, avg=91.4
   TOTAL     : min=49.636, max=69.318, avg=59.198




   [Call #2 for step 23: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 16/33 (48.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 6/33 (18.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.0s):
   Grammar   : min=0.0, max=76.7, avg=44.4
   Coherence : min=54.0, max=96.0, avg=75.0
   Topic     : min=13.3, max=92.0, avg=42.2
   Quality   : min=51.7, max=83.3, avg=71.1
   Diversity : min=76.0, max=100.0, avg=97.8
   TOTAL     : min=0.000, max=78.727, avg=55.750




   [Call #3 for step 23: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 17/33 (51.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 5/33 (15.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.8s):
   Grammar   : min=0.0, max=72.0, avg=39.8
   Coherence : min=40.0, max=100.0, avg=70.2
   Topic     : min=20.0, max=96.0, avg=65.8
   Quality   : min=62.0, max=80.0, avg=71.4
   Diversity : min=64.0, max=100.0, avg=86.9
   TOTAL     : min=0.000, max=81.273, avg=60.652




   [Call #4 for step 23: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.9s):
   Grammar   : min=18.0, max=90.0, avg=53.6
   Coherence : min=30.0, max=96.0, avg=67.8
   Topic     : min=30.0, max=100.0, avg=72.1
   Quality   : min=55.0, max=100.0, avg=75.6
   Diversity : min=64.0, max=100.0, avg=89.0
   TOTAL     : min=40.227, max=70.303, avg=56.619




   [Call #5 for step 23: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.2s):
   Grammar   : min=18.0, max=90.0, avg=60.8
   Coherence : min=46.0, max=100.0, avg=72.2
   Topic     : min=40.0, max=100.0, avg=76.9
   Quality   : min=45.0, max=83.3, avg=71.1
   Diversity : min=70.0, max=100.0, avg=93.2
   TOTAL     : min=45.303, max=86.455, avg=67.610




   [Call #6 for step 23: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.9s):
   Grammar   : min=16.7, max=90.0, avg=45.1
   Coherence : min=40.0, max=82.5, avg=64.4
   Topic     : min=30.0, max=96.7, avg=81.9
   Quality   : min=52.5, max=83.3, avg=72.4
   Diversity : min=85.0, max=100.0, avg=98.6
   TOTAL     : min=0.000, max=79.091, avg=53.328





🔍 Step 23 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 24: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.7s):
   Grammar   : min=22.5, max=90.0, avg=70.3
   Coherence : min=30.0, max=90.0, avg=65.6
   Topic     : min=20.0, max=100.0, avg=78.5
   Quality   : min=10.0, max=87.5, avg=67.0
   Diversity : min=30.0, max=100.0, avg=78.3
   TOTAL     : min=25.455, max=79.432, avg=59.283




   [Call #2 for step 24: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.1s):
   Grammar   : min=0.0, max=70.0, avg=43.3
   Coherence : min=47.5, max=100.0, avg=72.8
   Topic     : min=15.0, max=100.0, avg=53.3
   Quality   : min=55.0, max=75.0, avg=70.3
   Diversity : min=64.0, max=100.0, avg=93.2
   TOTAL     : min=44.364, max=79.432, avg=63.091




   [Call #3 for step 24: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.1s):
   Grammar   : min=0.0, max=60.0, avg=37.7
   Coherence : min=35.0, max=100.0, avg=62.3
   Topic     : min=30.0, max=100.0, avg=77.4
   Quality   : min=65.0, max=100.0, avg=81.1
   Diversity : min=80.0, max=100.0, avg=92.1
   TOTAL     : min=36.364, max=69.545, avg=55.161




   [Call #4 for step 24: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.8s):
   Grammar   : min=22.5, max=90.0, avg=62.4
   Coherence : min=40.0, max=100.0, avg=61.4
   Topic     : min=8.0, max=100.0, avg=80.6
   Quality   : min=52.0, max=83.3, avg=69.5
   Diversity : min=80.0, max=100.0, avg=91.2
   TOTAL     : min=44.318, max=79.545, avg=62.283




   [Call #5 for step 24: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 14/33 (42.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 8/33 (24.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.3s):
   Grammar   : min=0.0, max=90.0, avg=53.3
   Coherence : min=42.5, max=100.0, avg=69.6
   Topic     : min=30.0, max=100.0, avg=83.9
   Quality   : min=32.5, max=83.3, avg=68.3
   Diversity : min=76.0, max=100.0, avg=94.3
   TOTAL     : min=0.000, max=67.273, avg=52.378




   [Call #6 for step 24: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.8s):
   Grammar   : min=0.0, max=82.0, avg=50.5
   Coherence : min=45.0, max=100.0, avg=76.9
   Topic     : min=52.5, max=100.0, avg=83.8
   Quality   : min=55.0, max=80.0, avg=71.9
   Diversity : min=60.0, max=100.0, avg=80.8
   TOTAL     : min=33.409, max=73.455, avg=55.445


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



📊 Running validation at step 25...

  Generating for validation sample 1/10...

🔍 Step 24 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 2/10...




   [Call #2 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 3/10...




   [Call #3 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 4/10...




   [Call #4 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 5/10...




   [Call #5 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 6/10...




   [Call #6 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 7/10...




   [Call #7 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 8/10...




   [Call #8 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 9/10...




   [Call #9 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 10/10...




   [Call #10 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

📊 Validation Results at Step 25:
   Average Reward: 0.00
   Max Reward:     0.00
   Min Reward:     0.00
   Saved to: models/TeacherPet_italian_grpo/validation_results/validation_step_25.json






🔍 Step 0 TOTAL: 10 reward calls, 10 completions scored
   [Call #1 for step 25: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (50.9s):
   Grammar   : min=0.0, max=80.0, avg=41.0
   Coherence : min=34.0, max=73.3, avg=55.3
   Topic     : min=32.0, max=100.0, avg=74.2
   Quality   : min=56.7, max=83.3, avg=70.6
   Diversity : min=76.0, max=100.0, avg=91.2
   TOTAL     : min=45.758, max=72.576, avg=55.254




   [Call #2 for step 25: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 17/33 (51.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 5/33 (15.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (49.4s):
   Grammar   : min=0.0, max=90.0, avg=42.5
   Coherence : min=42.5, max=84.0, avg=61.9
   Topic     : min=15.0, max=94.0, avg=48.6
   Quality   : min=46.0, max=80.0, avg=70.0
   Diversity : min=70.0, max=100.0, avg=89.9
   TOTAL     : min=0.000, max=79.636, avg=59.055




   [Call #3 for step 25: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (50.8s):
   Grammar   : min=30.0, max=90.0, avg=58.3
   Coherence : min=30.0, max=100.0, avg=67.2
   Topic     : min=5.0, max=100.0, avg=74.7
   Quality   : min=50.0, max=87.5, avg=75.4
   Diversity : min=85.0, max=100.0, avg=95.5
   TOTAL     : min=56.515, max=81.727, avg=66.512




   [Call #4 for step 25: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.8s):
   Grammar   : min=0.0, max=90.0, avg=43.7
   Coherence : min=20.0, max=100.0, avg=54.0
   Topic     : min=10.0, max=100.0, avg=56.0
   Quality   : min=10.0, max=83.3, avg=53.5
   Diversity : min=30.0, max=100.0, avg=64.2
   TOTAL     : min=20.909, max=55.682, avg=40.451




   [Call #5 for step 25: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...




    ⏸️  deepseek:deepseek-chat → cooldown (2m)

   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 23/36 (63.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 1/36 (2.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.9s):
   Grammar   : min=0.0, max=80.0, avg=35.0
   Coherence : min=40.0, max=95.0, avg=66.9
   Topic     : min=12.0, max=98.0, avg=56.6
   Quality   : min=62.5, max=80.0, avg=73.2
   Diversity : min=70.0, max=100.0, avg=92.2
   TOTAL     : min=35.000, max=84.727, avg=64.072




   [Call #6 for step 25: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 24/36 (66.7%)
      🟢 Openai: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (17.7s):
   Grammar   : min=0.0, max=90.0, avg=54.8
   Coherence : min=50.0, max=100.0, avg=69.9
   Topic     : min=13.3, max=100.0, avg=61.8
   Quality   : min=50.0, max=87.5, avg=70.4
   Diversity : min=88.0, max=100.0, avg=99.0
   TOTAL     : min=41.970, max=80.682, avg=66.317





🔍 Step 25 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 26: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 24/36 (66.7%)
      🟢 Openai: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (18.3s):
   Grammar   : min=0.0, max=90.0, avg=58.3
   Coherence : min=23.3, max=100.0, avg=63.9
   Topic     : min=0.0, max=97.5, avg=50.9
   Quality   : min=62.0, max=83.3, avg=72.3
   Diversity : min=76.0, max=100.0, avg=88.8
   TOTAL     : min=54.455, max=74.697, avg=65.916




   [Call #2 for step 26: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...




    🔄 deepseek:deepseek-chat → back online

   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 23/36 (63.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 1/36 (2.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (32.3s):
   Grammar   : min=22.5, max=90.0, avg=59.6
   Coherence : min=37.5, max=85.0, avg=61.2
   Topic     : min=20.0, max=95.0, avg=52.0
   Quality   : min=52.5, max=87.5, avg=71.8
   Diversity : min=88.0, max=100.0, avg=98.0
   TOTAL     : min=46.970, max=83.523, avg=64.611




   [Call #3 for step 26: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.0s):
   Grammar   : min=45.0, max=80.0, avg=53.3
   Coherence : min=40.0, max=100.0, avg=71.3
   Topic     : min=27.5, max=100.0, avg=69.8
   Quality   : min=30.0, max=75.0, avg=64.1
   Diversity : min=64.0, max=100.0, avg=84.2
   TOTAL     : min=40.909, max=69.818, avg=54.998




   [Call #4 for step 26: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.4s):
   Grammar   : min=20.0, max=90.0, avg=52.8
   Coherence : min=36.0, max=100.0, avg=61.9
   Topic     : min=64.0, max=100.0, avg=92.4
   Quality   : min=50.0, max=83.3, avg=70.0
   Diversity : min=70.0, max=100.0, avg=89.5
   TOTAL     : min=49.091, max=86.091, avg=63.917




   [Call #5 for step 26: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.7s):
   Grammar   : min=36.0, max=90.0, avg=69.6
   Coherence : min=58.0, max=100.0, avg=74.0
   Topic     : min=10.0, max=55.0, avg=26.9
   Quality   : min=46.0, max=87.5, avg=70.5
   Diversity : min=88.0, max=100.0, avg=97.0
   TOTAL     : min=37.182, max=71.545, avg=56.597




   [Call #6 for step 26: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.7s):
   Grammar   : min=0.0, max=90.0, avg=48.7
   Coherence : min=47.5, max=84.0, avg=61.1
   Topic     : min=13.3, max=100.0, avg=70.7
   Quality   : min=60.0, max=87.5, avg=77.3
   Diversity : min=85.0, max=100.0, avg=96.2
   TOTAL     : min=56.023, max=78.727, avg=67.218





🔍 Step 26 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 27: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.9s):
   Grammar   : min=30.0, max=90.0, avg=53.7
   Coherence : min=30.0, max=96.0, avg=56.0
   Topic     : min=20.0, max=96.7, avg=57.2
   Quality   : min=56.7, max=83.3, avg=70.7
   Diversity : min=80.0, max=100.0, avg=90.4
   TOTAL     : min=43.182, max=74.909, avg=60.868




   [Call #2 for step 27: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 18/36 (50.0%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 6/36 (16.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.7s):
   Grammar   : min=36.0, max=90.0, avg=56.6
   Coherence : min=58.0, max=90.0, avg=72.3
   Topic     : min=8.0, max=100.0, avg=53.2
   Quality   : min=50.0, max=80.0, avg=68.8
   Diversity : min=76.0, max=100.0, avg=88.0
   TOTAL     : min=34.773, max=80.545, avg=59.042




   [Call #3 for step 27: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 16/33 (48.5%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 6/33 (18.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.2s):
   Grammar   : min=0.0, max=76.7, avg=51.8
   Coherence : min=52.0, max=96.0, avg=74.1
   Topic     : min=4.0, max=100.0, avg=62.9
   Quality   : min=62.5, max=83.3, avg=74.7
   Diversity : min=76.0, max=100.0, avg=92.1
   TOTAL     : min=0.000, max=80.606, avg=55.688




   [Call #4 for step 27: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 13/33 (39.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 9/33 (27.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (32.9s):
   Grammar   : min=0.0, max=90.0, avg=48.9
   Coherence : min=20.0, max=100.0, avg=70.8
   Topic     : min=60.0, max=100.0, avg=85.3
   Quality   : min=50.0, max=100.0, avg=70.3
   Diversity : min=30.0, max=100.0, avg=73.0
   TOTAL     : min=0.000, max=65.909, avg=44.261




   [Call #5 for step 27: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 18/36 (50.0%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 6/36 (16.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.4s):
   Grammar   : min=0.0, max=74.0, avg=50.4
   Coherence : min=47.5, max=100.0, avg=69.8
   Topic     : min=8.0, max=100.0, avg=55.0
   Quality   : min=52.5, max=80.0, avg=73.9
   Diversity : min=85.0, max=100.0, avg=96.7
   TOTAL     : min=60.909, max=85.545, avg=73.884




   [Call #6 for step 27: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.3s):
   Grammar   : min=36.0, max=90.0, avg=56.4
   Coherence : min=47.5, max=100.0, avg=74.3
   Topic     : min=25.0, max=100.0, avg=65.5
   Quality   : min=60.0, max=87.5, avg=74.7
   Diversity : min=60.0, max=100.0, avg=85.5
   TOTAL     : min=38.182, max=85.727, avg=64.727





🔍 Step 27 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 28: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.1s):
   Grammar   : min=0.0, max=90.0, avg=45.8
   Coherence : min=30.0, max=95.0, avg=72.3
   Topic     : min=20.0, max=100.0, avg=83.7
   Quality   : min=70.0, max=87.5, avg=79.2
   Diversity : min=85.0, max=100.0, avg=98.8
   TOTAL     : min=45.758, max=77.841, avg=62.070




   [Call #2 for step 28: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.6s):
   Grammar   : min=45.0, max=95.0, avg=67.0
   Coherence : min=30.0, max=100.0, avg=72.5
   Topic     : min=47.5, max=100.0, avg=82.6
   Quality   : min=70.0, max=87.5, avg=79.3
   Diversity : min=85.0, max=100.0, avg=98.8
   TOTAL     : min=61.136, max=78.977, avg=69.081




   [Call #3 for step 28: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.5s):
   Grammar   : min=25.0, max=90.0, avg=51.7
   Coherence : min=50.0, max=95.0, avg=72.4
   Topic     : min=52.5, max=100.0, avg=77.2
   Quality   : min=50.0, max=87.5, avg=81.5
   Diversity : min=80.0, max=100.0, avg=97.1
   TOTAL     : min=45.000, max=78.864, avg=65.009




   [Call #4 for step 28: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      🟢 Openai: 10/30 (33.3%)
      ⚡ Groq: 10/30 (33.3%)
      ❓ Deepseek: 10/30 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (34.4s):
   Grammar   : min=0.0, max=57.5, avg=30.3
   Coherence : min=54.0, max=100.0, avg=76.2
   Topic     : min=80.0, max=100.0, avg=93.2
   Quality   : min=42.5, max=87.5, avg=73.7
   Diversity : min=76.0, max=100.0, avg=94.6
   TOTAL     : min=0.000, max=73.182, avg=52.831




   [Call #5 for step 28: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.3s):
   Grammar   : min=0.0, max=100.0, avg=41.9
   Coherence : min=23.3, max=90.0, avg=59.2
   Topic     : min=6.7, max=90.0, avg=41.7
   Quality   : min=56.7, max=100.0, avg=78.1
   Diversity : min=70.0, max=100.0, avg=92.9
   TOTAL     : min=33.788, max=70.455, avg=50.805




   [Call #6 for step 28: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (52.4s):
   Grammar   : min=0.0, max=100.0, avg=49.8
   Coherence : min=37.5, max=80.0, avg=59.1
   Topic     : min=43.3, max=98.0, avg=65.9
   Quality   : min=54.0, max=90.0, avg=74.8
   Diversity : min=70.0, max=100.0, avg=92.2
   TOTAL     : min=50.273, max=75.000, avg=64.678





🔍 Step 28 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 29: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (48.7s):
   Grammar   : min=0.0, max=67.5, avg=35.8
   Coherence : min=40.0, max=100.0, avg=64.1
   Topic     : min=12.5, max=100.0, avg=76.3
   Quality   : min=52.5, max=87.5, avg=77.2
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=41.000, max=80.455, avg=62.447




   [Call #2 for step 29: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.8s):
   Grammar   : min=0.0, max=90.0, avg=37.9
   Coherence : min=40.0, max=100.0, avg=69.3
   Topic     : min=70.0, max=100.0, avg=91.4
   Quality   : min=55.0, max=90.0, avg=78.8
   Diversity : min=60.0, max=100.0, avg=86.5
   TOTAL     : min=35.455, max=83.295, avg=57.180




   [Call #3 for step 29: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.0s):
   Grammar   : min=0.0, max=90.0, avg=55.3
   Coherence : min=47.5, max=100.0, avg=78.8
   Topic     : min=48.0, max=100.0, avg=82.0
   Quality   : min=50.0, max=100.0, avg=71.3
   Diversity : min=30.0, max=100.0, avg=82.1
   TOTAL     : min=37.273, max=79.091, avg=59.616




   [Call #4 for step 29: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.7s):
   Grammar   : min=30.0, max=90.0, avg=57.9
   Coherence : min=14.0, max=100.0, avg=71.8
   Topic     : min=10.0, max=100.0, avg=80.5
   Quality   : min=55.0, max=80.0, avg=74.5
   Diversity : min=60.0, max=100.0, avg=86.0
   TOTAL     : min=35.909, max=75.455, avg=60.542




   [Call #5 for step 29: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.7s):
   Grammar   : min=0.0, max=100.0, avg=45.5
   Coherence : min=60.0, max=100.0, avg=80.6
   Topic     : min=10.0, max=100.0, avg=71.9
   Quality   : min=55.0, max=87.5, avg=77.1
   Diversity : min=80.0, max=100.0, avg=93.8
   TOTAL     : min=39.091, max=85.000, avg=63.724




   [Call #6 for step 29: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.1s):
   Grammar   : min=0.0, max=90.0, avg=39.2
   Coherence : min=47.5, max=83.3, avg=68.2
   Topic     : min=46.7, max=100.0, avg=80.7
   Quality   : min=56.7, max=87.5, avg=76.5
   Diversity : min=76.0, max=100.0, avg=94.1
   TOTAL     : min=50.606, max=76.364, avg=62.972





🔍 Step 29 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 30: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (49.6s):
   Grammar   : min=0.0, max=72.0, avg=43.6
   Coherence : min=62.5, max=96.0, avg=77.0
   Topic     : min=46.0, max=100.0, avg=78.0
   Quality   : min=51.2, max=90.0, avg=65.2
   Diversity : min=85.0, max=100.0, avg=95.5
   TOTAL     : min=42.182, max=73.182, avg=61.695




   [Call #2 for step 30: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 21/36 (58.3%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 3/36 (8.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (55.3s):
   Grammar   : min=0.0, max=82.0, avg=44.8
   Coherence : min=16.0, max=100.0, avg=70.2
   Topic     : min=0.0, max=100.0, avg=47.5
   Quality   : min=61.0, max=90.0, avg=77.0
   Diversity : min=88.0, max=100.0, avg=96.0
   TOTAL     : min=45.091, max=74.091, avg=62.773




   [Call #3 for step 30: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.1s):
   Grammar   : min=56.0, max=90.0, avg=79.5
   Coherence : min=58.0, max=100.0, avg=79.3
   Topic     : min=20.0, max=100.0, avg=68.7
   Quality   : min=10.0, max=100.0, avg=64.5
   Diversity : min=30.0, max=100.0, avg=62.0
   TOTAL     : min=25.455, max=78.091, avg=52.765




   [Call #4 for step 30: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 18/36 (50.0%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 6/36 (16.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (49.5s):
   Grammar   : min=0.0, max=67.5, avg=40.4
   Coherence : min=50.0, max=100.0, avg=74.4
   Topic     : min=16.0, max=100.0, avg=49.4
   Quality   : min=60.0, max=90.0, avg=78.8
   Diversity : min=88.0, max=100.0, avg=98.0
   TOTAL     : min=52.386, max=77.636, avg=65.938




   [Call #5 for step 30: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 18/36 (50.0%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 6/36 (16.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (51.7s):
   Grammar   : min=0.0, max=74.0, avg=48.8
   Coherence : min=54.0, max=100.0, avg=75.9
   Topic     : min=26.0, max=100.0, avg=62.8
   Quality   : min=56.0, max=87.5, avg=71.6
   Diversity : min=76.0, max=100.0, avg=94.0
   TOTAL     : min=45.727, max=76.909, avg=65.752




   [Call #6 for step 30: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.1s):
   Grammar   : min=0.0, max=100.0, avg=44.6
   Coherence : min=47.5, max=85.0, avg=64.8
   Topic     : min=20.0, max=100.0, avg=76.2
   Quality   : min=75.0, max=100.0, avg=84.8
   Diversity : min=80.0, max=100.0, avg=95.8
   TOTAL     : min=40.000, max=81.364, avg=60.682





🔍 Step 30 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 31: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.8s):
   Grammar   : min=22.5, max=100.0, avg=63.6
   Coherence : min=53.3, max=100.0, avg=78.8
   Topic     : min=55.0, max=100.0, avg=87.3
   Quality   : min=52.5, max=90.0, avg=75.5
   Diversity : min=85.0, max=100.0, avg=98.8
   TOTAL     : min=50.682, max=80.364, avg=67.599




   [Call #2 for step 31: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.1s):
   Grammar   : min=0.0, max=90.0, avg=59.6
   Coherence : min=56.7, max=100.0, avg=78.5
   Topic     : min=35.0, max=100.0, avg=64.7
   Quality   : min=62.0, max=100.0, avg=85.2
   Diversity : min=80.0, max=100.0, avg=97.3
   TOTAL     : min=53.182, max=80.114, avg=63.961




   [Call #3 for step 31: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.9s):
   Grammar   : min=56.0, max=90.0, avg=70.0
   Coherence : min=42.5, max=96.0, avg=75.2
   Topic     : min=40.0, max=100.0, avg=79.7
   Quality   : min=52.5, max=90.0, avg=77.8
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=50.795, max=87.000, avg=73.979




   [Call #4 for step 31: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (34.5s):
   Grammar   : min=0.0, max=100.0, avg=67.5
   Coherence : min=58.0, max=100.0, avg=73.9
   Topic     : min=20.0, max=100.0, avg=66.3
   Quality   : min=72.0, max=87.5, avg=77.4
   Diversity : min=76.0, max=100.0, avg=96.5
   TOTAL     : min=0.000, max=76.273, avg=58.521




   [Call #5 for step 31: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.9s):
   Grammar   : min=0.0, max=82.0, avg=49.1
   Coherence : min=40.0, max=85.0, avg=67.3
   Topic     : min=16.0, max=100.0, avg=68.9
   Quality   : min=60.0, max=82.0, avg=73.8
   Diversity : min=70.0, max=100.0, avg=89.0
   TOTAL     : min=47.727, max=77.273, avg=59.903




   [Call #6 for step 31: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.0s):
   Grammar   : min=45.0, max=82.0, avg=57.0
   Coherence : min=47.5, max=100.0, avg=81.5
   Topic     : min=25.0, max=100.0, avg=66.4
   Quality   : min=52.5, max=90.0, avg=80.5
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=42.841, max=83.295, avg=64.229





🔍 Step 31 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 32: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.9s):
   Grammar   : min=0.0, max=90.0, avg=44.4
   Coherence : min=53.3, max=100.0, avg=71.6
   Topic     : min=36.7, max=100.0, avg=80.8
   Quality   : min=70.0, max=87.5, avg=82.4
   Diversity : min=80.0, max=100.0, avg=95.8
   TOTAL     : min=52.614, max=74.318, avg=63.548




   [Call #2 for step 32: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.2s):
   Grammar   : min=0.0, max=100.0, avg=46.6
   Coherence : min=50.0, max=100.0, avg=80.3
   Topic     : min=26.0, max=95.0, avg=63.4
   Quality   : min=66.7, max=90.0, avg=80.3
   Diversity : min=80.0, max=100.0, avg=96.8
   TOTAL     : min=0.000, max=83.030, avg=60.471




   [Call #3 for step 32: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 13/33 (39.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 9/33 (27.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.0s):
   Grammar   : min=0.0, max=90.0, avg=47.0
   Coherence : min=46.7, max=96.0, avg=72.9
   Topic     : min=46.7, max=100.0, avg=86.1
   Quality   : min=53.3, max=90.0, avg=74.8
   Diversity : min=88.0, max=100.0, avg=98.9
   TOTAL     : min=0.000, max=78.636, avg=59.169




   [Call #4 for step 32: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.5s):
   Grammar   : min=0.0, max=90.0, avg=56.5
   Coherence : min=56.0, max=100.0, avg=74.9
   Topic     : min=24.0, max=100.0, avg=69.0
   Quality   : min=60.0, max=90.0, avg=76.5
   Diversity : min=85.0, max=100.0, avg=97.8
   TOTAL     : min=43.364, max=82.636, avg=60.280




   [Call #5 for step 32: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.9s):
   Grammar   : min=0.0, max=90.0, avg=42.4
   Coherence : min=34.0, max=100.0, avg=65.2
   Topic     : min=10.0, max=100.0, avg=69.8
   Quality   : min=30.0, max=100.0, avg=66.8
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=38.068, max=72.159, avg=56.542




   [Call #6 for step 32: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (35.5s):
   Grammar   : min=0.0, max=76.7, avg=32.5
   Coherence : min=30.0, max=100.0, avg=59.7
   Topic     : min=13.3, max=100.0, avg=66.1
   Quality   : min=10.0, max=100.0, avg=73.0
   Diversity : min=30.0, max=100.0, avg=80.8
   TOTAL     : min=20.000, max=70.909, avg=58.245





🔍 Step 32 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 33: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.2s):
   Grammar   : min=0.0, max=67.5, avg=36.0
   Coherence : min=62.5, max=100.0, avg=81.8
   Topic     : min=20.0, max=100.0, avg=72.2
   Quality   : min=52.5, max=100.0, avg=78.1
   Diversity : min=60.0, max=100.0, avg=91.1
   TOTAL     : min=40.727, max=63.977, avg=51.284




   [Call #2 for step 33: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.1s):
   Grammar   : min=0.0, max=90.0, avg=53.1
   Coherence : min=72.0, max=100.0, avg=88.4
   Topic     : min=24.0, max=100.0, avg=81.2
   Quality   : min=66.7, max=100.0, avg=85.2
   Diversity : min=70.0, max=100.0, avg=92.5
   TOTAL     : min=46.455, max=83.636, avg=62.987




   [Call #3 for step 33: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (31.2s):
   Grammar   : min=0.0, max=100.0, avg=40.9
   Coherence : min=53.3, max=100.0, avg=79.8
   Topic     : min=27.5, max=100.0, avg=78.7
   Quality   : min=50.0, max=100.0, avg=68.9
   Diversity : min=30.0, max=100.0, avg=80.8
   TOTAL     : min=33.636, max=75.455, avg=55.600




   [Call #4 for step 33: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.1s):
   Grammar   : min=0.0, max=100.0, avg=48.6
   Coherence : min=32.5, max=93.3, avg=65.4
   Topic     : min=50.0, max=100.0, avg=73.3
   Quality   : min=70.0, max=89.0, avg=81.0
   Diversity : min=80.0, max=100.0, avg=98.3
   TOTAL     : min=51.061, max=82.273, avg=61.972




   [Call #5 for step 33: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.9s):
   Grammar   : min=0.0, max=100.0, avg=55.3
   Coherence : min=50.0, max=100.0, avg=75.2
   Topic     : min=56.7, max=100.0, avg=89.2
   Quality   : min=60.0, max=83.3, avg=73.6
   Diversity : min=80.0, max=100.0, avg=97.3
   TOTAL     : min=44.242, max=81.909, avg=61.907




   [Call #6 for step 33: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.1s):
   Grammar   : min=0.0, max=100.0, avg=58.6
   Coherence : min=53.3, max=98.0, avg=70.8
   Topic     : min=44.0, max=100.0, avg=81.0
   Quality   : min=80.0, max=87.5, avg=84.6
   Diversity : min=85.0, max=100.0, avg=97.8
   TOTAL     : min=42.909, max=81.667, avg=67.958





🔍 Step 33 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 34: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.4s):
   Grammar   : min=0.0, max=90.0, avg=60.4
   Coherence : min=60.0, max=100.0, avg=78.6
   Topic     : min=20.0, max=100.0, avg=85.0
   Quality   : min=30.0, max=100.0, avg=73.0
   Diversity : min=80.0, max=100.0, avg=94.0
   TOTAL     : min=42.727, max=72.545, avg=58.473




   [Call #2 for step 34: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.0s):
   Grammar   : min=16.7, max=100.0, avg=71.5
   Coherence : min=46.7, max=88.0, avg=65.9
   Topic     : min=0.0, max=100.0, avg=47.8
   Quality   : min=56.7, max=90.0, avg=79.9
   Diversity : min=80.0, max=100.0, avg=97.3
   TOTAL     : min=46.591, max=74.394, avg=61.308




   [Call #3 for step 34: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.5s):
   Grammar   : min=0.0, max=100.0, avg=47.1
   Coherence : min=55.0, max=95.0, avg=79.8
   Topic     : min=20.0, max=100.0, avg=59.5
   Quality   : min=66.7, max=90.0, avg=77.9
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=46.667, max=89.091, avg=65.779




   [Call #4 for step 34: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.9s):
   Grammar   : min=0.0, max=90.0, avg=45.7
   Coherence : min=23.3, max=96.0, avg=72.9
   Topic     : min=0.0, max=100.0, avg=62.8
   Quality   : min=56.7, max=90.0, avg=77.7
   Diversity : min=80.0, max=100.0, avg=97.1
   TOTAL     : min=36.818, max=87.455, avg=64.812




   [Call #5 for step 34: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (33.8s):
   Grammar   : min=0.0, max=80.0, avg=38.4
   Coherence : min=43.3, max=95.0, avg=64.4
   Topic     : min=26.7, max=100.0, avg=79.8
   Quality   : min=55.0, max=100.0, avg=81.6
   Diversity : min=60.0, max=100.0, avg=88.3
   TOTAL     : min=27.955, max=72.273, avg=53.229




   [Call #6 for step 34: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.7s):
   Grammar   : min=33.3, max=100.0, avg=56.3
   Coherence : min=50.0, max=94.0, avg=70.1
   Topic     : min=43.3, max=100.0, avg=86.0
   Quality   : min=75.0, max=100.0, avg=85.0
   Diversity : min=60.0, max=100.0, avg=95.7
   TOTAL     : min=41.136, max=73.333, avg=61.492





🔍 Step 34 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 35: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.0s):
   Grammar   : min=0.0, max=100.0, avg=49.7
   Coherence : min=50.0, max=100.0, avg=74.0
   Topic     : min=53.3, max=100.0, avg=79.9
   Quality   : min=66.7, max=87.5, avg=80.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=44.545, max=74.091, avg=59.438




   [Call #2 for step 35: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.1s):
   Grammar   : min=0.0, max=90.0, avg=54.7
   Coherence : min=50.0, max=97.5, avg=72.6
   Topic     : min=20.0, max=100.0, avg=71.1
   Quality   : min=70.0, max=100.0, avg=80.6
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=50.152, max=83.636, avg=66.184




   [Call #3 for step 35: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.6s):
   Grammar   : min=0.0, max=100.0, avg=50.0
   Coherence : min=56.7, max=95.0, avg=76.7
   Topic     : min=46.7, max=100.0, avg=81.5
   Quality   : min=66.7, max=87.5, avg=80.5
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=58.455, max=79.848, avg=67.799




   [Call #4 for step 35: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.0s):
   Grammar   : min=0.0, max=100.0, avg=55.8
   Coherence : min=50.0, max=96.0, avg=72.8
   Topic     : min=36.0, max=96.7, avg=69.8
   Quality   : min=70.0, max=90.0, avg=81.8
   Diversity : min=80.0, max=100.0, avg=96.7
   TOTAL     : min=57.273, max=83.364, avg=73.211




   [Call #5 for step 35: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.8s):
   Grammar   : min=0.0, max=75.0, avg=25.4
   Coherence : min=55.0, max=100.0, avg=81.0
   Topic     : min=10.0, max=100.0, avg=75.5
   Quality   : min=50.0, max=100.0, avg=82.8
   Diversity : min=88.0, max=100.0, avg=99.0
   TOTAL     : min=46.364, max=76.818, avg=62.657




   [Call #6 for step 35: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (31.3s):
   Grammar   : min=50.0, max=90.0, avg=71.6
   Coherence : min=40.0, max=100.0, avg=79.3
   Topic     : min=27.5, max=100.0, avg=79.7
   Quality   : min=50.0, max=100.0, avg=75.0
   Diversity : min=30.0, max=100.0, avg=88.2
   TOTAL     : min=0.000, max=71.818, avg=59.668





🔍 Step 35 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 36: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.8s):
   Grammar   : min=16.7, max=80.0, avg=51.3
   Coherence : min=42.5, max=95.0, avg=73.1
   Topic     : min=0.0, max=98.0, avg=44.1
   Quality   : min=75.0, max=90.0, avg=85.1
   Diversity : min=80.0, max=100.0, avg=97.1
   TOTAL     : min=39.091, max=80.182, avg=63.559




   [Call #2 for step 36: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.6s):
   Grammar   : min=30.0, max=100.0, avg=65.3
   Coherence : min=50.0, max=100.0, avg=71.5
   Topic     : min=10.0, max=100.0, avg=69.7
   Quality   : min=30.0, max=100.0, avg=78.5
   Diversity : min=80.0, max=100.0, avg=98.3
   TOTAL     : min=54.545, max=84.545, avg=68.472




   [Call #3 for step 36: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 11/30 (36.7%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 9/30 (30.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (29.0s):
   Grammar   : min=0.0, max=66.7, avg=41.3
   Coherence : min=25.0, max=95.0, avg=66.1
   Topic     : min=20.0, max=100.0, avg=63.4
   Quality   : min=75.0, max=100.0, avg=87.6
   Diversity : min=80.0, max=100.0, avg=98.0
   TOTAL     : min=0.000, max=74.318, avg=53.199




   [Call #4 for step 36: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.8s):
   Grammar   : min=40.0, max=90.0, avg=64.3
   Coherence : min=20.0, max=100.0, avg=77.8
   Topic     : min=0.0, max=100.0, avg=62.6
   Quality   : min=50.0, max=100.0, avg=88.2
   Diversity : min=70.0, max=100.0, avg=92.5
   TOTAL     : min=60.000, max=86.970, avg=70.491




   [Call #5 for step 36: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (34.1s):
   Grammar   : min=0.0, max=80.0, avg=37.0
   Coherence : min=62.5, max=100.0, avg=87.6
   Topic     : min=15.0, max=100.0, avg=79.6
   Quality   : min=50.0, max=100.0, avg=89.2
   Diversity : min=30.0, max=100.0, avg=85.8
   TOTAL     : min=40.909, max=85.000, avg=59.347




   [Call #6 for step 36: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.0s):
   Grammar   : min=45.0, max=90.0, avg=63.5
   Coherence : min=60.0, max=100.0, avg=79.2
   Topic     : min=10.0, max=100.0, avg=62.9
   Quality   : min=50.0, max=100.0, avg=79.4
   Diversity : min=80.0, max=100.0, avg=89.6
   TOTAL     : min=50.227, max=74.773, avg=64.470





🔍 Step 36 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 37: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.8s):
   Grammar   : min=0.0, max=90.0, avg=42.0
   Coherence : min=20.0, max=96.0, avg=68.9
   Topic     : min=70.0, max=100.0, avg=88.3
   Quality   : min=50.0, max=100.0, avg=79.4
   Diversity : min=30.0, max=100.0, avg=89.2
   TOTAL     : min=32.727, max=88.273, avg=63.530




   [Call #2 for step 37: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.0s):
   Grammar   : min=0.0, max=100.0, avg=41.8
   Coherence : min=20.0, max=100.0, avg=65.6
   Topic     : min=40.0, max=100.0, avg=80.2
   Quality   : min=80.0, max=100.0, avg=90.0
   Diversity : min=30.0, max=100.0, avg=78.2
   TOTAL     : min=30.000, max=86.091, avg=60.109




   [Call #3 for step 37: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.5s):
   Grammar   : min=0.0, max=90.0, avg=56.7
   Coherence : min=42.0, max=100.0, avg=78.9
   Topic     : min=40.0, max=100.0, avg=79.2
   Quality   : min=66.7, max=100.0, avg=87.4
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=47.576, max=85.091, avg=67.338




   [Call #4 for step 37: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.4s):
   Grammar   : min=33.3, max=100.0, avg=68.0
   Coherence : min=42.5, max=100.0, avg=77.4
   Topic     : min=60.0, max=100.0, avg=88.8
   Quality   : min=50.0, max=87.5, avg=73.9
   Diversity : min=70.0, max=100.0, avg=92.5
   TOTAL     : min=48.636, max=71.061, avg=61.458




   [Call #5 for step 37: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.2s):
   Grammar   : min=26.7, max=100.0, avg=56.9
   Coherence : min=50.0, max=90.0, avg=74.1
   Topic     : min=6.7, max=100.0, avg=76.1
   Quality   : min=66.7, max=100.0, avg=86.9
   Diversity : min=60.0, max=100.0, avg=96.7
   TOTAL     : min=41.212, max=83.000, avg=63.807




   [Call #6 for step 37: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.1s):
   Grammar   : min=45.0, max=90.0, avg=72.7
   Coherence : min=66.7, max=100.0, avg=88.1
   Topic     : min=20.0, max=92.5, avg=50.6
   Quality   : min=70.0, max=100.0, avg=86.9
   Diversity : min=30.0, max=100.0, avg=81.5
   TOTAL     : min=41.818, max=79.205, avg=67.155





🔍 Step 37 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 38: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.7s):
   Grammar   : min=0.0, max=90.0, avg=51.7
   Coherence : min=54.0, max=86.0, avg=73.8
   Topic     : min=4.0, max=95.0, avg=62.7
   Quality   : min=70.0, max=100.0, avg=84.4
   Diversity : min=80.0, max=100.0, avg=97.3
   TOTAL     : min=53.409, max=78.727, avg=64.706




   [Call #2 for step 38: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.2s):
   Grammar   : min=0.0, max=90.0, avg=44.0
   Coherence : min=43.3, max=100.0, avg=77.4
   Topic     : min=30.0, max=100.0, avg=86.7
   Quality   : min=50.0, max=100.0, avg=79.7
   Diversity : min=80.0, max=100.0, avg=95.4
   TOTAL     : min=43.977, max=74.773, avg=60.944




   [Call #3 for step 38: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.6s):
   Grammar   : min=0.0, max=100.0, avg=50.8
   Coherence : min=30.0, max=100.0, avg=69.1
   Topic     : min=30.0, max=100.0, avg=83.1
   Quality   : min=66.7, max=100.0, avg=87.7
   Diversity : min=80.0, max=100.0, avg=98.3
   TOTAL     : min=44.545, max=82.182, avg=64.707




   [Call #4 for step 38: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.6s):
   Grammar   : min=0.0, max=100.0, avg=64.1
   Coherence : min=50.0, max=100.0, avg=79.2
   Topic     : min=4.0, max=100.0, avg=75.1
   Quality   : min=81.0, max=100.0, avg=93.4
   Diversity : min=88.0, max=100.0, avg=98.0
   TOTAL     : min=66.364, max=86.000, avg=78.212




   [Call #5 for step 38: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (32.0s):
   Grammar   : min=25.0, max=100.0, avg=62.1
   Coherence : min=40.0, max=100.0, avg=68.3
   Topic     : min=0.0, max=100.0, avg=60.3
   Quality   : min=50.0, max=100.0, avg=83.7
   Diversity : min=30.0, max=100.0, avg=82.5
   TOTAL     : min=41.364, max=84.273, avg=62.058




   [Call #6 for step 38: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.1s):
   Grammar   : min=0.0, max=88.0, avg=52.4
   Coherence : min=50.0, max=96.0, avg=71.2
   Topic     : min=44.0, max=96.7, avg=76.3
   Quality   : min=52.0, max=90.0, avg=80.3
   Diversity : min=88.0, max=100.0, avg=99.0
   TOTAL     : min=47.636, max=82.000, avg=68.746





🔍 Step 38 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 39: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.0s):
   Grammar   : min=0.0, max=100.0, avg=56.7
   Coherence : min=53.3, max=100.0, avg=75.1
   Topic     : min=40.0, max=100.0, avg=68.7
   Quality   : min=70.0, max=87.5, avg=81.2
   Diversity : min=85.0, max=100.0, avg=98.8
   TOTAL     : min=46.477, max=75.455, avg=61.263




   [Call #2 for step 39: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.6s):
   Grammar   : min=40.0, max=90.0, avg=66.1
   Coherence : min=60.0, max=100.0, avg=87.2
   Topic     : min=20.0, max=100.0, avg=73.1
   Quality   : min=55.0, max=100.0, avg=87.2
   Diversity : min=80.0, max=100.0, avg=96.7
   TOTAL     : min=43.750, max=96.061, avg=63.040




   [Call #3 for step 39: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.8s):
   Grammar   : min=0.0, max=90.0, avg=49.4
   Coherence : min=66.0, max=100.0, avg=86.6
   Topic     : min=58.0, max=98.0, avg=78.9
   Quality   : min=71.0, max=100.0, avg=87.2
   Diversity : min=88.0, max=100.0, avg=99.0
   TOTAL     : min=56.273, max=92.909, avg=68.614




   [Call #4 for step 39: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.5s):
   Grammar   : min=0.0, max=90.0, avg=56.5
   Coherence : min=50.0, max=100.0, avg=78.5
   Topic     : min=6.7, max=100.0, avg=78.5
   Quality   : min=66.7, max=100.0, avg=86.1
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=57.727, max=81.364, avg=69.934




   [Call #5 for step 39: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 9/12 valid JSON (75.0%), 0 empty, 3 failed → 9 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (27 total requests):
      ⚡ Groq: 10/27 (37.0%)
      🟢 Openai: 9/27 (33.3%)
      ❓ Deepseek: 8/27 (29.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (33.9s):
   Grammar   : min=0.0, max=90.0, avg=46.5
   Coherence : min=46.7, max=93.3, avg=67.0
   Topic     : min=24.0, max=100.0, avg=78.2
   Quality   : min=82.0, max=100.0, avg=89.1
   Diversity : min=86.7, max=100.0, avg=97.0
   TOTAL     : min=0.000, max=84.091, avg=51.586




   [Call #6 for step 39: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 18/36 (50.0%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 6/36 (16.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.6s):
   Grammar   : min=18.0, max=72.0, avg=56.2
   Coherence : min=50.0, max=100.0, avg=78.7
   Topic     : min=44.0, max=98.0, avg=80.1
   Quality   : min=70.0, max=90.0, avg=82.1
   Diversity : min=88.0, max=100.0, avg=98.0
   TOTAL     : min=46.455, max=87.727, avg=74.405





🔍 Step 39 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 40: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 9/12 valid JSON (75.0%), 0 empty, 3 failed → 9 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (27 total requests):
      ⚡ Groq: 11/27 (40.7%)
      🟢 Openai: 9/27 (33.3%)
      ❓ Deepseek: 7/27 (25.9%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (30.6s):
   Grammar   : min=45.0, max=90.0, avg=63.6
   Coherence : min=60.0, max=100.0, avg=82.5
   Topic     : min=10.0, max=100.0, avg=71.7
   Quality   : min=50.0, max=100.0, avg=83.3
   Diversity : min=80.0, max=100.0, avg=97.8
   TOTAL     : min=0.000, max=84.659, avg=50.805




   [Call #2 for step 40: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.6s):
   Grammar   : min=40.0, max=100.0, avg=73.8
   Coherence : min=57.5, max=92.0, avg=78.3
   Topic     : min=52.5, max=100.0, avg=84.5
   Quality   : min=82.0, max=100.0, avg=92.3
   Diversity : min=80.0, max=100.0, avg=96.1
   TOTAL     : min=61.023, max=91.061, avg=78.972




   [Call #3 for step 40: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.6s):
   Grammar   : min=0.0, max=100.0, avg=39.7
   Coherence : min=56.7, max=100.0, avg=86.5
   Topic     : min=0.0, max=100.0, avg=66.8
   Quality   : min=66.7, max=100.0, avg=87.2
   Diversity : min=80.0, max=100.0, avg=97.3
   TOTAL     : min=53.409, max=84.818, avg=68.196




   [Call #4 for step 40: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.4s):
   Grammar   : min=30.0, max=100.0, avg=66.4
   Coherence : min=72.0, max=100.0, avg=82.7
   Topic     : min=20.0, max=100.0, avg=74.4
   Quality   : min=82.0, max=100.0, avg=93.3
   Diversity : min=76.0, max=100.0, avg=96.0
   TOTAL     : min=0.000, max=88.523, avg=68.890




   [Call #5 for step 40: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.0s):
   Grammar   : min=45.0, max=75.0, avg=60.4
   Coherence : min=60.0, max=100.0, avg=79.2
   Topic     : min=80.0, max=100.0, avg=95.0
   Quality   : min=75.0, max=100.0, avg=97.9
   Diversity : min=86.7, max=100.0, avg=97.8
   TOTAL     : min=73.902, max=87.538, avg=80.234




   [Call #6 for step 40: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 11/30 (36.7%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 9/30 (30.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (35.5s):
   Grammar   : min=0.0, max=63.3, avg=38.3
   Coherence : min=60.0, max=93.3, avg=78.3
   Topic     : min=6.7, max=100.0, avg=62.0
   Quality   : min=66.7, max=100.0, avg=84.9
   Diversity : min=86.7, max=100.0, avg=98.7
   TOTAL     : min=0.000, max=78.818, avg=50.564





🔍 Step 40 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 41: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 13/33 (39.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 9/33 (27.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.0s):
   Grammar   : min=0.0, max=82.0, avg=54.9
   Coherence : min=53.3, max=100.0, avg=76.2
   Topic     : min=68.0, max=100.0, avg=88.3
   Quality   : min=53.3, max=90.0, avg=82.2
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=0.000, max=89.273, avg=61.966




   [Call #2 for step 41: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.3s):
   Grammar   : min=22.5, max=80.0, avg=55.4
   Coherence : min=50.0, max=100.0, avg=76.0
   Topic     : min=40.0, max=100.0, avg=86.2
   Quality   : min=50.0, max=100.0, avg=83.3
   Diversity : min=70.0, max=100.0, avg=91.4
   TOTAL     : min=60.227, max=87.500, avg=74.265




   [Call #3 for step 41: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 9/12 valid JSON (75.0%), 0 empty, 3 failed → 9 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (27 total requests):
      🟢 Openai: 9/27 (33.3%)
      ⚡ Groq: 9/27 (33.3%)
      ❓ Deepseek: 9/27 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (32.2s):
   Grammar   : min=46.7, max=100.0, avg=71.5
   Coherence : min=50.0, max=93.3, avg=68.9
   Topic     : min=46.7, max=100.0, avg=80.4
   Quality   : min=70.0, max=83.3, avg=78.9
   Diversity : min=80.0, max=100.0, avg=97.8
   TOTAL     : min=0.000, max=85.303, avg=50.896




   [Call #4 for step 41: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.9s):
   Grammar   : min=36.0, max=80.0, avg=63.3
   Coherence : min=54.0, max=100.0, avg=79.5
   Topic     : min=20.0, max=100.0, avg=74.7
   Quality   : min=87.5, max=100.0, avg=95.9
   Diversity : min=88.0, max=100.0, avg=96.7
   TOTAL     : min=0.000, max=92.818, avg=65.381




   [Call #5 for step 41: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.3s):
   Grammar   : min=0.0, max=90.0, avg=54.7
   Coherence : min=40.0, max=100.0, avg=73.7
   Topic     : min=16.0, max=100.0, avg=62.2
   Quality   : min=50.0, max=90.0, avg=77.2
   Diversity : min=70.0, max=100.0, avg=90.5
   TOTAL     : min=52.545, max=76.182, avg=65.659




   [Call #6 for step 41: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.1s):
   Grammar   : min=0.0, max=90.0, avg=48.9
   Coherence : min=73.3, max=94.0, avg=82.8
   Topic     : min=37.5, max=100.0, avg=80.2
   Quality   : min=83.3, max=100.0, avg=88.9
   Diversity : min=86.7, max=100.0, avg=96.7
   TOTAL     : min=59.432, max=81.818, avg=71.711





🔍 Step 41 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 42: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.5s):
   Grammar   : min=25.0, max=100.0, avg=62.5
   Coherence : min=57.5, max=100.0, avg=82.1
   Topic     : min=80.0, max=100.0, avg=92.9
   Quality   : min=65.0, max=87.5, avg=82.5
   Diversity : min=85.0, max=100.0, avg=96.4
   TOTAL     : min=69.205, max=84.015, avg=74.908




   [Call #2 for step 42: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.3s):
   Grammar   : min=25.0, max=83.3, avg=56.9
   Coherence : min=50.0, max=94.0, avg=76.6
   Topic     : min=46.7, max=100.0, avg=82.1
   Quality   : min=55.0, max=100.0, avg=81.1
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=50.455, max=74.818, avg=61.920




   [Call #3 for step 42: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      🟢 Openai: 10/30 (33.3%)
      ⚡ Groq: 10/30 (33.3%)
      ❓ Deepseek: 10/30 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (34.7s):
   Grammar   : min=45.0, max=82.0, avg=59.8
   Coherence : min=30.0, max=100.0, avg=73.9
   Topic     : min=20.0, max=100.0, avg=66.6
   Quality   : min=55.0, max=100.0, avg=86.5
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=0.000, max=85.727, avg=53.045




   [Call #4 for step 42: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      🟢 Openai: 11/33 (33.3%)
      ⚡ Groq: 11/33 (33.3%)
      ❓ Deepseek: 11/33 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.4s):
   Grammar   : min=0.0, max=90.0, avg=66.6
   Coherence : min=46.7, max=100.0, avg=79.7
   Topic     : min=13.3, max=100.0, avg=78.2
   Quality   : min=50.0, max=100.0, avg=75.1
   Diversity : min=70.0, max=100.0, avg=88.9
   TOTAL     : min=0.000, max=75.000, avg=61.089




   [Call #5 for step 42: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.7s):
   Grammar   : min=0.0, max=90.0, avg=52.2
   Coherence : min=43.3, max=95.0, avg=76.7
   Topic     : min=13.3, max=100.0, avg=70.2
   Quality   : min=66.7, max=100.0, avg=81.4
   Diversity : min=85.0, max=100.0, avg=96.5
   TOTAL     : min=38.939, max=83.864, avg=68.077




   [Call #6 for step 42: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.0s):
   Grammar   : min=0.0, max=90.0, avg=59.1
   Coherence : min=60.0, max=96.0, avg=76.0
   Topic     : min=20.0, max=100.0, avg=74.4
   Quality   : min=75.0, max=100.0, avg=91.5
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=46.818, max=93.455, avg=72.383





🔍 Step 42 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 43: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.3s):
   Grammar   : min=33.3, max=100.0, avg=66.3
   Coherence : min=32.5, max=93.3, avg=69.1
   Topic     : min=46.7, max=100.0, avg=80.4
   Quality   : min=83.3, max=100.0, avg=93.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=63.182, max=82.424, avg=73.141




   [Call #2 for step 43: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.7s):
   Grammar   : min=0.0, max=70.0, avg=44.2
   Coherence : min=42.5, max=93.3, avg=62.6
   Topic     : min=13.3, max=100.0, avg=64.6
   Quality   : min=62.5, max=100.0, avg=85.0
   Diversity : min=80.0, max=100.0, avg=98.3
   TOTAL     : min=53.182, max=73.788, avg=61.894




   [Call #3 for step 43: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (46.7s):
   Grammar   : min=0.0, max=67.5, avg=33.2
   Coherence : min=42.5, max=93.3, avg=67.6
   Topic     : min=66.7, max=100.0, avg=88.8
   Quality   : min=75.0, max=100.0, avg=86.0
   Diversity : min=70.0, max=100.0, avg=95.1
   TOTAL     : min=48.864, max=75.909, avg=63.374




   [Call #4 for step 43: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 13/30 (43.3%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 7/30 (23.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.1s):
   Grammar   : min=0.0, max=50.0, avg=28.0
   Coherence : min=22.0, max=100.0, avg=66.5
   Topic     : min=0.0, max=88.0, avg=27.1
   Quality   : min=83.3, max=100.0, avg=94.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=0.000, max=74.000, avg=52.395




   [Call #5 for step 43: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (49.0s):
   Grammar   : min=22.5, max=90.0, avg=50.4
   Coherence : min=40.0, max=100.0, avg=70.7
   Topic     : min=12.0, max=100.0, avg=72.5
   Quality   : min=66.7, max=100.0, avg=87.5
   Diversity : min=86.7, max=100.0, avg=98.9
   TOTAL     : min=55.758, max=82.727, avg=68.794




   [Call #6 for step 43: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 12/30 (40.0%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 8/30 (26.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.1s):
   Grammar   : min=0.0, max=90.0, avg=49.5
   Coherence : min=40.0, max=100.0, avg=66.9
   Topic     : min=65.0, max=100.0, avg=88.4
   Quality   : min=62.5, max=100.0, avg=90.4
   Diversity : min=80.0, max=100.0, avg=95.5
   TOTAL     : min=0.000, max=88.068, avg=60.324





🔍 Step 43 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 44: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.7s):
   Grammar   : min=0.0, max=100.0, avg=71.0
   Coherence : min=30.0, max=100.0, avg=79.6
   Topic     : min=13.3, max=100.0, avg=66.5
   Quality   : min=59.0, max=100.0, avg=83.2
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=59.394, max=82.500, avg=73.547




   [Call #2 for step 44: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 13/33 (39.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 9/33 (27.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.9s):
   Grammar   : min=0.0, max=90.0, avg=64.9
   Coherence : min=55.0, max=100.0, avg=81.1
   Topic     : min=25.0, max=100.0, avg=46.5
   Quality   : min=87.5, max=100.0, avg=92.3
   Diversity : min=80.0, max=100.0, avg=94.5
   TOTAL     : min=0.000, max=89.636, avg=66.367




   [Call #3 for step 44: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 13/33 (39.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 9/33 (27.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.3s):
   Grammar   : min=0.0, max=56.0, avg=23.5
   Coherence : min=42.5, max=100.0, avg=64.7
   Topic     : min=57.5, max=100.0, avg=80.5
   Quality   : min=30.0, max=100.0, avg=70.7
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=0.000, max=68.364, avg=50.989




   [Call #4 for step 44: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 14/33 (42.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 8/33 (24.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.7s):
   Grammar   : min=0.0, max=90.0, avg=57.9
   Coherence : min=56.7, max=100.0, avg=77.7
   Topic     : min=46.0, max=100.0, avg=89.2
   Quality   : min=50.0, max=100.0, avg=80.3
   Diversity : min=80.0, max=100.0, avg=97.1
   TOTAL     : min=0.000, max=84.909, avg=70.337




   [Call #5 for step 44: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (41.9s):
   Grammar   : min=33.3, max=75.0, avg=56.7
   Coherence : min=52.0, max=100.0, avg=77.7
   Topic     : min=10.0, max=100.0, avg=66.0
   Quality   : min=83.3, max=100.0, avg=90.4
   Diversity : min=86.7, max=100.0, avg=98.8
   TOTAL     : min=0.000, max=85.636, avg=64.960




   [Call #6 for step 44: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.3s):
   Grammar   : min=25.0, max=100.0, avg=52.3
   Coherence : min=37.5, max=100.0, avg=63.6
   Topic     : min=20.0, max=97.5, avg=70.2
   Quality   : min=66.7, max=100.0, avg=84.7
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=0.000, max=82.364, avg=59.508





🔍 Step 44 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 45: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (40.7s):
   Grammar   : min=0.0, max=90.0, avg=56.5
   Coherence : min=30.0, max=100.0, avg=72.4
   Topic     : min=33.3, max=100.0, avg=64.1
   Quality   : min=66.7, max=100.0, avg=88.8
   Diversity : min=85.0, max=100.0, avg=98.8
   TOTAL     : min=64.091, max=84.545, avg=73.753




   [Call #2 for step 45: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 16/30 (53.3%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 4/30 (13.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.8s):
   Grammar   : min=0.0, max=72.0, avg=48.2
   Coherence : min=45.0, max=90.0, avg=73.5
   Topic     : min=60.0, max=100.0, avg=87.2
   Quality   : min=62.5, max=100.0, avg=86.2
   Diversity : min=80.0, max=100.0, avg=92.8
   TOTAL     : min=0.000, max=88.364, avg=57.727




   [Call #3 for step 45: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                              

   Parse stats: 5/12 valid JSON (41.7%), 0 empty, 7 failed → 5 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (15 total requests):
      ⚡ Groq: 7/15 (46.7%)
      🟢 Openai: 5/15 (33.3%)
      ❓ Deepseek: 3/15 (20.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (26.2s):
   Grammar   : min=20.0, max=60.0, avg=42.0
   Coherence : min=36.0, max=90.0, avg=72.4
   Topic     : min=44.0, max=94.0, avg=76.8
   Quality   : min=90.0, max=100.0, avg=98.0
   Diversity : min=80.0, max=100.0, avg=90.7
   TOTAL     : min=0.000, max=86.242, avg=31.184




   [Call #4 for step 45: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.1s):
   Grammar   : min=0.0, max=90.0, avg=56.4
   Coherence : min=40.0, max=94.0, avg=71.4
   Topic     : min=30.0, max=100.0, avg=74.2
   Quality   : min=42.0, max=100.0, avg=78.8
   Diversity : min=70.0, max=100.0, avg=88.2
   TOTAL     : min=52.455, max=80.152, avg=71.058




   [Call #5 for step 45: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 12/30 (40.0%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 8/30 (26.7%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (34.7s):
   Grammar   : min=33.3, max=90.0, avg=63.1
   Coherence : min=46.7, max=86.0, avg=75.5
   Topic     : min=50.0, max=100.0, avg=85.6
   Quality   : min=66.7, max=100.0, avg=85.3
   Diversity : min=86.7, max=100.0, avg=92.0
   TOTAL     : min=0.000, max=86.061, avg=60.419




   [Call #6 for step 45: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 9/12 valid JSON (75.0%), 0 empty, 3 failed → 9 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (27 total requests):
      ⚡ Groq: 10/27 (37.0%)
      🟢 Openai: 9/27 (33.3%)
      ❓ Deepseek: 8/27 (29.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (35.9s):
   Grammar   : min=0.0, max=90.0, avg=60.9
   Coherence : min=50.0, max=100.0, avg=78.3
   Topic     : min=0.0, max=96.0, avg=34.6
   Quality   : min=80.0, max=100.0, avg=95.0
   Diversity : min=80.0, max=100.0, avg=96.3
   TOTAL     : min=0.000, max=83.182, avg=54.490





🔍 Step 45 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 46: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.1s):
   Grammar   : min=0.0, max=100.0, avg=55.4
   Coherence : min=30.0, max=100.0, avg=74.2
   Topic     : min=55.0, max=100.0, avg=79.6
   Quality   : min=75.0, max=100.0, avg=92.1
   Diversity : min=80.0, max=100.0, avg=90.6
   TOTAL     : min=51.136, max=80.455, avg=70.391




   [Call #2 for step 46: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.7s):
   Grammar   : min=0.0, max=90.0, avg=55.8
   Coherence : min=20.0, max=100.0, avg=80.2
   Topic     : min=20.0, max=100.0, avg=69.2
   Quality   : min=87.5, max=100.0, avg=93.0
   Diversity : min=70.0, max=100.0, avg=87.0
   TOTAL     : min=0.000, max=84.970, avg=63.914




   [Call #3 for step 46: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 14/30 (46.7%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 6/30 (20.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.5s):
   Grammar   : min=10.0, max=100.0, avg=65.5
   Coherence : min=62.5, max=100.0, avg=80.7
   Topic     : min=60.0, max=96.0, avg=83.3
   Quality   : min=80.0, max=100.0, avg=91.2
   Diversity : min=86.7, max=100.0, avg=96.0
   TOTAL     : min=0.000, max=88.061, avg=62.708




   [Call #4 for step 46: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (43.8s):
   Grammar   : min=0.0, max=90.0, avg=46.7
   Coherence : min=20.0, max=100.0, avg=76.9
   Topic     : min=10.0, max=100.0, avg=70.4
   Quality   : min=50.0, max=100.0, avg=74.0
   Diversity : min=70.0, max=100.0, avg=90.3
   TOTAL     : min=49.773, max=86.818, avg=65.073




   [Call #5 for step 46: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (44.3s):
   Grammar   : min=0.0, max=100.0, avg=63.0
   Coherence : min=46.7, max=100.0, avg=78.8
   Topic     : min=35.0, max=100.0, avg=64.1
   Quality   : min=60.0, max=100.0, avg=95.8
   Diversity : min=80.0, max=100.0, avg=91.7
   TOTAL     : min=63.636, max=81.136, avg=71.875




   [Call #6 for step 46: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      🟢 Openai: 10/30 (33.3%)
      ⚡ Groq: 10/30 (33.3%)
      ❓ Deepseek: 10/30 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (32.2s):
   Grammar   : min=20.0, max=100.0, avg=65.8
   Coherence : min=50.0, max=100.0, avg=83.2
   Topic     : min=10.0, max=98.0, avg=59.1
   Quality   : min=75.0, max=100.0, avg=91.8
   Diversity : min=80.0, max=100.0, avg=96.0
   TOTAL     : min=0.000, max=80.000, avg=58.215





🔍 Step 46 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 47: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 17/36 (47.2%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 7/36 (19.4%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (45.7s):
   Grammar   : min=0.0, max=100.0, avg=49.8
   Coherence : min=54.0, max=88.0, avg=72.0
   Topic     : min=30.0, max=100.0, avg=78.1
   Quality   : min=80.0, max=100.0, avg=90.6
   Diversity : min=86.7, max=100.0, avg=97.8
   TOTAL     : min=58.545, max=85.455, avg=70.393




   [Call #2 for step 47: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 13/33 (39.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 9/33 (27.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (35.8s):
   Grammar   : min=0.0, max=90.0, avg=57.2
   Coherence : min=42.5, max=100.0, avg=74.4
   Topic     : min=20.0, max=100.0, avg=64.3
   Quality   : min=40.0, max=100.0, avg=82.5
   Diversity : min=70.0, max=100.0, avg=88.0
   TOTAL     : min=0.000, max=90.152, avg=63.340




   [Call #3 for step 47: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 14/30 (46.7%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 6/30 (20.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.5s):
   Grammar   : min=0.0, max=90.0, avg=53.1
   Coherence : min=52.0, max=96.0, avg=76.6
   Topic     : min=14.0, max=100.0, avg=68.5
   Quality   : min=42.0, max=100.0, avg=75.7
   Diversity : min=86.7, max=100.0, avg=97.5
   TOTAL     : min=0.000, max=83.788, avg=61.797




   [Call #4 for step 47: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.0s):
   Grammar   : min=0.0, max=65.0, avg=37.8
   Coherence : min=60.0, max=100.0, avg=82.7
   Topic     : min=20.0, max=100.0, avg=66.5
   Quality   : min=77.5, max=100.0, avg=94.8
   Diversity : min=80.0, max=100.0, avg=93.3
   TOTAL     : min=0.000, max=78.818, avg=67.122




   [Call #5 for step 47: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (33.6s):
   Grammar   : min=0.0, max=90.0, avg=29.7
   Coherence : min=20.0, max=95.0, avg=65.2
   Topic     : min=20.0, max=100.0, avg=76.1
   Quality   : min=50.0, max=100.0, avg=81.0
   Diversity : min=70.0, max=100.0, avg=83.3
   TOTAL     : min=57.273, max=94.000, avg=68.797




   [Call #6 for step 47: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 10/12 valid JSON (83.3%), 0 empty, 2 failed → 10 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (30 total requests):
      ⚡ Groq: 11/30 (36.7%)
      🟢 Openai: 10/30 (33.3%)
      ❓ Deepseek: 9/30 (30.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (34.2s):
   Grammar   : min=30.0, max=76.7, avg=54.7
   Coherence : min=36.0, max=76.7, avg=62.9
   Topic     : min=56.7, max=100.0, avg=80.6
   Quality   : min=50.0, max=100.0, avg=74.4
   Diversity : min=76.0, max=100.0, avg=93.2
   TOTAL     : min=0.000, max=67.879, avg=49.686





🔍 Step 47 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 48: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.3s):
   Grammar   : min=25.0, max=90.0, avg=52.5
   Coherence : min=50.0, max=100.0, avg=73.9
   Topic     : min=40.0, max=100.0, avg=70.4
   Quality   : min=83.3, max=100.0, avg=98.6
   Diversity : min=80.0, max=100.0, avg=98.3
   TOTAL     : min=50.909, max=93.727, avg=73.413




   [Call #2 for step 48: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (42.9s):
   Grammar   : min=0.0, max=90.0, avg=33.9
   Coherence : min=20.0, max=96.0, avg=64.5
   Topic     : min=25.0, max=100.0, avg=74.1
   Quality   : min=40.0, max=100.0, avg=61.0
   Diversity : min=76.0, max=100.0, avg=98.0
   TOTAL     : min=52.500, max=82.909, avg=63.928




   [Call #3 for step 48: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 14/33 (42.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 8/33 (24.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.9s):
   Grammar   : min=28.0, max=90.0, avg=69.2
   Coherence : min=20.0, max=100.0, avg=66.9
   Topic     : min=46.0, max=100.0, avg=78.6
   Quality   : min=10.0, max=100.0, avg=66.4
   Diversity : min=70.0, max=100.0, avg=91.8
   TOTAL     : min=0.000, max=81.932, avg=64.275




   [Call #4 for step 48: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.0s):
   Grammar   : min=0.0, max=70.0, avg=51.0
   Coherence : min=40.0, max=100.0, avg=70.6
   Topic     : min=5.0, max=98.0, avg=61.7
   Quality   : min=50.0, max=100.0, avg=84.1
   Diversity : min=86.7, max=100.0, avg=98.8
   TOTAL     : min=0.000, max=72.045, avg=58.126




   [Call #5 for step 48: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (33.4s):
   Grammar   : min=50.0, max=100.0, avg=76.2
   Coherence : min=20.0, max=100.0, avg=73.9
   Topic     : min=20.0, max=100.0, avg=74.4
   Quality   : min=75.0, max=100.0, avg=96.2
   Diversity : min=70.0, max=100.0, avg=91.4
   TOTAL     : min=71.818, max=89.886, avg=80.306




   [Call #6 for step 48: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 13/36 (36.1%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 11/36 (30.6%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.8s):
   Grammar   : min=0.0, max=100.0, avg=24.2
   Coherence : min=27.5, max=100.0, avg=64.6
   Topic     : min=60.0, max=100.0, avg=85.3
   Quality   : min=42.0, max=100.0, avg=68.1
   Diversity : min=80.0, max=100.0, avg=94.0
   TOTAL     : min=52.727, max=78.409, avg=63.782





🔍 Step 48 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 49: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 16/36 (44.4%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 8/36 (22.2%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (39.7s):
   Grammar   : min=0.0, max=72.0, avg=33.4
   Coherence : min=20.0, max=84.0, avg=60.1
   Topic     : min=50.0, max=100.0, avg=77.8
   Quality   : min=50.0, max=100.0, avg=85.6
   Diversity : min=86.7, max=100.0, avg=96.7
   TOTAL     : min=54.773, max=87.333, avg=65.484




   [Call #2 for step 49: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (34.3s):
   Grammar   : min=0.0, max=100.0, avg=53.3
   Coherence : min=60.0, max=100.0, avg=73.8
   Topic     : min=30.0, max=100.0, avg=77.1
   Quality   : min=75.0, max=100.0, avg=91.0
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=46.364, max=89.091, avg=73.608




   [Call #3 for step 49: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      🟢 Openai: 12/36 (33.3%)
      ⚡ Groq: 12/36 (33.3%)
      ❓ Deepseek: 12/36 (33.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (36.1s):
   Grammar   : min=0.0, max=90.0, avg=46.2
   Coherence : min=35.0, max=100.0, avg=70.2
   Topic     : min=10.0, max=55.0, avg=26.9
   Quality   : min=50.0, max=100.0, avg=72.9
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=54.091, max=75.455, avg=64.886




   [Call #4 for step 49: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 15/36 (41.7%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 9/36 (25.0%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (37.3s):
   Grammar   : min=0.0, max=100.0, avg=43.0
   Coherence : min=60.0, max=100.0, avg=82.0
   Topic     : min=0.0, max=100.0, avg=44.8
   Quality   : min=75.0, max=100.0, avg=93.8
   Diversity : min=100.0, max=100.0, avg=100.0
   TOTAL     : min=49.773, max=90.341, avg=70.447




   [Call #5 for step 49: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 11/12 valid JSON (91.7%), 0 empty, 1 failed → 11 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (33 total requests):
      ⚡ Groq: 12/33 (36.4%)
      🟢 Openai: 11/33 (33.3%)
      ❓ Deepseek: 10/33 (30.3%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (30.0s):
   Grammar   : min=0.0, max=90.0, avg=46.2
   Coherence : min=25.0, max=98.0, avg=58.4
   Topic     : min=0.0, max=100.0, avg=41.5
   Quality   : min=75.0, max=100.0, avg=79.5
   Diversity : min=86.7, max=100.0, avg=97.6
   TOTAL     : min=0.000, max=77.727, avg=62.665




   [Call #6 for step 49: scoring 12 completions]

⏳ Step 1/3: Parsing 12 JSON completions...


                                                    

   Parse stats: 12/12 valid JSON (100.0%), 0 empty, 0 failed → 12 scorable
⏳ Step 2/3: Scoring 12 completions with batched reward function...





   📊 Model Usage Distribution (36 total requests):
      ⚡ Groq: 14/36 (38.9%)
      🟢 Openai: 12/36 (33.3%)
      ❓ Deepseek: 10/36 (27.8%)
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                           


🎯 Reward calculation complete (38.7s):
   Grammar   : min=10.0, max=100.0, avg=57.6
   Coherence : min=43.3, max=100.0, avg=66.3
   Topic     : min=50.0, max=100.0, avg=75.9
   Quality   : min=50.0, max=100.0, avg=79.2
   Diversity : min=80.0, max=100.0, avg=94.4
   TOTAL     : min=57.045, max=77.955, avg=66.676


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



📊 Running validation at step 50...

  Generating for validation sample 1/10...

🔍 Step 49 TOTAL: 6 reward calls, 72 completions scored
   [Call #1 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 2/10...




   [Call #2 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 3/10...




   [Call #3 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 4/10...




   [Call #4 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 5/10...




   [Call #5 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 6/10...




   [Call #6 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 7/10...




   [Call #7 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 8/10...




   [Call #8 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 9/10...




   [Call #9 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00

  Generating for validation sample 10/10...




   [Call #10 for step 0: scoring 1 completions]

⏳ Step 1/3: Parsing 1 JSON completions...




   Parse stats: 0/1 valid JSON (0.0%), 0 empty, 1 failed → 0 scorable
⏳ Step 2/3: Scoring 1 completions with batched reward function...

      0 models in cooldown, 0 at daily limit
      💡 TIP: Likely hitting RPM (requests/minute) limits
      💡 Solution: Reduce batch size or add request pacing
⏳ Step 3/3: Computing CPU-bound rewards and aggregating results...


                                                          


🎯 Reward calculation complete (0.0s):
   TOTAL     : min=0.000, max=0.000, avg=0.000
    Rewards: avg=0.00, max=0.00, min=0.00





📊 Validation Results at Step 50:
   Average Reward: 0.00
   Max Reward:     0.00
   Min Reward:     0.00
   Saved to: models/TeacherPet_italian_grpo/validation_results/validation_step_50.json
   ➡️ Change from step 25: +0.00


🎉 TRAINING COMPLETE

--- Saving Model ---
Output directory: ./models/TeacherPet_italian_grpo
✅ Model saved successfully.
