In [None]:
# @title Reward Model Training
'''
===========================================================================================================
This training script was originally developed and optimized for execution within Google Colab,
relying on Colab-specific authentication mechanisms and other environment-dependent utilities.
As a result, the initial implementation included secret-based Hugging Face login via Colab's userdata API.
While these components streamlined experimentation within a Colab workflow, they also made the script
less portable and harder to reproduce in general compute environments such as local machines, cloud VMs,
or managed training clusters.

You can refactor the current version to remove the above-mentioned Colab-specific assumptions,
replacing them with environment-agnostic paths, standard Hugging Face authentication, and fully
general dataset/model loading logic, so that the script runs consistently anywhere while retaining
the same behavior and training methodology.
===========================================================================================================
'''
# ==========================================
# 1. Install Dependencies
# ==========================================
# NOTE: the `!pip` line below is IPython/Jupyter shell syntax, so this cell only
# runs inside a notebook kernel, not as a plain `python script.py` invocation.
# `-q` keeps pip quiet; `-U` upgrades every listed package to its latest release,
# i.e. the installed versions are not pinned.
print("‚è≥ Installing libraries...")
!pip install -q -U transformers datasets trl accelerate huggingface_hub
'''
The training was conducted using the following library versions at the time:
Accelerate: 0.28.0
Hugging Face Hub: 0.17.1
TRL: 0.25.1
Transformers: 4.57.3
PyTorch: 2.9.0+cu126
Datasets: 4.4.1
Tokenizers: 0.22.1
'''
import torch
import torch.nn as nn
import transformers
import datasets
import trl
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from trl import RewardTrainer, RewardConfig
from huggingface_hub import login
from google.colab import userdata

# ==========================================
# 2. Setup & Login
# ==========================================
# Authentication is attempted in this order:
#   1. the Colab secret named 'HF_TOKEN' (Key icon on the left in Colab),
#   2. the HF_TOKEN environment variable (works on headless machines),
#   3. the interactive prompt of `login()`.
import os

print("\nüîë Logging in...")
try:
    # Ensure you have a secret named 'HF_TOKEN' in Colab (Key icon on left)
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token, add_to_git_credential=True)
    print("‚úÖ Logged in via Colab Secret.")
except Exception as colab_login_error:
    # Anything can land here (missing secret, denied notebook access, rejected
    # token, ...), so report the actual error type instead of guessing.
    failure_kind = type(colab_login_error).__name__
    env_token = os.environ.get("HF_TOKEN")
    if env_token:
        print(f"Colab secret login failed ({failure_kind}). Using the HF_TOKEN environment variable.")
        login(token=env_token, add_to_git_credential=True)
    else:
        print("‚ö†Ô∏è Secret 'HF_TOKEN' not found. Falling back to manual input.")
        print(f"Colab secret login failed with: {failure_kind}")
        login(add_to_git_credential=True)

# ==========================================
# 3. Configuration
# ==========================================
# Model & Data
# MODEL_NAME is the Hub id used to load both the tokenizer and the model.
MODEL_NAME = "Qwen/Qwen3-0.6B-Base"
DATASET_NAME = "Jennny/helpsteer2-helpfulness-preference" # this is a variant of the HelpSteer2 dataset having only the helpfulness attribute
# REPO_NAME is used both as the local training output directory and as the Hub repo id.
REPO_NAME = "your-username/qwen3-0.6b-RM" # model name is arbitrary

# All hyperparameters can be modified as suitable
MAX_LENGTH = 2048 # token limit used for tokenizer truncation and passed to the training config
BATCH_SIZE = 16 # per-device batch size, used for both training and evaluation
GRAD_ACCUMULATION = 2 # Effective Batch Size = 32 per device (BATCH_SIZE * GRAD_ACCUMULATION)
LEARNING_RATE = 5e-6

# ==========================================
# 4. Dataset Loading & Filtering
# ==========================================
print(f"\nwv Loading dataset: {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, split="train")

# Identify the score column of the *chosen* response (its name varies by dataset version)
score_col = "chosen_score"
if score_col not in dataset.column_names:
    candidates = [c for c in dataset.column_names if "score" in c or "rating" in c]
    # Stable sort: chosen-side columns first, so the filter never keys on the
    # rejected response's score just because that column happens to come first.
    candidates.sort(key=lambda c: "chosen" not in c)
    if candidates:
        score_col = candidates[0]

# We filter out low-quality data so the model only learns from "helpful" examples (Score >= 3)
# The column is checked up front instead of relying on a KeyError raised from
# inside `filter`; rows with a missing (None) score are dropped rather than
# aborting the whole run with a TypeError.
if score_col in dataset.column_names:
    original_len = len(dataset)
    dataset = dataset.filter(lambda x: x[score_col] is not None and x[score_col] >= 3)
    print(f"‚úÖ Filtered dataset (Score >= 3): {original_len} -> {len(dataset)} samples")
else:
    print("‚ö†Ô∏è Warning: Score column not found. Skipping filter.")

# ==========================================
# 5. Robust Preprocessing
# ==========================================
print("\n‚öôÔ∏è Preprocessing...")
# NOTE(review): trust_remote_code=True permits code shipped with the model repo
# to run locally - confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse EOS as the padding token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    """Turn a batch of raw preference pairs into tokenized chosen/rejected columns.

    Args:
        examples: A batch (mapping of column name -> list) that provides the
            ``"chosen"`` and ``"rejected"`` columns. Each entry is either a
            conversation (list of message dicts with a ``"content"`` key, where
            index 0 is read as the prompt and index 1 as the response) or any
            other value, which is converted with ``str()`` and used with an
            empty prompt.

    Returns:
        A dict of equally long lists with the keys ``input_ids_chosen``,
        ``attention_mask_chosen``, ``input_ids_rejected``,
        ``attention_mask_rejected`` plus the rendered ``chosen`` / ``rejected``
        text strings.
    """

    def _render_pair(chosen_item, rejected_item):
        # Conversations carry the prompt in message 0 and the answer in message 1;
        # anything else is treated as an already-flat response without a prompt.
        if isinstance(chosen_item, list):
            prompt = chosen_item[0]['content']
            preferred = chosen_item[1]['content']
            dispreferred = rejected_item[1]['content']
        else:
            prompt = ""
            preferred = str(chosen_item)
            dispreferred = str(rejected_item)

        # Manual User/Assistant template for the Base model. No EOS is appended
        # here (the original author notes that RewardTrainer adds it).
        return (
            f"User: {prompt}\n\nAssistant: {preferred}",
            f"User: {prompt}\n\nAssistant: {dispreferred}",
        )

    output_columns = (
        "input_ids_chosen",
        "attention_mask_chosen",
        "input_ids_rejected",
        "attention_mask_rejected",
        "chosen",    # kept as text: TRL needs these strings for its sanity checks
        "rejected",  # kept as text: TRL needs these strings for its sanity checks
    )
    batch = {column: [] for column in output_columns}

    for chosen_raw, rejected_raw in zip(examples["chosen"], examples["rejected"]):
        chosen_text, rejected_text = _render_pair(chosen_raw, rejected_raw)

        encoded_chosen = tokenizer(chosen_text, truncation=True, max_length=MAX_LENGTH)
        encoded_rejected = tokenizer(rejected_text, truncation=True, max_length=MAX_LENGTH)

        row = {
            "input_ids_chosen": encoded_chosen["input_ids"],
            "attention_mask_chosen": encoded_chosen["attention_mask"],
            "input_ids_rejected": encoded_rejected["input_ids"],
            "attention_mask_rejected": encoded_rejected["attention_mask"],
            # Raw text strings for TRL (avoids KeyError: 'chosen')
            "chosen": chosen_text,
            "rejected": rejected_text,
        }
        for column, value in row.items():
            batch[column].append(value)

    return batch

# Apply processing and remove old columns to avoid schema conflicts, then split
processed_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
)
# 10% of the examples are held out for the in-between validation loss check.
# The seed is fixed so the train/validation partition is identical on every run;
# without it the split is reshuffled randomly each time and results are not comparable.
split_dataset = processed_dataset.train_test_split(test_size=0.1, seed=42)

# ==========================================
# 6. Model Loading & Initialization
# ==========================================
print(f"\nüß† Loading Model: {MODEL_NAME} (FP32)...")
# With a single label the classification head emits one scalar score per sequence
model_load_options = {
    "num_labels": 1,
    "torch_dtype": torch.float32,  # FP32 for stability
    "trust_remote_code": True,
    "device_map": "auto",
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **model_load_options)
# Keep the model's padding id in sync with the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

# Zero out every parameter whose name mentions the score/classifier head.
# Per the original note, this is meant to start training from a neutral
# reward (0.0) and avoid instability in the first steps.
print("‚öñÔ∏è Stabilizing: Zero-initializing score weights...")
head_parameters = [
    parameter
    for parameter_name, parameter in model.named_parameters()
    if any(marker in parameter_name for marker in ("score", "classifier"))
]
for parameter in head_parameters:
    nn.init.constant_(parameter, 0.0)

# ==========================================
# 7. Training
# ==========================================
# All RewardConfig settings are collected in one mapping, grouped by concern,
# and unpacked into the constructor.
reward_config_options = {
    # --- output location ---
    "output_dir": REPO_NAME,
    # --- batching ---
    "per_device_train_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": GRAD_ACCUMULATION,
    "per_device_eval_batch_size": BATCH_SIZE,
    # --- optimization schedule ---
    "num_train_epochs": 1,
    "learning_rate": LEARNING_RATE,
    "warmup_ratio": 0.1,
    # --- logging / evaluation / checkpointing cadence (in steps) ---
    "logging_steps": 20,
    "eval_strategy": "steps",
    "eval_steps": 50,
    "save_strategy": "steps",
    "save_steps": 100,
    # --- sequence length and precision ---
    "max_length": MAX_LENGTH,
    "bf16": False,  # FP32
    "fp16": False,  # FP32
    # CRITICAL: Prevents Trainer from dropping 'chosen'/'rejected' text columns
    "remove_unused_columns": False,
    # --- Hub upload and reporting ---
    "push_to_hub": True,
    "hub_model_id": REPO_NAME,
    "report_to": "none",
}
training_args = RewardConfig(**reward_config_options)

trainer_options = {
    "model": model,
    # 'processing_class' replaces 'tokenizer' in newer TRL versions (v0.12+)
    "processing_class": tokenizer,
    "args": training_args,
    "train_dataset": split_dataset["train"],
    "eval_dataset": split_dataset["test"],
}
trainer = RewardTrainer(**trainer_options)

print("\nüöÄ Starting Training...")
trainer.train()

# ==========================================
# 8. Push to Hub
# ==========================================
print("\n‚òÅÔ∏è Pushing final model to Hub...")
trainer.push_to_hub()
print(f"‚úÖ DONE! Model uploaded to: https://huggingface.co/{REPO_NAME}")