In [None]:
# Step 2: Train reward model

Train a model to score how appropriate the strictness level is for each response

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from peft import PeftModel
import json
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import wandb
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
import seaborn as sns

# Device configuration
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")


class RewardModel(nn.Module):

  from .autonotebook import tqdm as notebook_tqdm


Using device: mps
Reward model class defined successfully.


## Data Loading and Preprocessing


References:
- Combining regression and BT: [HelpSteer2-Preference](https://arxiv.org/html/2410.01257v1)

In [2]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Paths
from pathlib import Path

# Prefer local SFT directory saved by Step 1
primary = Path("./strictbot_sft_model")
secondary = Path("FineTuneLLms/StrictBot/strictbot_sft_model")

if primary.exists():
    sft_dir = primary.resolve()
elif secondary.exists():
    sft_dir = secondary.resolve()
else:
    raise FileNotFoundError(
        f"SFT model directory not found. Checked: {primary.resolve()} and {secondary.resolve()}\n"
        "Run Step 1 first and ensure the model was saved to 'strictbot_sft_model'."
    )

reward_data_path = "enhanced_reward_model_dataset.json"

print(f"Loading SFT model for reward base from: {sft_dir}")

sft_dir_path = Path(sft_dir)
config_path = sft_dir_path / "config.json"
adapter_config_path = sft_dir_path / "adapter_config.json"

base_model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # must match Step 1 base
if (sft_dir_path / "tokenizer_config.json").exists():
    tokenizer = AutoTokenizer.from_pretrained(str(sft_dir_path), local_files_only=True)
else:
    try:
        tokenizer = AutoTokenizer.from_pretrained(base_model_id, local_files_only=True)
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(base_model_id)

if config_path.exists():
    # Full merged model was saved in Step 1
    base_model = AutoModelForCausalLM.from_pretrained(str(sft_dir_path), local_files_only=True)
elif adapter_config_path.exists():
    print("No config.json found; detected adapters. Loading base model from local cache and attaching adapters...")
    try:
        base_model = AutoModelForCausalLM.from_pretrained(base_model_id, local_files_only=True)
    except Exception:
        base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
    from peft import PeftModel
    base_model = PeftModel.from_pretrained(base_model, str(sft_dir_path))
else:
    raise FileNotFoundError(
        "SFT directory does not contain a full model or PEFT adapters.\n"
        f"Missing both: {config_path} and {adapter_config_path}.\n"
        "Re-run Step 1 and ensure the model is saved (either merged full model or adapters)."
    )

# Ensure pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    base_model.config.pad_token_id = base_model.config.eos_token_id

base_model.eval()

with open(reward_data_path, 'r', encoding='utf-8') as f:
    raw = json.load(f)

# Create dataset of prompt+response and label
examples = []
for item in raw:
    text = f"<|user|> {item['input']} <|end|>\n<|assistant|> {item['output']} <|end|>"
    examples.append({
        "text": text,
        "label": float(item["strictness_score"])
    })

hf_ds = Dataset.from_list(examples)

print("Tokenizing...")
hf_ds = hf_ds.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=512), batched=True)
hf_ds.set_format(type='torch', columns=["input_ids", "attention_mask", "label"])

Loading SFT model for reward base from: /Users/akshayapsingi/Projects/RL-Agents/FineTuneLLms/StrictBot/strictbot_sft_model
No config.json found; detected adapters. Loading base model from local cache and attaching adapters...
Tokenizing...


Map: 100%|██████████| 278/278 [00:00<00:00, 8281.43 examples/s]


## Train Reward Head (Regression)



In [3]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import os

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Device:", device)

hidden_size = getattr(base_model.config, "hidden_size", None) or getattr(base_model.config, "n_embd", 768)

class RMHead(nn.Module):
    def __init__(self, in_features: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 512), nn.ReLU(), nn.Dropout(0.1),
            nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.1),
            nn.Linear(256, 1), nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x).squeeze(-1)

class RewardModel(nn.Module):
    def __init__(self, base_lm: AutoModelForCausalLM, in_features: int):
        super().__init__()
        self.base = base_lm
        self.head = RMHead(in_features)
        for p in self.base.parameters():
            p.requires_grad = False
    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        last_hidden = outputs.hidden_states[-1]
        if attention_mask is None:
            seq_len = input_ids.new_full((input_ids.size(0),), input_ids.size(1)-1)
        else:
            seq_len = attention_mask.sum(dim=1) - 1
        pooled = last_hidden[torch.arange(last_hidden.size(0)), seq_len]
        scores = self.head(pooled)
        loss = None
        if labels is not None:
            loss = F.mse_loss(scores, labels.float())
        return {"loss": loss, "reward_scores": scores}

rm = RewardModel(base_model.to(device), hidden_size).to(device)

# Simple train/val split
split = int(0.9 * len(hf_ds))
train_ds = hf_ds.select(range(split))
val_ds = hf_ds.select(range(split, len(hf_ds)))

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=16)

optimizer = AdamW(rm.parameters(), lr=1e-4)
num_epochs = 2
best_val = float('inf')

for epoch in range(num_epochs):
    rm.train()
    total = 0.0
    for batch in tqdm(train_loader, desc=f"RM Train Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        out = rm(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
        loss = out["loss"]
        loss.backward()
        optimizer.step()
        total += loss.item()
    avg_train = total / max(1, len(train_loader))

    # Eval
    rm.eval(); vtotal = 0.0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            out = rm(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
            vtotal += out["loss"].item()
    avg_val = vtotal / max(1, len(val_loader))
    print(f"Epoch {epoch+1}: train_mse={avg_train:.4f} val_mse={avg_val:.4f}")
    if avg_val < best_val:
        best_val = avg_val
        os.makedirs("strictbot_reward_model", exist_ok=True)
        torch.save(rm.state_dict(), os.path.join("strictbot_reward_model", "rm.pt"))
        with open(os.path.join("strictbot_reward_model", "meta.json"), "w") as f:
            f.write(json.dumps({"base": str(sft_dir), "hidden_size": hidden_size}, indent=2))
        tokenizer.save_pretrained("strictbot_reward_model")
        print("Saved best reward model.")

# Quick check on a sample
sample = hf_ds[0]
with torch.no_grad():
    out = rm(input_ids=sample["input_ids"].unsqueeze(0).to(device), attention_mask=sample["attention_mask"].unsqueeze(0).to(device))
print("Sample score:", out["reward_scores"].item())

Device: mps


RM Train Epoch 1/2: 100%|██████████| 32/32 [01:37<00:00,  3.03s/it]


Epoch 1: train_mse=0.0378 val_mse=0.0076
Saved best reward model.


RM Train Epoch 2/2: 100%|██████████| 32/32 [01:39<00:00,  3.11s/it]


Epoch 2: train_mse=0.0093 val_mse=0.0048
Saved best reward model.
Sample score: 0.24470818042755127
