In [None]:
!pip install torch transformers peft accelerate datasets tqdm scikit-learn


In [None]:
import os
import math
import random
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from transformers import AutoTokenizer, AutoModel
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# ---- Runtime -----------------------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Data locations ------------------------------------------------------------
DATA_DIR = "lmsys-chatbot-arena"  # folder that holds train.csv / test.csv
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
TEST_CSV  = os.path.join(DATA_DIR, "test.csv")

# ---- Model / optimisation ------------------------------------------------------
BASE_MODEL_NAME = "roberta-large"   # can downgrade to "roberta-base" if VRAM explodes
# RoBERTa checkpoints only ship position embeddings for 512 tokens
# (max_position_embeddings = 514 including the padding offset). Anything longer
# indexes past the embedding table and crashes the forward pass, so the
# sequence length must not exceed 512 for this backbone.
MAX_LEN = 512
BATCH_SIZE = 2                      # small micro-batch; gradient accumulation makes up for it
GRAD_ACCUM_STEPS = 8                # effective batch size = BATCH_SIZE * GRAD_ACCUM_STEPS = 16
LR = 2e-5
EPOCHS = 2
SEED = 42
N_FOLDS = 5

# ---- Outputs -------------------------------------------------------------------
OUT_DIR = "reward_teacher_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# ---- Reproducibility: seed every RNG the notebook touches ----------------------
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)


In [None]:
# Load the training CSV; later cells read this frame and add columns to it
# (label, len_a/len_b, prompt_group, fold).
df = pd.read_csv(TRAIN_CSV)

# Collapse the winner indicator columns into a single class id in {0, 1, 2}.
def row_label(r):
    """Return the preference class of one row.

    0 -> response A preferred (``winner_model_a == 1``)
    1 -> response B preferred (``winner_model_b == 1``)
    2 -> anything else, treated as a tie

    ``r`` is any mapping-like row exposing ``winner_model_a`` and
    ``winner_model_b``; A takes precedence if both flags are set.
    """
    if r["winner_model_a"] == 1:
        return 0
    return 1 if r["winner_model_b"] == 1 else 2

# Single integer target column: 0 = A preferred, 1 = B preferred, 2 = tie.
df["label"] = df.apply(row_label, axis=1)

# Character lengths of both responses, kept around for later analysis.
df["len_a"] = df["response_a"].astype(str).str.len()
df["len_b"] = df["response_b"].astype(str).str.len()

# Groups for CV: rows sharing a prompt must land in the same fold.
# The built-in hash() is salted per Python process for strings, so it would
# produce different group ids (and therefore different folds) on every kernel
# restart. pandas' hashing utility uses a fixed key and is stable across runs.
df["prompt_group"] = (
    pd.util.hash_pandas_object(df["prompt"].astype(str), index=False) % (10**9)
).astype("int64")

print(df[["label","len_a","len_b"]].head())
print("class counts:\n", df["label"].value_counts())


In [None]:
class PairwisePrefDataset(Dataset):
    """Dataset yielding one tokenized (prompt, response A) / (prompt, response B) pair per row.

    Parameters
    ----------
    df_fold : pandas.DataFrame
        Must provide the columns ``prompt``, ``response_a``, ``response_b``,
        ``label`` and ``id``. The index is reset, so any row subset works.
    tokenizer : callable
        Tokenizer invoked as ``tokenizer(text, truncation=..., max_length=...,
        padding=..., return_tensors="pt")`` and returning ``input_ids`` and
        ``attention_mask`` tensors with a leading batch dimension of 1.
    max_len : int
        Requested padded/truncated sequence length. If the tokenizer advertises
        a smaller ``model_max_length``, that smaller value is used instead
        (with a warning) so the backbone is never fed more positions than it
        has embeddings for.
    """

    def __init__(self, df_fold, tokenizer, max_len=512):
        self.df = df_fold.reset_index(drop=True)
        self.tok = tokenizer

        # Tokenizers with no known limit either lack the attribute or report a
        # huge sentinel; in both cases the requested max_len is left untouched.
        tok_limit = getattr(tokenizer, "model_max_length", None)
        if isinstance(tok_limit, int) and 0 < tok_limit < max_len:
            import warnings
            warnings.warn(
                f"max_len={max_len} exceeds tokenizer.model_max_length={tok_limit}; "
                f"clamping to {tok_limit}."
            )
            max_len = tok_limit
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def encode_pair(self, prompt, resp):
        """Tokenize one prompt/response pair as a single text, padded to ``self.max_len``."""
        # Prompt and answer are joined into one plain-text sequence; truncation
        # cuts from the end of that combined text.
        text = f"Prompt: {prompt}\nAnswer:\n{resp}"
        return self.tok(
            text,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the tensors for row ``idx``: both encodings, the label and the row id."""
        row = self.df.iloc[idx]
        enc_a = self.encode_pair(row["prompt"], row["response_a"])
        enc_b = self.encode_pair(row["prompt"], row["response_b"])
        label = row["label"]
        return {
            # squeeze(0) drops the batch dimension added by return_tensors="pt"
            "input_ids_a": enc_a["input_ids"].squeeze(0),
            "attention_mask_a": enc_a["attention_mask"].squeeze(0),
            "input_ids_b": enc_b["input_ids"].squeeze(0),
            "attention_mask_b": enc_b["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
            "row_id": torch.tensor(row["id"], dtype=torch.long),
        }


In [None]:
from transformers import AutoModel
import torch.nn as nn
from peft import LoraConfig, get_peft_model

class RewardScorer(nn.Module):
    """Encoder backbone plus a linear head that emits one scalar score per sequence."""

    def __init__(self, base_model_name):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(base_model_name)
        # The head maps the backbone's hidden width down to a single scalar.
        self.score_head = nn.Linear(self.backbone.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Score a batch of token sequences; returns a tensor of shape ``[batch]``."""
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 (RoBERTa's <s> token) as the
        # sequence summary: [batch, seq, hidden] -> [batch, hidden].
        first_token = encoded.last_hidden_state[:, 0, :]
        # [batch, hidden] -> [batch, 1] -> [batch]
        return self.score_head(first_token).squeeze(-1)

# build model + apply LoRA
def build_lora_reward_model(base_model_name):
    """Create a ``RewardScorer`` and wrap it with LoRA adapters.

    Parameters
    ----------
    base_model_name : str
        Hugging Face checkpoint name/path for the encoder backbone.

    Returns
    -------
    The PEFT-wrapped model. It is called exactly like ``RewardScorer``:
    ``model(input_ids, attention_mask) -> scores``.
    """
    base = RewardScorer(base_model_name)

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        # Module-name suffixes that receive LoRA adapters (attention projections
        # and the "dense" feed-forward / output layers of the RoBERTa encoder).
        target_modules=["query", "key", "value", "dense", "out_proj"],
        lora_dropout=0.05,
        bias="none",
        # get_peft_model freezes every parameter that is not a LoRA weight.
        # The scoring head is freshly initialised, so it must be listed here to
        # stay trainable and to be included when the adapter is saved.
        modules_to_save=["score_head"],
        # No task_type on purpose: RewardScorer is a plain nn.Module, not a
        # Hugging Face *ForSequenceClassification model. The SEQ_CLS wrapper's
        # forward injects HF-only keyword arguments (inputs_embeds, labels,
        # return_dict, ...) that RewardScorer.forward does not accept, whereas
        # the generic wrapper forwards the call arguments unchanged.
    )
    lora_model = get_peft_model(base, peft_config)
    return lora_model

# Shared tokenizer used by every dataset built in later cells.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# NOTE(review): train_one_fold and get_model_for_fold each build their own
# model, so this global instance looks unused by the visible cells while still
# occupying device memory -- confirm before removing.
model = build_lora_reward_model(BASE_MODEL_NAME)
model = model.to(DEVICE)

print("Model ready on", DEVICE)


In [None]:
def pref_loss(scores_a, scores_b, labels, margin=0.5, tie_band=0.2):
    """Hinge-style preference loss over a batch of score pairs.

    Parameters
    ----------
    scores_a, scores_b : tensors of shape ``[batch]``
    labels : tensor of shape ``[batch]`` with values in {0, 1, 2}
    margin : required score gap in favour of the winning response
    tie_band : maximum allowed ``|sA - sB|`` for a tie

    Per class the hinge violations are averaged over the samples of that class;
    the result is the unweighted mean of the class averages that are present in
    the batch. If no sample carries label 0, 1 or 2, a zero scalar that
    requires grad is returned.
    """
    # (class id, violation) -- the violation is positive when the constraint is broken:
    #   0: A wins  -> want sA >= sB + margin
    #   1: B wins  -> want sB >= sA + margin
    #   2: tie     -> want |sA - sB| <= tie_band
    violations = (
        (0, scores_b + margin - scores_a),
        (1, scores_a + margin - scores_b),
        (2, torch.abs(scores_a - scores_b) - tie_band),
    )

    per_class = []
    for cls, violation in violations:
        selected = labels == cls
        if selected.any():
            per_class.append(torch.clamp(violation[selected], min=0).mean())

    if not per_class:
        return torch.tensor(0.0, device=scores_a.device, requires_grad=True)
    return torch.stack(per_class).mean()


In [None]:
# Assign each row the id of the fold in which it serves as validation data.
# GroupKFold keeps all rows sharing a prompt_group inside one fold.
gkf = GroupKFold(n_splits=N_FOLDS)
folds = []
# va_idx holds row positions; df keeps the default 0..N-1 index from read_csv,
# so positions and index labels coincide.
fold_of_row = np.full(len(df), -1, dtype=int)
for fold_id, (tr_idx, va_idx) in enumerate(gkf.split(df, groups=df["prompt_group"])):
    fold_of_row[va_idx] = fold_id
df["fold"] = fold_of_row

def train_one_fold(fold_id):
    """Train a fresh LoRA reward model on all folds except ``fold_id`` and score that fold.

    Steps
    -----
    1. Split the global ``df`` on its ``fold`` column.
    2. Train for ``EPOCHS`` epochs with gradient accumulation, linear warmup
       and cosine decay.
    3. Score the held-out fold with the trained model.
    4. Save the model state dict and the held-out scores under
       ``OUT_DIR/fold_<fold_id>/``.

    Returns
    -------
    pandas.DataFrame with columns ``id``, ``score_a``, ``score_b``, ``label``
    and ``fold`` for the held-out rows.
    """
    print(f"\n===== FOLD {fold_id} =====")
    df_tr = df[df["fold"] != fold_id].reset_index(drop=True)
    df_va = df[df["fold"] == fold_id].reset_index(drop=True)

    train_ds = PairwisePrefDataset(df_tr, tokenizer, max_len=MAX_LEN)
    va_ds    = PairwisePrefDataset(df_va, tokenizer, max_len=MAX_LEN)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    va_loader    = DataLoader(va_ds, batch_size=BATCH_SIZE, shuffle=False)

    model = build_lora_reward_model(BASE_MODEL_NAME).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    # Linear warmup followed by cosine decay, counted in optimizer steps.
    # ceil() accounts for the trailing partial accumulation window of each epoch.
    n_batches = len(train_loader)
    total_steps = math.ceil(n_batches / GRAD_ACCUM_STEPS) * EPOCHS
    warmup_steps = min(100, total_steps // 10)

    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        # cosine decay after warmup
        progress = (step - warmup_steps) / max(1, (total_steps - warmup_steps))
        return 0.5 * (1 + math.cos(math.pi * progress))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    model.train()
    for epoch in range(EPOCHS):
        optimizer.zero_grad()
        running_loss = 0.0
        for step, batch in enumerate(tqdm(train_loader, desc=f"Fold {fold_id} Epoch {epoch}")):
            input_ids_a = batch["input_ids_a"].to(DEVICE)
            attn_a      = batch["attention_mask_a"].to(DEVICE)
            input_ids_b = batch["input_ids_b"].to(DEVICE)
            attn_b      = batch["attention_mask_b"].to(DEVICE)
            labels      = batch["label"].to(DEVICE)

            # The same model scores both responses; the loss compares the two scores.
            scores_a = model(input_ids_a, attn_a)
            scores_b = model(input_ids_b, attn_b)

            loss = pref_loss(scores_a, scores_b, labels)
            # Scale so the accumulated gradient is an average over the window.
            loss = loss / GRAD_ACCUM_STEPS
            loss.backward()
            running_loss += loss.item()

            # Step at the end of every accumulation window AND on the last batch
            # of the epoch. Without the second condition the gradients of a
            # trailing partial window would be thrown away by the zero_grad()
            # at the start of the next epoch (or never applied at all), and the
            # scheduler would finish short of total_steps.
            window_full = (step + 1) % GRAD_ACCUM_STEPS == 0
            last_batch = (step + 1) == n_batches
            if window_full or last_batch:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

        # running_loss sums the already-scaled losses, hence "(divided)".
        print(f"Epoch {epoch} avg loss (divided): {running_loss/len(train_loader):.4f}")

    # after training, eval on val fold to get a feel
    model.eval()
    all_scores = []
    all_labels = []
    all_ids    = []
    with torch.no_grad():
        for batch in tqdm(va_loader, desc=f"Eval fold {fold_id}"):
            input_ids_a = batch["input_ids_a"].to(DEVICE)
            attn_a      = batch["attention_mask_a"].to(DEVICE)
            input_ids_b = batch["input_ids_b"].to(DEVICE)
            attn_b      = batch["attention_mask_b"].to(DEVICE)
            labels      = batch["label"].to(DEVICE)
            row_ids     = batch["row_id"].cpu().numpy()

            scores_a = model(input_ids_a, attn_a).cpu().numpy()
            scores_b = model(input_ids_b, attn_b).cpu().numpy()

            all_scores.append(np.stack([scores_a, scores_b], axis=1))  # shape [batch,2]
            all_labels.append(labels.cpu().numpy())
            all_ids.append(row_ids)

    all_scores = np.concatenate(all_scores, axis=0)   # [N_val,2]
    all_labels = np.concatenate(all_labels, axis=0)   # [N_val]
    all_ids    = np.concatenate(all_ids, axis=0)      # [N_val]

    # save checkpoint
    fold_dir = os.path.join(OUT_DIR, f"fold_{fold_id}")
    os.makedirs(fold_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(fold_dir, "reward_teacher_roberta.pt"))

    # we also save the val fold scores for later calibration/logit building
    oof_df = pd.DataFrame({
        "id": all_ids,
        "score_a": all_scores[:,0],
        "score_b": all_scores[:,1],
        "label": all_labels,
        "fold": fold_id
    })
    oof_df.to_csv(os.path.join(fold_dir, "oof_scores.csv"), index=False)

    return oof_df

# Train every fold in turn and collect the held-out scores of each.
fold_frames = []
for f in range(N_FOLDS):
    fold_frames.append(train_one_fold(f))

# One frame with the out-of-fold scores of all rows, persisted for calibration.
all_oof = pd.concat(fold_frames, ignore_index=True)
all_oof_path = os.path.join(OUT_DIR, "all_oof_scores.csv")
all_oof.to_csv(all_oof_path, index=False)
print("Saved all OOF scores to", all_oof_path)


In [None]:
import scipy.optimize as opt
import torch.nn.functional as F

# map label int -> onehot [A,B,Tie]
def label_to_onehot(y):
    """Convert a sequence of integer class ids (0, 1, 2) into a float32 one-hot matrix.

    Returns an array of shape ``(len(y), 3)`` whose row ``i`` has a single 1.0
    in column ``y[i]``.
    """
    n_rows = len(y)
    onehot = np.zeros((n_rows, 3), dtype=np.float32)
    if n_rows:
        # One (row, column) coordinate per sample, written in a single indexed assignment.
        onehot[np.arange(n_rows), np.asarray(y)] = 1.0
    return onehot

# Reload the out-of-fold scores written by the training cell.
oof = pd.read_csv(os.path.join(OUT_DIR, "all_oof_scores.csv"))

# Module-level arrays read by logloss() during the calibration fit below.
y_true = label_to_onehot(oof["label"].values)
sa, sb = oof["score_a"].values, oof["score_b"].values

def probs_from_scores(sa, sb, temp, tie_alpha):
    """Turn two score arrays into 3-way probabilities ``[pA, pB, pTie]``.

    Bradley-Terry-style model with an explicit tie term:

        pA  ∝ exp(sa / temp)
        pB  ∝ exp(sb / temp)
        pT  ∝ exp(-|sa - sb| * tie_alpha)

    Parameters
    ----------
    sa, sb : 1-D array-likes of equal length with the scores of response A / B
    temp : positive temperature applied to the raw scores
    tie_alpha : positive sharpness of the tie term

    Returns
    -------
    numpy.ndarray of shape ``[N, 3]`` (float64); every row sums to 1.

    The normalisation is done as a softmax over the three log-weights with the
    row maximum subtracted first. Exponentiating the raw scores directly
    overflows to ``inf`` for large ``score / temp`` (yielding NaN rows) and,
    combined with an additive epsilon in the denominator, yields rows that do
    not sum to 1 when all three weights are tiny.
    """
    sa = np.asarray(sa, dtype=np.float64)
    sb = np.asarray(sb, dtype=np.float64)

    log_weights = np.stack(
        [sa / temp, sb / temp, -np.abs(sa - sb) * tie_alpha],
        axis=1,
    )
    # Shifting by the per-row maximum leaves the ratios unchanged and keeps
    # every exponent <= 0, so exp() can no longer overflow.
    log_weights = log_weights - log_weights.max(axis=1, keepdims=True)
    weights = np.exp(log_weights)
    return weights / weights.sum(axis=1, keepdims=True)

def logloss(params):
    """Mean multiclass log loss of the calibrated OOF probabilities.

    ``params`` holds ``[log(temp), log(tie_alpha)]``; optimising in log space
    keeps both quantities strictly positive. Reads the module-level ``sa``,
    ``sb`` and ``y_true`` arrays.
    """
    temp, tie_alpha = np.exp(params[0]), np.exp(params[1])
    # Clip away exact 0/1 so the logarithm stays finite.
    probs = np.clip(probs_from_scores(sa, sb, temp, tie_alpha), 1e-7, 1-1e-7)
    per_row_loglik = (y_true * np.log(probs)).sum(axis=1)
    return -np.mean(per_row_loglik)

import json

# Fit [log(temp), log(tie_alpha)] on the out-of-fold scores; x0 = [0, 0]
# corresponds to temp = tie_alpha = 1.
res = opt.minimize(logloss, x0=[0.0, 0.0], method="Nelder-Mead")
print("opt result:", res)

# Map the optimiser's log-space solution back to the actual parameters.
log_temp, log_tie_alpha = res.x[0], res.x[1]
best_temp = math.exp(log_temp)
best_tie_alpha = math.exp(log_tie_alpha)
print("best_temp =", best_temp, " best_tie_alpha =", best_tie_alpha)

# save calibration params
calib_path = os.path.join(OUT_DIR, "calibration_params.json")
calib_payload = {"temp": best_temp, "tie_alpha": best_tie_alpha}
with open(calib_path, "w") as f:
    json.dump(calib_payload, f)
print("Saved", calib_path)


In [None]:
import json
from glob import glob

# Reload the calibration parameters written by the calibration cell.
calib_file = os.path.join(OUT_DIR, "calibration_params.json")
with open(calib_file) as f:
    calib = json.load(f)
TEMP, TIE_ALPHA = calib["temp"], calib["tie_alpha"]

def get_model_for_fold(fold_id):
    """Rebuild the LoRA reward model and load the weights saved for ``fold_id``.

    The checkpoint is read from ``OUT_DIR/fold_<fold_id>/reward_teacher_roberta.pt``.
    Returns the model on ``DEVICE`` in eval mode.
    """
    fold_model = build_lora_reward_model(BASE_MODEL_NAME).to(DEVICE)
    checkpoint = os.path.join(OUT_DIR, f"fold_{fold_id}", "reward_teacher_roberta.pt")
    weights = torch.load(checkpoint, map_location=DEVICE)
    fold_model.load_state_dict(weights)
    # nn.Module.eval() returns the module itself.
    return fold_model.eval()

def score_pairs_df(df_any, max_len=MAX_LEN):
    """Score every row of ``df_any`` with all fold models and average the scores.

    Parameters
    ----------
    df_any : pandas.DataFrame
        Must carry the columns ``PairwisePrefDataset`` reads
        (``prompt``, ``response_a``, ``response_b``, ``label``, ``id``).
    max_len : int
        Token length passed to the dataset.

    Returns
    -------
    (sa_mean, sb_mean) : two arrays of length ``len(df_any)`` holding the
    fold-averaged scores of response A and response B, in row order.
    """
    ds = PairwisePrefDataset(df_any, tokenizer, max_len=max_len)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False)
    fold_scores_a = []
    fold_scores_b = []

    for fold_id in range(N_FOLDS):
        model_f = get_model_for_fold(fold_id)
        sa_list, sb_list = [], []
        with torch.no_grad():
            for batch in tqdm(loader, desc=f"Infer fold {fold_id}"):
                input_ids_a = batch["input_ids_a"].to(DEVICE)
                attn_a      = batch["attention_mask_a"].to(DEVICE)
                input_ids_b = batch["input_ids_b"].to(DEVICE)
                attn_b      = batch["attention_mask_b"].to(DEVICE)

                sa_list.append(model_f(input_ids_a, attn_a).cpu().numpy())
                sb_list.append(model_f(input_ids_b, attn_b).cpu().numpy())

        fold_scores_a.append(np.concatenate(sa_list))
        fold_scores_b.append(np.concatenate(sb_list))

        # Drop this fold's model before the next one is built. Otherwise the
        # old model stays referenced until the name is rebound, i.e. two full
        # models sit on the device at the same time.
        del model_f
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # average across folds
    sa_mean = np.mean(np.stack(fold_scores_a, axis=0), axis=0)
    sb_mean = np.mean(np.stack(fold_scores_b, axis=0), axis=0)
    return sa_mean, sb_mean

# score train
# NOTE(review): every training row was part of the training data of N_FOLDS - 1
# of the averaged models, so these scores are mostly in-sample, while TEMP /
# TIE_ALPHA were fitted on out-of-fold scores. Consider building the train
# targets from all_oof_scores.csv instead -- confirm the intended behaviour.
sa_train, sb_train = score_pairs_df(df)

# score test
df_test = pd.read_csv(TEST_CSV)
df_test["label"] = -1  # dummy; the dataset class requires a label column
# Stable hash instead of the built-in hash(), which is salted per Python
# process for strings and would give different ids on every kernel restart.
df_test["prompt_group"] = (
    pd.util.hash_pandas_object(df_test["prompt"].astype(str), index=False) % (10**9)
).astype("int64")
sa_test, sb_test = score_pairs_df(df_test)

def probs_from_scores_numpy(sa, sb, temp, tie_alpha):
    """Turn two score arrays into 3-way probabilities ``[pA, pB, pTie]``.

    Weights before normalisation:

        pA  ∝ exp(sa / temp)
        pB  ∝ exp(sb / temp)
        pT  ∝ exp(-|sa - sb| * tie_alpha)

    Parameters
    ----------
    sa, sb : 1-D array-likes of equal length with the scores of response A / B
    temp : positive temperature applied to the raw scores
    tie_alpha : positive sharpness of the tie term

    Returns
    -------
    numpy.ndarray of shape ``[N, 3]`` (float64); every row sums to 1.

    Implemented as a softmax over the three log-weights with the row maximum
    subtracted first: exponentiating raw scores directly overflows to ``inf``
    for large ``score / temp`` (giving NaN rows), and an additive epsilon in
    the denominator leaves rows that do not sum to 1 when all weights are tiny.
    """
    sa = np.asarray(sa, dtype=np.float64)
    sb = np.asarray(sb, dtype=np.float64)

    log_weights = np.stack(
        [sa / temp, sb / temp, -np.abs(sa - sb) * tie_alpha],
        axis=1,
    )
    # Per-row shift keeps all exponents <= 0 without changing the ratios.
    log_weights = log_weights - log_weights.max(axis=1, keepdims=True)
    weights = np.exp(log_weights)
    return weights / weights.sum(axis=1, keepdims=True)

# Calibrated [pA, pB, pTie] for every train and test row.
probs_train = probs_from_scores_numpy(sa_train, sb_train, TEMP, TIE_ALPHA)
probs_test  = probs_from_scores_numpy(sa_test,  sb_test,  TEMP, TIE_ALPHA)

# Train targets: id, the three probabilities, and the true label.
teacher_train_out = pd.DataFrame(probs_train, columns=["pA", "pB", "pTie"])
teacher_train_out.insert(0, "id", df["id"].values)
teacher_train_out["label"] = df["label"].values

# Test targets: id and the three probabilities (no label).
teacher_test_out = pd.DataFrame(probs_test, columns=["pA", "pB", "pTie"])
teacher_test_out.insert(0, "id", df_test["id"].values)

train_targets_path = os.path.join(OUT_DIR, "teacher_logits_train.csv")
test_targets_path = os.path.join(OUT_DIR, "teacher_logits_test.csv")
teacher_train_out.to_csv(train_targets_path, index=False)
teacher_test_out.to_csv(test_targets_path, index=False)

print("Saved distillation targets:")
print(" -", train_targets_path)
print(" -", test_targets_path)
