In [None]:

pip install -q transformers datasets accelerate evaluate

In [1]:
import os
import math
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from accelerate import Accelerator

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [3]:
CSV_PATH = "instruction_following_dataset.csv"   # your CSV
MODEL_NAME = "microsoft/deberta-v3-base"
OUTPUT_DIR = "judge1_reward_model_alt"
MAX_LENGTH = 512
BATCH_SIZE = 8             # reduce if OOM
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
WARMUP_STEPS = 500         # tune

os.makedirs(OUTPUT_DIR, exist_ok=True)

accelerator = Accelerator(
    gradient_accumulation_steps=4,  # accumulate instead of large batch
    mixed_precision="fp16"
)

DEVICE = accelerator.device

In [6]:
df = pd.read_csv('/kaggle/input/instruction-following-dataset/instruction_following_dataset.csv').sample(n=100000, random_state=42)
print(df.shape)
hf_ds = Dataset.from_pandas(df)

(100000, 3)


In [7]:
def build_pair_texts(examples):
    prompts = examples["prompt"]
    chosen = examples["chosen_response"]
    rejected = examples["rejected_response"]
    chosen_texts = [f"Question: {p}\n\nAnswer: {c}" for p,c in zip(prompts, chosen)]
    rejected_texts = [f"Question: {p}\n\nAnswer: {r}" for p,r in zip(prompts, rejected)]
    return {"chosen_text": chosen_texts, "rejected_text": rejected_texts}

print("Creating pairwise texts...")
hf_ds = hf_ds.map(build_pair_texts, batched=True, remove_columns=[c for c in hf_ds.column_names if c not in []])


Creating pairwise texts...


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

2025-09-13 17:24:02.528541: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757784242.709718      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757784242.771049      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
class PairwiseTorchDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset):
        self.ds = hf_dataset
    def __len__(self):
        return len(self.ds)
    def __getitem__(self, idx):
        item = self.ds[int(idx)]
        return {"chosen_text": item["chosen_text"], "rejected_text": item["rejected_text"]}

def collate_fn(batch):
    chosen_texts = [b["chosen_text"] for b in batch]
    rejected_texts = [b["rejected_text"] for b in batch]
    chosen_enc = tokenizer(chosen_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="pt")
    rejected_enc = tokenizer(rejected_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="pt")
    return {
        "chosen_input_ids": chosen_enc["input_ids"],
        "chosen_attention_mask": chosen_enc["attention_mask"],
        "rejected_input_ids": rejected_enc["input_ids"],
        "rejected_attention_mask": rejected_enc["attention_mask"],
    }

torch_ds = PairwiseTorchDataset(hf_ds)
dataloader = DataLoader(
    torch_ds, batch_size=BATCH_SIZE, shuffle=True,
    collate_fn=collate_fn, num_workers=0, pin_memory=True
)


In [10]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=min(WARMUP_STEPS, total_steps),
    num_training_steps=total_steps
)

model, optimizer, dataloader, scheduler = accelerator.prepare(
    model, optimizer, dataloader, scheduler
)

In [11]:
global_step = 0
model.train()

for epoch in range(NUM_EPOCHS):
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}", disable=not accelerator.is_main_process)
    for step, batch in enumerate(pbar):
        optimizer.zero_grad()

        chosen_inputs = {
            "input_ids": batch["chosen_input_ids"],
            "attention_mask": batch["chosen_attention_mask"],
        }
        rejected_inputs = {
            "input_ids": batch["rejected_input_ids"],
            "attention_mask": batch["rejected_attention_mask"],
        }

        with accelerator.autocast():
            logits_chosen = model(**chosen_inputs).logits.squeeze(-1)     # (B,)
            logits_rejected = model(**rejected_inputs).logits.squeeze(-1) # (B,)
            diff = logits_chosen - logits_rejected
            loss = -(torch.log(torch.sigmoid(diff) + 1e-12)).mean()

        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()

        global_step += 1
        if accelerator.is_main_process and step % 50 == 0:
            pbar.set_postfix({"loss": f"{loss.item():.4f}", "step": global_step})

    # ---- Save checkpoint at end of epoch ----
    if accelerator.is_main_process:
        save_path = os.path.join(OUTPUT_DIR, f"checkpoint_epoch_{epoch+1}")
        accelerator.unwrap_model(model).save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        accelerator.save_state(save_path)
        print(f"Checkpoint saved to {save_path}")

# -----------------------------
# FINAL SAVE + ZIP FOR DOWNLOAD
# -----------------------------
if accelerator.is_main_process:
    final_model_path = os.path.join(OUTPUT_DIR, "final_model")
    accelerator.unwrap_model(model).save_pretrained(final_model_path)
    tokenizer.save_pretrained(final_model_path)
    accelerator.save_state(final_model_path)
    print(f"Final model saved to {final_model_path}")

    # Zip the model for easy download (Kaggle/Colab)
    shutil.make_archive("trained_model", 'zip', final_model_path)
    print("Zipped model to trained_model.zip")

print("✅ Training complete")

Epoch 1/3:   0%|          | 0/12500 [00:00<?, ?it/s]

Checkpoint saved to judge1_reward_model_alt/checkpoint_epoch_1


Epoch 2/3:   0%|          | 0/12500 [00:00<?, ?it/s]

Checkpoint saved to judge1_reward_model_alt/checkpoint_epoch_2


Epoch 3/3:   0%|          | 0/12500 [00:00<?, ?it/s]

Checkpoint saved to judge1_reward_model_alt/checkpoint_epoch_3
Final model saved to judge1_reward_model_alt/final_model


NameError: name 'shutil' is not defined

In [13]:
import shutil

# Choose final model or last checkpoint
to_zip = final_model_path if os.path.exists(final_model_path) else latest_checkpoint
shutil.make_archive("trained_model", 'zip', to_zip)
print("Zipped model to trained_model.zip")


Zipped model to trained_model.zip


In [14]:

print("Saving model to", OUTPUT_DIR)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Done.")

Saving model to judge1_reward_model_alt
Done.


In [None]:
OUTPUT_DIR ="/kaggle/working/"

In [None]:
from IPython.display import FileLink
FileLink()