##Load the SFT Model and Add Scalar Reward Head

In [1]:
!pip install transformers datasets accelerate trl peft tensorboard


Collecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.24.0


In [2]:
import os
import math
import json
from pathlib import Path
from typing import Dict, Any
import glob
import shutil

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

from peft import LoraConfig, get_peft_model, PeftModel

from torch.utils.tensorboard import SummaryWriter

from transformers import get_linear_schedule_with_warmup

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
workspace = 'RewardModelTry2'

In [5]:
import os
os.chdir(f"/content/drive/My Drive/{workspace}")
print("Current working dir:", os.getcwd())


Current working dir: /content/drive/My Drive/RewardModelTry2


In [7]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, PeftModel

base_model = "gpt2"
adapter_path = "ArnavM3434/gpt2-alpaca-second-try"
model = AutoModelForCausalLM.from_pretrained(adapter_path)

In [21]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [22]:
def inspect_trainable_params(model):
    total = 0
    trainable = 0
    details = []
    for n, p in model.named_parameters():
        total += p.numel()
        if p.requires_grad:
            trainable += p.numel()
            details.append(n)
    print(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
    print("Example trainable params:", details[:20])
    return details

In [23]:
trainable_before = inspect_trainable_params(model)

Trainable params: 0 / 129,373,984 (0.00%)
Example trainable params: []


In [24]:
# Make sure LoRA parameters are trainable
for name, param in model.named_parameters():
    if "lora_" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False  # freeze base model

In [25]:
trainable_after = inspect_trainable_params(model)

Trainable params: 4,934,176 / 129,373,984 (3.81%)
Example trainable params: ['transformer.wte.lora_embedding_A.default', 'transformer.wte.lora_embedding_B.default', 'transformer.wpe.lora_embedding_A.default', 'transformer.wpe.lora_embedding_B.default', 'transformer.h.0.attn.c_attn.lora_A.default.weight', 'transformer.h.0.attn.c_attn.lora_B.default.weight', 'transformer.h.0.attn.c_proj.lora_A.default.weight', 'transformer.h.0.attn.c_proj.lora_B.default.weight', 'transformer.h.0.mlp.c_proj.lora_A.default.weight', 'transformer.h.0.mlp.c_proj.lora_B.default.weight', 'transformer.h.1.attn.c_attn.lora_A.default.weight', 'transformer.h.1.attn.c_attn.lora_B.default.weight', 'transformer.h.1.attn.c_proj.lora_A.default.weight', 'transformer.h.1.attn.c_proj.lora_B.default.weight', 'transformer.h.1.mlp.c_proj.lora_A.default.weight', 'transformer.h.1.mlp.c_proj.lora_B.default.weight', 'transformer.h.2.attn.c_attn.lora_A.default.weight', 'transformer.h.2.attn.c_attn.lora_B.default.weight', 'transfor

Add the Reward Model Head

In [26]:
model.reward_head = nn.Linear(model.config.hidden_size, 1)

In [27]:
for name, param in model.named_parameters():
    if "lora_" in name or "reward_head" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [28]:
inspect_trainable_params(model)

Trainable params: 4,934,945 / 129,374,753 (3.81%)
Example trainable params: ['transformer.wte.lora_embedding_A.default', 'transformer.wte.lora_embedding_B.default', 'transformer.wpe.lora_embedding_A.default', 'transformer.wpe.lora_embedding_B.default', 'transformer.h.0.attn.c_attn.lora_A.default.weight', 'transformer.h.0.attn.c_attn.lora_B.default.weight', 'transformer.h.0.attn.c_proj.lora_A.default.weight', 'transformer.h.0.attn.c_proj.lora_B.default.weight', 'transformer.h.0.mlp.c_proj.lora_A.default.weight', 'transformer.h.0.mlp.c_proj.lora_B.default.weight', 'transformer.h.1.attn.c_attn.lora_A.default.weight', 'transformer.h.1.attn.c_attn.lora_B.default.weight', 'transformer.h.1.attn.c_proj.lora_A.default.weight', 'transformer.h.1.attn.c_proj.lora_B.default.weight', 'transformer.h.1.mlp.c_proj.lora_A.default.weight', 'transformer.h.1.mlp.c_proj.lora_B.default.weight', 'transformer.h.2.attn.c_attn.lora_A.default.weight', 'transformer.h.2.attn.c_attn.lora_B.default.weight', 'transfor

['transformer.wte.lora_embedding_A.default',
 'transformer.wte.lora_embedding_B.default',
 'transformer.wpe.lora_embedding_A.default',
 'transformer.wpe.lora_embedding_B.default',
 'transformer.h.0.attn.c_attn.lora_A.default.weight',
 'transformer.h.0.attn.c_attn.lora_B.default.weight',
 'transformer.h.0.attn.c_proj.lora_A.default.weight',
 'transformer.h.0.attn.c_proj.lora_B.default.weight',
 'transformer.h.0.mlp.c_proj.lora_A.default.weight',
 'transformer.h.0.mlp.c_proj.lora_B.default.weight',
 'transformer.h.1.attn.c_attn.lora_A.default.weight',
 'transformer.h.1.attn.c_attn.lora_B.default.weight',
 'transformer.h.1.attn.c_proj.lora_A.default.weight',
 'transformer.h.1.attn.c_proj.lora_B.default.weight',
 'transformer.h.1.mlp.c_proj.lora_A.default.weight',
 'transformer.h.1.mlp.c_proj.lora_B.default.weight',
 'transformer.h.2.attn.c_attn.lora_A.default.weight',
 'transformer.h.2.attn.c_attn.lora_B.default.weight',
 'transformer.h.2.attn.c_proj.lora_A.default.weight',
 'transformer.

##Preference Dataset

Use Dahoas/rm-static

In [15]:
from datasets import load_dataset
dataset = load_dataset("Dahoas/rm-static")

README.md:   0%|          | 0.00/530 [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

data/train-00000-of-00001-2a1df75c6bce91(…):   0%|          | 0.00/68.4M [00:00<?, ?B/s]

data/test-00000-of-00001-8c7c51afc6d4598(…):   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/76256 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5103 [00:00<?, ? examples/s]

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 76256
    })
    test: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 5103
    })
})

In [17]:
from datasets import DatasetDict
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': dataset['test']
})

dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 76256
    })
    validation: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 5103
    })
})

In [18]:
train_ds = dataset['train']
train_ds[0]

{'prompt': '\n\nHuman: Can you describe the steps to clean fingerprints and smudges from a laptop screen\n\nAssistant: Yes, certainly. To clean your screen, you first need to use a microfiber cloth or soft, damp cloth to gently wipe down the surface of the screen. Next, you’ll want to grab a soft, lint-free, microfiber cleaning cloth and gently rub it back and forth across the screen to remove fingerprints and smudges.\n\nHuman: Can I spray isopropyl alcohol onto the cloth and clean it that way?\n\nAssistant:',
 'response': ' Yes, you can do that to help the cloth pick up even more dirt from the screen. Be sure to always use a clean, soft cloth, not a piece of scratchy, roughened, or textured material, and make sure it’s lint-free.',
 'chosen': ' Yes, you can do that to help the cloth pick up even more dirt from the screen. Be sure to always use a clean, soft cloth, not a piece of scratchy, roughened, or textured material, and make sure it’s lint-free.',
 'rejected': ' Yes, you can spr

In [19]:
train_ds[1]

{'prompt': '\n\nHuman: What are some foods that are good for diabetics?\n\nAssistant: To be honest, some of these are better than others, and they’re a little more like opinions than facts. For example, many of the diets say to limit vegetables with high sugar content, and there’s some debate on the subject, as far as how much of these vegetables are actually bad for diabetics.\n\nHuman: Okay, any other advice?\n\nAssistant:',
 'response': ' What exactly are you asking? There’s a lot of different kinds of diabetic diets. I could try to recommend you some specific foods and recipes. I could help you look up any of the foods, and I could find recipes for them.',
 'chosen': ' What exactly are you asking? There’s a lot of different kinds of diabetic diets. I could try to recommend you some specific foods and recipes. I could help you look up any of the foods, and I could find recipes for them.',
 'rejected': ' Sure, we’ve got information on common mistakes that diabetic patients make with 

In [20]:
val_ds = dataset['validation']
val_ds[0]

{'prompt': "\n\nHuman: I am trying to write a fairy tale. What is the most popular plot?\n\nAssistant: The most popular plot might be “a princess goes to a faraway land, falls in love, and has a magic genie that grants her wishes”.  We can find plenty of examples of this if we search for fairy tales using the search engine Google.\n\nHuman: You can't look anything up on google.\n\nAssistant: OK, so let’s think about some other popular fairy tales.\n\nHuman: Do you like the plot of Hansel and Gretel?\n\nAssistant: Not particularly.\n\nHuman: what don't you like?\n\nAssistant: I feel like the children do very stupid things.\n\nHuman: Should I include a witch on my story?\n\nAssistant: I think a witch is a fairly common fairy tale element, so yes.  However, what kind of plotline would you like to pursue?\n\nHuman: I was thinking I might pursue a feminist plot for a more modern fairytale.\n\nAssistant: I would be happy to help you make your story a more feminist one.  Are you writing a new

In [21]:
import re

def keep_last_turn(example): #Want just single turn completions, since SFT model was trained on that distribution
    prompt = example["prompt"]

    human_turns = [m.start() for m in re.finditer(r'\bHuman:', prompt)]
    if not human_turns:
        return example

    last_human_start = human_turns[-1]
    trimmed_prompt = prompt[last_human_start:].strip()

    trimmed_prompt = re.sub(r'\s*\n+\s*Assistant:', ' Assistant:', trimmed_prompt)

    if not trimmed_prompt.endswith("Assistant:"):
        trimmed_prompt += " Assistant:"

    trimmed_prompt = re.sub(r'\s+', ' ', trimmed_prompt).strip()

    example["prompt"] = trimmed_prompt
    return example


train_ds = train_ds.map(
    keep_last_turn
)

val_ds = val_ds.map(
    keep_last_turn
)




Map:   0%|          | 0/76256 [00:00<?, ? examples/s]

Map:   0%|          | 0/5103 [00:00<?, ? examples/s]

In [22]:
train_ds[0]

{'prompt': 'Human: Can I spray isopropyl alcohol onto the cloth and clean it that way? Assistant:',
 'response': ' Yes, you can do that to help the cloth pick up even more dirt from the screen. Be sure to always use a clean, soft cloth, not a piece of scratchy, roughened, or textured material, and make sure it’s lint-free.',
 'chosen': ' Yes, you can do that to help the cloth pick up even more dirt from the screen. Be sure to always use a clean, soft cloth, not a piece of scratchy, roughened, or textured material, and make sure it’s lint-free.',
 'rejected': ' Yes, you can spray it directly onto the cloth.'}

In [23]:
val_ds[0]

{'prompt': 'Human: The witch could be trying to make the prince give up his traditionally female characteristics and turn him into a bad person. Assistant:',
 'response': ' This sounds like a really interesting modern retelling of the story!',
 'chosen': ' This sounds like a really interesting modern retelling of the story!',
 'rejected': ' And the prince and the princess both decide that they are more powerful together than apart?'}

In [24]:
max_length = 1024

def tokenize_example(example):
    chosen_text = example["prompt"] + example["chosen"]
    rejected_text = example["prompt"] + example["rejected"]

    chosen = tokenizer(
        chosen_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    rejected = tokenizer(
        rejected_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

    return {
        "chosen_input_ids": chosen["input_ids"],
        "chosen_attention_mask": chosen["attention_mask"],
        "rejected_input_ids": rejected["input_ids"],
        "rejected_attention_mask": rejected["attention_mask"],
    }

tokenized_train_ds = train_ds.map(
    tokenize_example,
    remove_columns=train_ds.column_names,
)

tokenized_val_ds = val_ds.map(
    tokenize_example,
    remove_columns=val_ds.column_names,
)

Map:   0%|          | 0/76256 [00:00<?, ? examples/s]

Map:   0%|          | 0/5103 [00:00<?, ? examples/s]

In [25]:
print(tokenized_train_ds.column_names)

['chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask']


##Training the Reward Model

In [45]:
train_batch_size = 4
gradient_accumulation_steps = 4
num_epochs = 3
lr = 9e-6 #from InstructGPT paper
weight_decay = 0.0
warmup_steps = 100
save_dir = "./reward-model-accel"
checkpoint_prefix = "checkpoint"
save_steps = 200
logging_steps = 50
eval_steps = 200
fp16 = True
num_workers = 2
max_checkpoints = 2

In [46]:
from accelerate import Accelerator
accelerator = Accelerator(
    mixed_precision="fp16",
    gradient_accumulation_steps=gradient_accumulation_steps
)
device = accelerator.device
print("Running on", device)

Running on cuda


In [47]:
def ensure_dir(path):
    Path(path).mkdir(parents=True, exist_ok=True)

def save_training_state(save_dir: str, step: int, accelerator, optimizer, scheduler, scaler):
    save_dir = Path(save_dir)
    ckpt_dir = save_dir / f"{checkpoint_prefix}-{step}"
    ensure_dir(ckpt_dir)

    # Save PEFT adapter weights (preferred)
    model_to_save = accelerator.unwrap_model(model)
    peft_save_dir = ckpt_dir / "adapter"
    model_to_save.save_pretrained(peft_save_dir)

    if hasattr(model, "reward_head"):
        reward_head_path = ckpt_dir / "reward_head.pt"
        torch.save(model.reward_head.state_dict(), reward_head_path)

    # Save optimizer/scheduler/scaler states
    accelerator.save_state(str(ckpt_dir / "acc_state"))

    # Save meta info
    meta = {"step": step}
    (ckpt_dir / "meta.json").write_text(json.dumps(meta))

    print(f"Saved checkpoint to {ckpt_dir}")

    # --- Delete old checkpoints if more than MAX_CHECKPOINTS ---
    all_ckpts = sorted(glob.glob(str(save_dir / f"{checkpoint_prefix}-*")),
                       key=lambda x: Path(x).stat().st_mtime)
    while len(all_ckpts) > max_checkpoints:
        old_ckpt = Path(all_ckpts.pop(0))
        print(f"Deleting old checkpoint: {old_ckpt}")
        shutil.rmtree(old_ckpt)


In [48]:
def collate_fn(batch):
    chosen_input_ids = torch.tensor([b["chosen_input_ids"] for b in batch], dtype=torch.long)
    chosen_attention_mask = torch.tensor([b["chosen_attention_mask"] for b in batch], dtype=torch.long)
    rejected_input_ids = torch.tensor([b["rejected_input_ids"] for b in batch], dtype=torch.long)
    rejected_attention_mask = torch.tensor([b["rejected_attention_mask"] for b in batch], dtype=torch.long)
    return {"chosen_input_ids": chosen_input_ids, "chosen_attention_mask": chosen_attention_mask, "rejected_input_ids": rejected_input_ids, "rejected_attention_mask": rejected_attention_mask}

In [50]:
ensure_dir(save_dir)

In [51]:
writer = SummaryWriter(log_dir=os.path.join(save_dir, "tensorboard"))

In [52]:
model.gradient_checkpointing_enable()

In [53]:
for n, p in model.named_parameters():
    if p.requires_grad:
        p.data = p.data.to(accelerator.device)  # ensure params are on same device

In [54]:
train_dataloader = DataLoader(tokenized_train_ds, shuffle=True, collate_fn=collate_fn,batch_size=train_batch_size, num_workers=num_workers)
eval_dataloader = DataLoader(tokenized_val_ds, shuffle=False, collate_fn=collate_fn, batch_size=train_batch_size, num_workers=num_workers)

In [55]:
no_decay = ["bias", "LayerNorm.weight"]
param_groups = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
optimizer = AdamW(param_groups, lr=lr)

In [56]:
total_train_steps = math.ceil(len(train_dataloader) * num_epochs / gradient_accumulation_steps)

In [57]:
from transformers import get_cosine_schedule_with_warmup

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_train_steps,
    num_cycles=0.5,
)

In [59]:
import torch
from pathlib import Path

def load_full_checkpoint(accelerator, model):
    ckpt_dir = Path("reward-model-accel/checkpoint-1400")
    adapter_dir = ckpt_dir / "adapter"
    reward_head_path = ckpt_dir / "reward_head.pt"
    acc_state_dir = ckpt_dir / "acc_state"

    model = PeftModel.from_pretrained(model, str(adapter_dir))

    if reward_head_path.exists():
        model.reward_head.load_state_dict(torch.load(reward_head_path, map_location="cpu"))
        print("Loaded reward_head state dict")

    accelerator.load_state(str(acc_state_dir))
    print(f"Loaded accelerator state from {acc_state_dir}")

    return model

load_full_checkpoint(accelerator, model)




Loaded reward_head state dict
Loaded accelerator state from reward-model-accel/checkpoint-1400/acc_state


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): lora.Embedding(
          (base_layer): Embedding(50257, 768)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.2, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 32x50257])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 768x32])
          (lora_magnitude_vector): ModuleDict()
        )
        (wpe): lora.Embedding(
          (base_layer): Embedding(1024, 768)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.2, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 32

In [60]:
model, optimizer, train_dataloader, eval_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, scheduler
    )

In [61]:
inspect_trainable_params(model)

Trainable params: 0 / 129,374,753 (0.00%)
Example trainable params: []


[]

In [62]:
for name, param in model.named_parameters():
    if "lora_" in name or "reward_head" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [63]:
inspect_trainable_params(model)

Trainable params: 4,934,945 / 129,374,753 (3.81%)
Example trainable params: ['transformer.wte.lora_embedding_A.default', 'transformer.wte.lora_embedding_B.default', 'transformer.wpe.lora_embedding_A.default', 'transformer.wpe.lora_embedding_B.default', 'transformer.h.0.attn.c_attn.lora_A.default.weight', 'transformer.h.0.attn.c_attn.lora_B.default.weight', 'transformer.h.0.attn.c_proj.lora_A.default.weight', 'transformer.h.0.attn.c_proj.lora_B.default.weight', 'transformer.h.0.mlp.c_proj.lora_A.default.weight', 'transformer.h.0.mlp.c_proj.lora_B.default.weight', 'transformer.h.1.attn.c_attn.lora_A.default.weight', 'transformer.h.1.attn.c_attn.lora_B.default.weight', 'transformer.h.1.attn.c_proj.lora_A.default.weight', 'transformer.h.1.attn.c_proj.lora_B.default.weight', 'transformer.h.1.mlp.c_proj.lora_A.default.weight', 'transformer.h.1.mlp.c_proj.lora_B.default.weight', 'transformer.h.2.attn.c_attn.lora_A.default.weight', 'transformer.h.2.attn.c_attn.lora_B.default.weight', 'transfor

['transformer.wte.lora_embedding_A.default',
 'transformer.wte.lora_embedding_B.default',
 'transformer.wpe.lora_embedding_A.default',
 'transformer.wpe.lora_embedding_B.default',
 'transformer.h.0.attn.c_attn.lora_A.default.weight',
 'transformer.h.0.attn.c_attn.lora_B.default.weight',
 'transformer.h.0.attn.c_proj.lora_A.default.weight',
 'transformer.h.0.attn.c_proj.lora_B.default.weight',
 'transformer.h.0.mlp.c_proj.lora_A.default.weight',
 'transformer.h.0.mlp.c_proj.lora_B.default.weight',
 'transformer.h.1.attn.c_attn.lora_A.default.weight',
 'transformer.h.1.attn.c_attn.lora_B.default.weight',
 'transformer.h.1.attn.c_proj.lora_A.default.weight',
 'transformer.h.1.attn.c_proj.lora_B.default.weight',
 'transformer.h.1.mlp.c_proj.lora_A.default.weight',
 'transformer.h.1.mlp.c_proj.lora_B.default.weight',
 'transformer.h.2.attn.c_attn.lora_A.default.weight',
 'transformer.h.2.attn.c_attn.lora_B.default.weight',
 'transformer.h.2.attn.c_proj.lora_A.default.weight',
 'transformer.

In [64]:
starting_step = 1400
scaler = torch.cuda.amp.GradScaler(enabled=(accelerator.state.mixed_precision == "fp16"))

  scaler = torch.cuda.amp.GradScaler(enabled=(accelerator.state.mixed_precision == "fp16"))


In [65]:
print(starting_step)

1400


In [66]:
global_step = starting_step
model.train()
print("***** Running training *****")
print(f"  Num examples = {len(tokenized_train_ds)}")
print(f"  Num Epochs = {num_epochs}")
print(f"  Instantaneous batch size per device = {train_batch_size}")
print(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")
print(f"  Total optimization steps = {total_train_steps}")

***** Running training *****
  Num examples = 76256
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 14298


In [None]:
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for step, batch in enumerate(train_dataloader):

        #in case we are resuming training
        accumulated_steps = (epoch * len(train_dataloader) + step) // gradient_accumulation_steps
        if accumulated_steps < starting_step:
            continue

        # batch: dict of tensors
        with accelerator.accumulate(model):
            # forward with amp context
            with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
                chosen_outputs = model(
                    input_ids=batch["chosen_input_ids"],
                    attention_mask=batch["chosen_attention_mask"],
                    output_hidden_states=True
                )
                rejected_outputs = model(
                    input_ids=batch["rejected_input_ids"],
                    attention_mask=batch["rejected_attention_mask"],
                    output_hidden_states=True
                )
                chosen_last_idx = batch["chosen_attention_mask"].sum(dim=1) - 1
                rejected_last_idx = batch["rejected_attention_mask"].sum(dim=1) - 1
                batch_indices = torch.arange(chosen_outputs.hidden_states[-1].size(0), device=chosen_outputs.hidden_states[-1].device)

                chosen_last_hidden = chosen_outputs.hidden_states[-1][batch_indices, chosen_last_idx, :]
                rejected_last_hidden = rejected_outputs.hidden_states[-1][batch_indices, rejected_last_idx, :]

                chosen_rewards = model.reward_head(chosen_last_hidden).squeeze(-1)
                rejected_rewards = model.reward_head(rejected_last_hidden).squeeze(-1)

                chosen_rewards = chosen_rewards.clamp(-10, 10)
                rejected_rewards = rejected_rewards.clamp(-10, 10)

                loss = torch.nn.functional.softplus(-(chosen_rewards - rejected_rewards)).mean()

                loss = loss / gradient_accumulation_steps

            accelerator.backward(loss)
            epoch_loss += loss.item() * gradient_accumulation_steps  # un-scaled per-batch loss

            # gradient clipping on unwrapped model parameters (PEFT might wrap)
            if accelerator.sync_gradients:
                clip_grad_norm_(model.parameters(), max_norm=1.0)

            # optimizer step handled by accelerator when accumulate finishes
            if accelerator.sync_gradients:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

                # Logging
                if global_step % logging_steps == 0:
                    adjusted_steps = step - (starting_step * gradient_accumulation_steps - (epoch * len(train_dataloader))) #how many batches you processed this epoch
                    avg_loss = epoch_loss * train_batch_size / ((adjusted_steps + 1) * train_batch_size) #epoch_loss is sum of per-batch mean losses
                    #(step + 1) * train_batch_size is number of samples seen so far
                    print(f"Epoch {epoch+1} Step {global_step} loss {avg_loss:.4f}")
                    writer.add_scalar("train/loss", avg_loss, global_step)
                    writer.add_scalar("train/lr", scheduler.get_last_lr()[0], global_step)

                # Evaluation
                if global_step % eval_steps == 0:
                    model.eval()
                    total_eval_loss = 0.0
                    batches = 0
                    accuracy = 0.0
                    for eval_batch in eval_dataloader:
                        with torch.no_grad():
                            with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
                                chosen_outputs = model(
                                    input_ids=eval_batch["chosen_input_ids"],
                                    attention_mask=eval_batch["chosen_attention_mask"],
                                    output_hidden_states=True
                                )
                                rejected_outputs = model(
                                    input_ids=eval_batch["rejected_input_ids"],
                                    attention_mask=eval_batch["rejected_attention_mask"],
                                    output_hidden_states=True
                                )
                                chosen_last_idx = eval_batch["chosen_attention_mask"].sum(dim=1) - 1
                                rejected_last_idx = eval_batch["rejected_attention_mask"].sum(dim=1) - 1
                                batch_indices = torch.arange(chosen_outputs.hidden_states[-1].size(0), device=chosen_outputs.hidden_states[-1].device)

                                chosen_last_hidden = chosen_outputs.hidden_states[-1][batch_indices, chosen_last_idx, :]
                                rejected_last_hidden = rejected_outputs.hidden_states[-1][batch_indices, rejected_last_idx, :]

                                chosen_rewards = model.reward_head(chosen_last_hidden).squeeze(-1)
                                rejected_rewards = model.reward_head(rejected_last_hidden).squeeze(-1)

                                chosen_rewards = chosen_rewards.clamp(-10, 10)
                                rejected_rewards = rejected_rewards.clamp(-10, 10)

                                loss = torch.nn.functional.softplus(-(chosen_rewards - rejected_rewards)).mean()

                                accuracy += (chosen_rewards > rejected_rewards).float().mean()

                                total_eval_loss += loss.item()
                                batches += 1
                    avg_eval_loss = total_eval_loss / batches
                    accuracy = accuracy / batches
                    print(f"*** Eval at step {global_step}: loss {avg_eval_loss:.4f}")
                    print(f"*** Accuracy at step {global_step}: accuracy {accuracy:.4f}")
                    writer.add_scalar("eval/loss", avg_eval_loss, global_step)
                    model.train()

                # Save checkpoint
                if global_step % save_steps == 0:
                    # Save PEFT adapter and accelerator state
                    save_training_state(save_dir, global_step, accelerator, optimizer, scheduler, scaler)

    # epoch end
    # Save checkpoint at epoch end
    save_training_state(save_dir, f"epoch-{epoch+1}", accelerator, optimizer, scheduler, scaler)

# final save
save_training_state(save_dir, f"final-{global_step}", accelerator, optimizer, scheduler, scaler)
print("Training complete")
writer.close()

  with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch 1 Step 1450 loss 0.6475
Epoch 1 Step 1500 loss 0.6527
Epoch 1 Step 1550 loss 0.6532
Epoch 1 Step 1600 loss 0.6506


  with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):


*** Eval at step 1600: loss 0.6510
*** Accuracy at step 1600: accuracy 0.6119




Saved checkpoint to reward-model-accel/checkpoint-1600
Epoch 1 Step 1650 loss 0.6531
Epoch 1 Step 1700 loss 0.6528
Epoch 1 Step 1750 loss 0.6501
Epoch 1 Step 1800 loss 0.6486
*** Eval at step 1800: loss 0.6510
*** Accuracy at step 1800: accuracy 0.6125




Saved checkpoint to reward-model-accel/checkpoint-1800
Deleting old checkpoint: reward-model-accel/checkpoint-1400
Epoch 1 Step 1850 loss 0.6489
Epoch 1 Step 1900 loss 0.6510
Epoch 1 Step 1950 loss 0.6533
Epoch 1 Step 2000 loss 0.6543
*** Eval at step 2000: loss 0.6503
*** Accuracy at step 2000: accuracy 0.6127




Saved checkpoint to reward-model-accel/checkpoint-2000
Deleting old checkpoint: reward-model-accel/checkpoint-1600
Epoch 1 Step 2050 loss 0.6530
Epoch 1 Step 2100 loss 0.6519
Epoch 1 Step 2150 loss 0.6521
Epoch 1 Step 2200 loss 0.6525
*** Eval at step 2200: loss 0.6501
*** Accuracy at step 2200: accuracy 0.6129




Saved checkpoint to reward-model-accel/checkpoint-2200
Deleting old checkpoint: reward-model-accel/checkpoint-1800
Epoch 1 Step 2250 loss 0.6521
Epoch 1 Step 2300 loss 0.6523
Epoch 1 Step 2350 loss 0.6517
Epoch 1 Step 2400 loss 0.6512
*** Eval at step 2400: loss 0.6500
*** Accuracy at step 2400: accuracy 0.6137




Saved checkpoint to reward-model-accel/checkpoint-2400
Deleting old checkpoint: reward-model-accel/checkpoint-2000
Epoch 1 Step 2450 loss 0.6516
Epoch 1 Step 2500 loss 0.6512
Epoch 1 Step 2550 loss 0.6509
Epoch 1 Step 2600 loss 0.6507
*** Eval at step 2600: loss 0.6498
*** Accuracy at step 2600: accuracy 0.6141




Saved checkpoint to reward-model-accel/checkpoint-2600
Deleting old checkpoint: reward-model-accel/checkpoint-2200
Epoch 1 Step 2650 loss 0.6506
Epoch 1 Step 2700 loss 0.6514
Epoch 1 Step 2750 loss 0.6510
Epoch 1 Step 2800 loss 0.6511
*** Eval at step 2800: loss 0.6496
*** Accuracy at step 2800: accuracy 0.6147




Saved checkpoint to reward-model-accel/checkpoint-2800
Deleting old checkpoint: reward-model-accel/checkpoint-2400
Epoch 1 Step 2850 loss 0.6505
Epoch 1 Step 2900 loss 0.6503
Epoch 1 Step 2950 loss 0.6510
Epoch 1 Step 3000 loss 0.6509
*** Eval at step 3000: loss 0.6491
*** Accuracy at step 3000: accuracy 0.6131




Saved checkpoint to reward-model-accel/checkpoint-3000
Deleting old checkpoint: reward-model-accel/checkpoint-2600
Epoch 1 Step 3050 loss 0.6513
Epoch 1 Step 3100 loss 0.6520
Epoch 1 Step 3150 loss 0.6524
Epoch 1 Step 3200 loss 0.6527
*** Eval at step 3200: loss 0.6489
*** Accuracy at step 3200: accuracy 0.6137




Saved checkpoint to reward-model-accel/checkpoint-3200
Deleting old checkpoint: reward-model-accel/checkpoint-2800
Epoch 1 Step 3250 loss 0.6529
Epoch 1 Step 3300 loss 0.6527
Epoch 1 Step 3350 loss 0.6525
Epoch 1 Step 3400 loss 0.6523
*** Eval at step 3400: loss 0.6487
*** Accuracy at step 3400: accuracy 0.6143




Saved checkpoint to reward-model-accel/checkpoint-3400
Deleting old checkpoint: reward-model-accel/checkpoint-3000
Epoch 1 Step 3450 loss 0.6524
Epoch 1 Step 3500 loss 0.6523
Epoch 1 Step 3550 loss 0.6520
Epoch 1 Step 3600 loss 0.6523
*** Eval at step 3600: loss 0.6488
*** Accuracy at step 3600: accuracy 0.6143




Saved checkpoint to reward-model-accel/checkpoint-3600
Deleting old checkpoint: reward-model-accel/checkpoint-3200
Epoch 1 Step 3650 loss 0.6521
Epoch 1 Step 3700 loss 0.6521
Epoch 1 Step 3750 loss 0.6521
Epoch 1 Step 3800 loss 0.6519
*** Eval at step 3800: loss 0.6486
*** Accuracy at step 3800: accuracy 0.6141




Saved checkpoint to reward-model-accel/checkpoint-3800
Deleting old checkpoint: reward-model-accel/checkpoint-3400
Epoch 1 Step 3850 loss 0.6523
Epoch 1 Step 3900 loss 0.6519
Epoch 1 Step 3950 loss 0.6514
Epoch 1 Step 4000 loss 0.6516
*** Eval at step 4000: loss 0.6484
*** Accuracy at step 4000: accuracy 0.6155




Saved checkpoint to reward-model-accel/checkpoint-4000
Deleting old checkpoint: reward-model-accel/checkpoint-3600


##Push to the Hugging Face Hub

In [29]:
ckpt_dir = Path("reward-model-accel/checkpoint-4000")
adapter_dir = ckpt_dir / "adapter"
reward_head_path = ckpt_dir / "reward_head.pt"

model = PeftModel.from_pretrained(model, str(adapter_dir))

if reward_head_path.exists():
    model.reward_head.load_state_dict(torch.load(reward_head_path, map_location="cpu"))
    print("Loaded reward_head state dict")




Loaded reward_head state dict


In [30]:
model.push_to_hub("ArnavM3434/reward-model-2nd-try")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   3%|2         |  551kB / 19.7MB            

CommitInfo(commit_url='https://huggingface.co/ArnavM3434/reward-model-2nd-try/commit/3c195f73028ab9a9501c26e3f0bf2c415d3000f0', commit_message='Upload model', commit_description='', oid='3c195f73028ab9a9501c26e3f0bf2c415d3000f0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ArnavM3434/reward-model-2nd-try', endpoint='https://huggingface.co', repo_type='model', repo_id='ArnavM3434/reward-model-2nd-try'), pr_revision=None, pr_num=None)

In [31]:
from huggingface_hub import HfApi

repo_name = "ArnavM3434/reward-model-2nd-try"
reward_head_path = "reward-model-accel/checkpoint-4000/reward_head.pt"

api = HfApi()
api.upload_file(
    path_or_fileobj=reward_head_path,
    path_in_repo="reward_head.pt",
    repo_id=repo_name,
    token=True
)

print("reward_head.pt uploaded to Hugging Face Hub")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...point-4000/reward_head.pt: 100%|##########| 5.00kB / 5.00kB            

reward_head.pt uploaded to Hugging Face Hub
