##Load the SFT Model and Add Scalar Reward Head

In [1]:
!pip install transformers datasets accelerate trl peft tensorboard


Collecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.24.0


In [2]:
import os
import math
import json
from pathlib import Path
from typing import Dict, Any
import glob
import shutil

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

from peft import LoraConfig, get_peft_model, PeftModel

from torch.utils.tensorboard import SummaryWriter

from transformers import get_linear_schedule_with_warmup

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
workspace = 'RLHF-Reproduction'

In [5]:
import os
os.chdir(f"/content/drive/My Drive/{workspace}")
print("Current working dir:", os.getcwd())


Current working dir: /content/drive/My Drive/RLHF-Reproduction


In [6]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, PeftModel

base_model = "gpt2"
adapter_path = "ArnavM3434/gpt2-alpaca-lora"
model = AutoModelForCausalLM.from_pretrained(base_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
LORA_CONFIG = dict(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

lora_config = LoraConfig(**LORA_CONFIG)

model = get_peft_model(model, lora_config)



In [9]:
model = PeftModel.from_pretrained(model, adapter_path)

adapter_config.json:   0%|          | 0.00/822 [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

In [10]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def inspect_trainable_params(model):
    total = 0
    trainable = 0
    details = []
    for n, p in model.named_parameters():
        total += p.numel()
        if p.requires_grad:
            trainable += p.numel()
            details.append(n)
    print(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
    print("Example trainable params:", details[:20])
    return details

In [None]:
trainable_before = inspect_trainable_params(model)

Trainable params: 0 / 124,734,720 (0.00%)
Example trainable params: []


In [None]:
# Make sure LoRA parameters are trainable
for name, param in model.named_parameters():
    if "lora_" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False  # freeze base model

In [None]:
trainable_after = inspect_trainable_params(model)

Trainable params: 294,912 / 124,734,720 (0.24%)
Example trainable params: ['base_model.model.base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.2.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.2.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.3.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.3.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.4.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.4.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.5.attn

Add the Reward Model Head

In [11]:
model.reward_head = nn.Linear(model.config.hidden_size, 1)

In [12]:
for name, param in model.named_parameters():
    if "lora_" in name or "reward_head" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [None]:
trainable_with_reward_head = inspect_trainable_params(model)

Trainable params: 295,681 / 124,735,489 (0.24%)
Example trainable params: ['base_model.model.base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.2.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.2.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.3.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.3.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.4.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.4.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.5.attn

##Preference Dataset

Use Dahoas/rm-static

In [None]:
from datasets import load_dataset
dataset = load_dataset("Dahoas/rm-static")

README.md:   0%|          | 0.00/530 [00:00<?, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 9a04e1c7-538e-45e6-993b-697c66c11a39)')' thrown while requesting HEAD https://huggingface.co/datasets/Dahoas/rm-static/resolve/64fd53cc91f7cb73b283a6e4f661205e277d23c9/rm-static.py
Retrying in 1s [Retry 1/5].


dataset_infos.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

data/train-00000-of-00001-2a1df75c6bce91(…):   0%|          | 0.00/68.4M [00:00<?, ?B/s]

data/test-00000-of-00001-8c7c51afc6d4598(…):   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/76256 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5103 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 76256
    })
    test: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 5103
    })
})

In [None]:
from datasets import DatasetDict
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': dataset['test']
})

dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 76256
    })
    validation: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 5103
    })
})

In [None]:
train_ds = dataset['train']
train_ds[0]

{'prompt': '\n\nHuman: Can you describe the steps to clean fingerprints and smudges from a laptop screen\n\nAssistant: Yes, certainly. To clean your screen, you first need to use a microfiber cloth or soft, damp cloth to gently wipe down the surface of the screen. Next, you’ll want to grab a soft, lint-free, microfiber cleaning cloth and gently rub it back and forth across the screen to remove fingerprints and smudges.\n\nHuman: Can I spray isopropyl alcohol onto the cloth and clean it that way?\n\nAssistant:',
 'response': ' Yes, you can do that to help the cloth pick up even more dirt from the screen. Be sure to always use a clean, soft cloth, not a piece of scratchy, roughened, or textured material, and make sure it’s lint-free.',
 'chosen': ' Yes, you can do that to help the cloth pick up even more dirt from the screen. Be sure to always use a clean, soft cloth, not a piece of scratchy, roughened, or textured material, and make sure it’s lint-free.',
 'rejected': ' Yes, you can spr

In [None]:
train_ds[1]

{'prompt': '\n\nHuman: What are some foods that are good for diabetics?\n\nAssistant: To be honest, some of these are better than others, and they’re a little more like opinions than facts. For example, many of the diets say to limit vegetables with high sugar content, and there’s some debate on the subject, as far as how much of these vegetables are actually bad for diabetics.\n\nHuman: Okay, any other advice?\n\nAssistant:',
 'response': ' What exactly are you asking? There’s a lot of different kinds of diabetic diets. I could try to recommend you some specific foods and recipes. I could help you look up any of the foods, and I could find recipes for them.',
 'chosen': ' What exactly are you asking? There’s a lot of different kinds of diabetic diets. I could try to recommend you some specific foods and recipes. I could help you look up any of the foods, and I could find recipes for them.',
 'rejected': ' Sure, we’ve got information on common mistakes that diabetic patients make with 

In [None]:
val_ds = dataset['validation']
val_ds[0]

{'prompt': "\n\nHuman: I am trying to write a fairy tale. What is the most popular plot?\n\nAssistant: The most popular plot might be “a princess goes to a faraway land, falls in love, and has a magic genie that grants her wishes”.  We can find plenty of examples of this if we search for fairy tales using the search engine Google.\n\nHuman: You can't look anything up on google.\n\nAssistant: OK, so let’s think about some other popular fairy tales.\n\nHuman: Do you like the plot of Hansel and Gretel?\n\nAssistant: Not particularly.\n\nHuman: what don't you like?\n\nAssistant: I feel like the children do very stupid things.\n\nHuman: Should I include a witch on my story?\n\nAssistant: I think a witch is a fairly common fairy tale element, so yes.  However, what kind of plotline would you like to pursue?\n\nHuman: I was thinking I might pursue a feminist plot for a more modern fairytale.\n\nAssistant: I would be happy to help you make your story a more feminist one.  Are you writing a new

In [None]:
def compute_lengths(example):
    chosen_tokens = tokenizer(example['prompt'] + example['chosen']).input_ids
    rejected_tokens = tokenizer(example['prompt'] + example['rejected']).input_ids
    return {'length': max(len(chosen_tokens), len(rejected_tokens))}

lengths = train_ds.map(compute_lengths)
max_len = max(lengths['length'])
print("Max token length in train set:", max_len)

Map:   0%|          | 0/76256 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1042 > 1024). Running this sequence through the model will result in indexing errors


Max token length in train set: 3411


In [None]:
lengths = val_ds.map(compute_lengths)
max_len = max(lengths['length'])
print("Max token length in validation set:", max_len)

Map:   0%|          | 0/5103 [00:00<?, ? examples/s]

Max token length in validation set: 1015


In [None]:
def compute_length(example):
    chosen_tokens = tokenizer(example['prompt'] + example['chosen']).input_ids
    rejected_tokens = tokenizer(example['prompt'] + example['rejected']).input_ids
    return max(len(chosen_tokens), len(rejected_tokens))

In [None]:
train_ds = train_ds.filter(lambda x: compute_length(x) <= 1024)
print(train_ds.num_rows)

Filter:   0%|          | 0/76256 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1042 > 1024). Running this sequence through the model will result in indexing errors


76236


In [None]:
val_ds = val_ds.filter(lambda x: compute_length(x) <= 1024)
print(val_ds.num_rows)

Filter:   0%|          | 0/5103 [00:00<?, ? examples/s]

5103


In [None]:
max_length = 1024

def tokenize_batch(batch):
    chosen_texts = [p + c for p, c in zip(batch["prompt"], batch["chosen"])]
    rejected_texts = [p + r for p, r in zip(batch["prompt"], batch["rejected"])]

    chosen = tokenizer(
        chosen_texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    rejected = tokenizer(
        rejected_texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

    return {
        "chosen_input_ids": chosen["input_ids"],
        "chosen_attention_mask": chosen["attention_mask"],
        "rejected_input_ids": rejected["input_ids"],
        "rejected_attention_mask": rejected["attention_mask"],
    }

tokenized_train_ds = train_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=1000,
    remove_columns=train_ds.column_names,
)

tokenized_val_ds = val_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=1000,
    remove_columns=val_ds.column_names,
)

Map:   0%|          | 0/76236 [00:00<?, ? examples/s]

Map:   0%|          | 0/5103 [00:00<?, ? examples/s]

In [None]:
print(tokenized_train_ds.column_names)

['chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask']


In [None]:
print(len(tokenized_train_ds[0]["chosen_input_ids"]))

1024


##Training the Reward Model

In [None]:
train_batch_size = 4
gradient_accumulation_steps = 16
num_epochs = 2
lr = 9e-6 #from InstructGPT paper
weight_decay = 0.0
warmup_steps = 100
save_dir = "./reward-model-accel"
checkpoint_prefix = "checkpoint"
save_steps = 200
logging_steps = 50
eval_steps = 200
fp16 = True
num_workers = 2
max_checkpoints = 2

In [None]:
from accelerate import Accelerator
accelerator = Accelerator(
    mixed_precision="fp16",
    gradient_accumulation_steps=gradient_accumulation_steps
)
device = accelerator.device
print("Running on", device)

Running on cuda


In [None]:
def ensure_dir(path):
    Path(path).mkdir(parents=True, exist_ok=True)

def save_training_state(save_dir: str, step: int, accelerator, optimizer, scheduler, scaler):
    save_dir = Path(save_dir)
    ckpt_dir = save_dir / f"{checkpoint_prefix}-{step}"
    ensure_dir(ckpt_dir)

    # Save PEFT adapter weights (preferred)
    model_to_save = accelerator.unwrap_model(model)
    peft_save_dir = ckpt_dir / "adapter"
    model_to_save.save_pretrained(peft_save_dir)

    if hasattr(model, "reward_head"):
        reward_head_path = ckpt_dir / "reward_head.pt"
        torch.save(model.reward_head.state_dict(), reward_head_path)

    # Save optimizer/scheduler/scaler states
    accelerator.save_state(str(ckpt_dir / "acc_state"))

    # Save meta info
    meta = {"step": step}
    (ckpt_dir / "meta.json").write_text(json.dumps(meta))

    print(f"Saved checkpoint to {ckpt_dir}")

    # --- Delete old checkpoints if more than MAX_CHECKPOINTS ---
    all_ckpts = sorted(glob.glob(str(save_dir / f"{checkpoint_prefix}-*")),
                       key=lambda x: Path(x).stat().st_mtime)
    while len(all_ckpts) > max_checkpoints:
        old_ckpt = Path(all_ckpts.pop(0))
        print(f"Deleting old checkpoint: {old_ckpt}")
        shutil.rmtree(old_ckpt)


In [None]:
def collate_fn(batch):
    chosen_input_ids = torch.tensor([b["chosen_input_ids"] for b in batch], dtype=torch.long)
    chosen_attention_mask = torch.tensor([b["chosen_attention_mask"] for b in batch], dtype=torch.long)
    rejected_input_ids = torch.tensor([b["rejected_input_ids"] for b in batch], dtype=torch.long)
    rejected_attention_mask = torch.tensor([b["rejected_attention_mask"] for b in batch], dtype=torch.long)
    return {"chosen_input_ids": chosen_input_ids, "chosen_attention_mask": chosen_attention_mask, "rejected_input_ids": rejected_input_ids, "rejected_attention_mask": rejected_attention_mask}

In [None]:
ensure_dir(save_dir)

In [None]:
writer = SummaryWriter(log_dir=os.path.join(save_dir, "tensorboard"))

In [None]:
model.gradient_checkpointing_enable()

In [None]:
for n, p in model.named_parameters():
    if p.requires_grad:
        p.data = p.data.to(accelerator.device)  # ensure params are on same device

In [None]:
train_dataloader = DataLoader(tokenized_train_ds, shuffle=True, collate_fn=collate_fn,batch_size=train_batch_size, num_workers=num_workers)
eval_dataloader = DataLoader(tokenized_val_ds, shuffle=False, collate_fn=collate_fn, batch_size=train_batch_size, num_workers=num_workers)

In [None]:
no_decay = ["bias", "LayerNorm.weight"]
param_groups = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
optimizer = AdamW(param_groups, lr=lr)

In [None]:
total_train_steps = math.ceil(len(train_dataloader) * num_epochs / gradient_accumulation_steps)

In [None]:
from transformers import get_cosine_schedule_with_warmup

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_train_steps,
    num_cycles=0.5,
)

In [None]:
model, optimizer, train_dataloader, eval_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, scheduler
    )

In [None]:
starting_step = 0
scaler = torch.cuda.amp.GradScaler(enabled=(accelerator.state.mixed_precision == "fp16"))

  scaler = torch.cuda.amp.GradScaler(enabled=(accelerator.state.mixed_precision == "fp16"))


In [None]:
print(starting_step)

0


In [None]:
global_step = starting_step
model.train()
print("***** Running training *****")
print(f"  Num examples = {len(tokenized_train_ds)}")
print(f"  Num Epochs = {num_epochs}")
print(f"  Instantaneous batch size per device = {train_batch_size}")
print(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")
print(f"  Total optimization steps = {total_train_steps}")

***** Running training *****
  Num examples = 76236
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Gradient Accumulation steps = 16
  Total optimization steps = 2383


In [None]:
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for step, batch in enumerate(train_dataloader):

        #in case we are resuming training
        accumulated_steps = (epoch * len(train_dataloader) + step) // gradient_accumulation_steps
        if accumulated_steps < starting_step:
            continue

        # batch: dict of tensors
        with accelerator.accumulate(model):
            # forward with amp context
            with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
                chosen_outputs = model(
                    input_ids=batch["chosen_input_ids"],
                    attention_mask=batch["chosen_attention_mask"],
                    output_hidden_states=True
                )
                rejected_outputs = model(
                    input_ids=batch["rejected_input_ids"],
                    attention_mask=batch["rejected_attention_mask"],
                    output_hidden_states=True
                )
                chosen_last_idx = batch["chosen_attention_mask"].sum(dim=1) - 1
                rejected_last_idx = batch["rejected_attention_mask"].sum(dim=1) - 1
                batch_indices = torch.arange(chosen_outputs.hidden_states[-1].size(0), device=chosen_outputs.hidden_states[-1].device)

                chosen_last_hidden = chosen_outputs.hidden_states[-1][batch_indices, chosen_last_idx, :]
                rejected_last_hidden = rejected_outputs.hidden_states[-1][batch_indices, rejected_last_idx, :]

                chosen_rewards = model.reward_head(chosen_last_hidden).squeeze(-1)
                rejected_rewards = model.reward_head(rejected_last_hidden).squeeze(-1)

                loss = -torch.log(torch.sigmoid(chosen_rewards - rejected_rewards)).mean()

                loss = loss / gradient_accumulation_steps

            accelerator.backward(loss)
            epoch_loss += loss.item() * gradient_accumulation_steps  # un-scaled per-batch loss

            # gradient clipping on unwrapped model parameters (PEFT might wrap)
            if accelerator.sync_gradients:
                clip_grad_norm_(model.parameters(), max_norm=1.0)

            # optimizer step handled by accelerator when accumulate finishes
            if accelerator.sync_gradients:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

                # Logging
                if global_step % logging_steps == 0:
                    # if (epoch == 2) :
                    #   adjusted_steps = step - (starting_step * gradient_accumulation_steps - (epoch * len(train_dataloader)))
                    # else:
                    #   adjusted_steps = step
                    adjusted_steps = step
                    avg_loss = epoch_loss * train_batch_size / ((adjusted_steps + 1) * train_batch_size) #epoch_loss is sum of per-batch mean losses
                    #(step + 1) * train_batch_size is number of samples seen so far
                    print(f"Epoch {epoch+1} Step {global_step} loss {avg_loss:.4f}")
                    writer.add_scalar("train/loss", avg_loss, global_step)
                    writer.add_scalar("train/lr", scheduler.get_last_lr()[0], global_step)

                # Evaluation
                if global_step % eval_steps == 0:
                    model.eval()
                    total_eval_loss = 0.0
                    batches = 0
                    accuracy = 0.0
                    for eval_batch in eval_dataloader:
                        with torch.no_grad():
                            with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
                                chosen_outputs = model(
                                    input_ids=batch["chosen_input_ids"],
                                    attention_mask=batch["chosen_attention_mask"],
                                    output_hidden_states=True
                                )
                                rejected_outputs = model(
                                    input_ids=batch["rejected_input_ids"],
                                    attention_mask=batch["rejected_attention_mask"],
                                    output_hidden_states=True
                                )
                                chosen_last_idx = batch["chosen_attention_mask"].sum(dim=1) - 1
                                rejected_last_idx = batch["rejected_attention_mask"].sum(dim=1) - 1
                                batch_indices = torch.arange(chosen_outputs.hidden_states[-1].size(0), device=chosen_outputs.hidden_states[-1].device)

                                chosen_last_hidden = chosen_outputs.hidden_states[-1][batch_indices, chosen_last_idx, :]
                                rejected_last_hidden = rejected_outputs.hidden_states[-1][batch_indices, rejected_last_idx, :]

                                chosen_rewards = model.reward_head(chosen_last_hidden).squeeze(-1)
                                rejected_rewards = model.reward_head(rejected_last_hidden).squeeze(-1)

                                loss = -torch.log(torch.sigmoid(chosen_rewards - rejected_rewards)).mean()

                                accuracy += (chosen_rewards > rejected_rewards).float().mean()

                                total_eval_loss += loss.item()
                                batches += 1
                    avg_eval_loss = total_eval_loss / batches
                    accuracy = accuracy / batches
                    print(f"*** Eval at step {global_step}: loss {avg_eval_loss:.4f}")
                    print(f"*** Accuracy at step {global_step}: accuracy {accuracy:.4f}")
                    writer.add_scalar("eval/loss", avg_eval_loss, global_step)
                    model.train()

                # Save checkpoint
                if global_step % save_steps == 0:
                    # Save PEFT adapter and accelerator state
                    save_training_state(save_dir, global_step, accelerator, optimizer, scheduler, scaler)

    # epoch end
    # Save checkpoint at epoch end
    save_training_state(save_dir, f"epoch-{epoch+1}", accelerator, optimizer, scheduler, scaler)

# final save
save_training_state(save_dir, f"final-{global_step}", accelerator, optimizer, scheduler, scaler)
print("Training complete")
writer.close()

  with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch 1 Step 50 loss 1.0223
Epoch 1 Step 100 loss 1.0031
Epoch 1 Step 150 loss 0.9963
Epoch 1 Step 200 loss 0.9835


  with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):


*** Eval at step 200: loss 0.7077
*** Accuracy at step 200: accuracy 0.5000
Saved checkpoint to reward-model-accel/checkpoint-200
Epoch 1 Step 250 loss 0.9704
Epoch 1 Step 300 loss 0.9577
Epoch 1 Step 350 loss 0.9470
Epoch 1 Step 400 loss 0.9337




*** Eval at step 400: loss 0.6510
*** Accuracy at step 400: accuracy 0.5000
Saved checkpoint to reward-model-accel/checkpoint-400
Epoch 1 Step 450 loss 0.9236
Epoch 1 Step 500 loss 0.9131
Epoch 1 Step 550 loss 0.9027
Epoch 1 Step 600 loss 0.8926




*** Eval at step 600: loss 0.9065
*** Accuracy at step 600: accuracy 0.5000
Saved checkpoint to reward-model-accel/checkpoint-600
Deleting old checkpoint: reward-model-accel/checkpoint-200
Epoch 1 Step 650 loss 0.8825
Epoch 1 Step 700 loss 0.8748
Epoch 1 Step 750 loss 0.8656
Epoch 1 Step 800 loss 0.8598




*** Eval at step 800: loss 0.7774
*** Accuracy at step 800: accuracy 0.7500
Saved checkpoint to reward-model-accel/checkpoint-800
Deleting old checkpoint: reward-model-accel/checkpoint-400
Epoch 1 Step 850 loss 0.8535
Epoch 1 Step 900 loss 0.8473
Epoch 1 Step 950 loss 0.8409
Epoch 1 Step 1000 loss 0.8355




*** Eval at step 1000: loss 0.8490
*** Accuracy at step 1000: accuracy 0.2500
Saved checkpoint to reward-model-accel/checkpoint-1000
Deleting old checkpoint: reward-model-accel/checkpoint-600
Epoch 1 Step 1050 loss 0.8305
Epoch 1 Step 1100 loss 0.8255
Epoch 1 Step 1150 loss 0.8215




Saved checkpoint to reward-model-accel/checkpoint-epoch-1
Deleting old checkpoint: reward-model-accel/checkpoint-800
Epoch 2 Step 1200 loss 0.7021




*** Eval at step 1200: loss 0.6293
*** Accuracy at step 1200: accuracy 0.7500
Saved checkpoint to reward-model-accel/checkpoint-1200
Deleting old checkpoint: reward-model-accel/checkpoint-1000
Epoch 2 Step 1250 loss 0.7261
Epoch 2 Step 1300 loss 0.7290
Epoch 2 Step 1350 loss 0.7250
Epoch 2 Step 1400 loss 0.7239




*** Eval at step 1400: loss 0.8421
*** Accuracy at step 1400: accuracy 0.2500
Saved checkpoint to reward-model-accel/checkpoint-1400
Deleting old checkpoint: reward-model-accel/checkpoint-epoch-1
Epoch 2 Step 1450 loss 0.7209
Epoch 2 Step 1500 loss 0.7197
Epoch 2 Step 1550 loss 0.7190
Epoch 2 Step 1600 loss 0.7185




*** Eval at step 1600: loss 0.7821
*** Accuracy at step 1600: accuracy 0.7500
Saved checkpoint to reward-model-accel/checkpoint-1600
Deleting old checkpoint: reward-model-accel/checkpoint-1200
Epoch 2 Step 1650 loss 0.7174
Epoch 2 Step 1700 loss 0.7174
Epoch 2 Step 1750 loss 0.7173
Epoch 2 Step 1800 loss 0.7176




*** Eval at step 1800: loss 0.5481
*** Accuracy at step 1800: accuracy 0.5000
Saved checkpoint to reward-model-accel/checkpoint-1800
Deleting old checkpoint: reward-model-accel/checkpoint-1400
Epoch 2 Step 1850 loss 0.7172
Epoch 2 Step 1900 loss 0.7173
Epoch 2 Step 1950 loss 0.7166
Epoch 2 Step 2000 loss 0.7154




*** Eval at step 2000: loss 0.6420
*** Accuracy at step 2000: accuracy 0.7500
Saved checkpoint to reward-model-accel/checkpoint-2000
Deleting old checkpoint: reward-model-accel/checkpoint-1600
Epoch 2 Step 2050 loss 0.7151
Epoch 2 Step 2100 loss 0.7149
Epoch 2 Step 2150 loss 0.7145
Epoch 2 Step 2200 loss 0.7145




*** Eval at step 2200: loss 0.6690
*** Accuracy at step 2200: accuracy 0.5000
Saved checkpoint to reward-model-accel/checkpoint-2200
Deleting old checkpoint: reward-model-accel/checkpoint-1800
Epoch 2 Step 2250 loss 0.7145
Epoch 2 Step 2300 loss 0.7142
Epoch 2 Step 2350 loss 0.7142




Saved checkpoint to reward-model-accel/checkpoint-epoch-2
Deleting old checkpoint: reward-model-accel/checkpoint-2000




Saved checkpoint to reward-model-accel/checkpoint-final-2384
Deleting old checkpoint: reward-model-accel/checkpoint-2200
Training complete


In [None]:
model.eval()
total_eval_loss = 0.0
batches = 0
accuracy = 0.0
for eval_batch in eval_dataloader:
    with torch.no_grad():
        with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
            chosen_outputs = model(
                input_ids=batch["chosen_input_ids"],
                attention_mask=batch["chosen_attention_mask"],
                output_hidden_states=True
            )
            rejected_outputs = model(
                input_ids=batch["rejected_input_ids"],
                attention_mask=batch["rejected_attention_mask"],
                output_hidden_states=True
            )
            chosen_last_idx = batch["chosen_attention_mask"].sum(dim=1) - 1
            rejected_last_idx = batch["rejected_attention_mask"].sum(dim=1) - 1
            batch_indices = torch.arange(chosen_outputs.hidden_states[-1].size(0), device=chosen_outputs.hidden_states[-1].device)

            chosen_last_hidden = chosen_outputs.hidden_states[-1][batch_indices, chosen_last_idx, :]
            rejected_last_hidden = rejected_outputs.hidden_states[-1][batch_indices, rejected_last_idx, :]

            chosen_rewards = model.reward_head(chosen_last_hidden).squeeze(-1)
            rejected_rewards = model.reward_head(rejected_last_hidden).squeeze(-1)

            loss = -torch.log(torch.sigmoid(chosen_rewards - rejected_rewards)).mean()

            accuracy += (chosen_rewards > rejected_rewards).float().mean()

            total_eval_loss += loss.item()
            batches += 1
avg_eval_loss = total_eval_loss / batches
accuracy = accuracy / batches
print(f"*** Eval at step {global_step}: loss {avg_eval_loss:.4f}")
print(f"*** Accuracy at step {global_step}: accuracy {accuracy:.4f}")
model.train()

  with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):


*** Eval at step 2384: loss 0.6076
*** Accuracy at step 2384: accuracy 0.7500


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): GPT2LMHeadModel(
          (transformer): GPT2Model(
            (wte): Embedding(50257, 768)
            (wpe): Embedding(1024, 768)
            (drop): Dropout(p=0.1, inplace=False)
            (h): ModuleList(
              (0-11): 12 x GPT2Block(
                (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (attn): GPT2Attention(
                  (c_attn): lora.Linear(
                    (base_layer): Conv1D(nf=2304, nx=768)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=2304, bi

##Push to the Hugging Face Hub

In [13]:
ckpt_dir = "reward-model-accel/checkpoint-final-2384"
adapter_dir = f"{ckpt_dir}/adapter"

model.load_adapter(adapter_dir, adapter_name="default")

reward_head_path = f"{ckpt_dir}/reward_head.pt"
state_dict = torch.load(reward_head_path, map_location="cpu")
model.reward_head.load_state_dict(state_dict)


repo_name = "ArnavM3434/reward-model-gpt2"
#model.push_to_hub(repo_name)

In [14]:
from huggingface_hub import HfApi

repo_name = "ArnavM3434/reward-model-gpt2"
reward_head_path = "reward-model-accel/checkpoint-final-2384/reward_head.pt"

api = HfApi()
api.upload_file(
    path_or_fileobj=reward_head_path,
    path_in_repo="reward_head.pt",
    repo_id=repo_name,
    token=True
)

print("reward_head.pt uploaded to Hugging Face Hub")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...final-2384/reward_head.pt: 100%|##########| 5.00kB / 5.00kB            

reward_head.pt uploaded to Hugging Face Hub
