## Load SFT and Reward Model

In [1]:
!pip install trl==0.11 transformers accelerate datasets torch




In [2]:
import os
import math
import json
from pathlib import Path
from typing import Dict, Any
import glob
import shutil
import wandb
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

from peft import LoraConfig, get_peft_model, PeftModel

from torch.utils.tensorboard import SummaryWriter

from transformers import get_linear_schedule_with_warmup

In [3]:
from trl import create_reference_model

In [4]:
# Colab-only: mount Google Drive so its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
workspace = 'PPO-Real'

In [6]:
import os

# Work from the project folder on Drive so relative paths resolve inside it.
project_dir = Path("/content/drive/My Drive") / workspace
os.chdir(project_dir)
print("Current working dir:", Path.cwd())

Current working dir: /content/drive/My Drive/PPO-Real


In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33marnavnmehta1[0m ([33marnavnmehta1-nutanix[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)

Using device: cuda


In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from trl import AutoModelForCausalLMWithValueHead

base_model = "gpt2"

In [10]:
# Load the tokenizer that belongs to the checkpoint named in `base_model`.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse EOS as the padding token so padded encode calls have a pad id to use
# (presumably because this tokenizer ships without one — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [37]:
# Load the SFT checkpoint from the Hub wrapped in TRL's causal-LM-with-value-head class.
# NOTE(review): the trainable-parameter listing printed later in this notebook shows
# `lora_*` weights, so this checkpoint appears to carry a PEFT/LoRA adapter — confirm.
policy_model = AutoModelForCausalLMWithValueHead.from_pretrained("ArnavM3434/gpt2-alpaca-second-try")



In [13]:
policy_model.to(device)
test_prompts = [
    "What is machine learning?",
    "Explain reinforcement learning",
    "How does PPO work?",
]

# Deterministic beam-search settings for a quick qualitative look at the SFT policy.
# They are loop-invariant, so they are built once instead of per prompt.
smoke_test_generation_kwargs = dict(
    max_new_tokens=512,
    num_beams=5,
    early_stopping=True,
    no_repeat_ngram_size=3,
    length_penalty=1.0,
    do_sample=False,
    # Passing the pad id explicitly avoids the per-call
    # "Setting `pad_token_id` to `eos_token_id`" warning.
    pad_token_id=tokenizer.eos_token_id,
)

for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = policy_model.generate(**inputs, **smoke_test_generation_kwargs)
    # The decoded text contains the prompt followed by the continuation.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}")
    print(f"Response: {response}\n")
    print("-" * 80)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: What is machine learning?
Response: What is machine learning?
Machine learning is a type of artificial intelligence (AI) that uses machine learning algorithms to analyze data and create predictive models. Machine learning algorithms can be used to identify patterns in data, predict patterns in the data, and then use those patterns to predict future outcomes. This type of machine learning can be applied to a wide range of industries, such as healthcare, transportation, and financial services.

--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: Explain reinforcement learning
Response: Explain reinforcement learning. Reinforcement learning is the process by which a neural network is trained to learn a task. It involves learning a set of inputs and outputs, and then training the network to perform the task. In reinforcement learning, the network learns from the input and outputs of the task, and when the network is able to perform a task correctly, it is rewarded with a reward. In other words, reinforcement learning can be used to train networks to perform complex tasks.

--------------------------------------------------------------------------------
Prompt: How does PPO work?
Response: How does PPO work?

PPO works by sending a message to the user via email or text message. The message is then sent to the PPO server, where it is processed and sent back to the sender. PPO can also be used to send messages to other users, such as by sending an email to a friend or family member, or sending a text message to someone else

In [12]:
# Build the PPO reference model from the current policy via TRL's helper.
# NOTE(review): presumably a frozen copy used for the KL penalty — confirm against the TRL docs.
ref_model = create_reference_model(policy_model)

In [13]:
# Make sure no reference-model weight can receive gradients.
for ref_weight in ref_model.parameters():
    ref_weight.requires_grad_(False)


In [14]:
def inspect_trainable_params(model):
    """Print a trainable-vs-total parameter summary and return the trainable names.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        list[str]: names of the parameters whose ``requires_grad`` is true,
        in iteration order. Only the first 20 are printed.
    """
    total = 0
    trainable = 0
    details = []
    for name, param in model.named_parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count
            details.append(name)
    # A model without parameters would otherwise divide by zero here.
    share = 100 * trainable / total if total else 0.0
    print(f"Trainable params: {trainable:,} / {total:,} ({share:.2f}%)")
    print("Example trainable params:", details[:20])
    return details

In [15]:
inspect_trainable_params(policy_model)

Trainable params: 769 / 129,374,753 (0.00%)
Example trainable params: ['v_head.summary.weight', 'v_head.summary.bias']


['v_head.summary.weight', 'v_head.summary.bias']

In [16]:
def make_trainable(model):
    """Enable gradients only for LoRA and value-head parameters.

    Every parameter whose name contains ``"lora_"`` or ``"v_head"`` gets
    ``requires_grad = True``; every other parameter gets ``requires_grad = False``.
    """
    trainable_markers = ("lora_", "v_head")
    for param_name, weight in model.named_parameters():
        weight.requires_grad = any(marker in param_name for marker in trainable_markers)

In [38]:
make_trainable(policy_model)

In [18]:
inspect_trainable_params(policy_model)

Trainable params: 4,934,945 / 129,374,753 (3.81%)
Example trainable params: ['pretrained_model.base_model.model.transformer.wte.lora_embedding_A.default', 'pretrained_model.base_model.model.transformer.wte.lora_embedding_B.default', 'pretrained_model.base_model.model.transformer.wpe.lora_embedding_A.default', 'pretrained_model.base_model.model.transformer.wpe.lora_embedding_B.default', 'pretrained_model.base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight', 'pretrained_model.base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight', 'pretrained_model.base_model.model.transformer.h.0.attn.c_proj.lora_A.default.weight', 'pretrained_model.base_model.model.transformer.h.0.attn.c_proj.lora_B.default.weight', 'pretrained_model.base_model.model.transformer.h.0.mlp.c_proj.lora_A.default.weight', 'pretrained_model.base_model.model.transformer.h.0.mlp.c_proj.lora_B.default.weight', 'pretrained_model.base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight', 'pre

['pretrained_model.base_model.model.transformer.wte.lora_embedding_A.default',
 'pretrained_model.base_model.model.transformer.wte.lora_embedding_B.default',
 'pretrained_model.base_model.model.transformer.wpe.lora_embedding_A.default',
 'pretrained_model.base_model.model.transformer.wpe.lora_embedding_B.default',
 'pretrained_model.base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight',
 'pretrained_model.base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight',
 'pretrained_model.base_model.model.transformer.h.0.attn.c_proj.lora_A.default.weight',
 'pretrained_model.base_model.model.transformer.h.0.attn.c_proj.lora_B.default.weight',
 'pretrained_model.base_model.model.transformer.h.0.mlp.c_proj.lora_A.default.weight',
 'pretrained_model.base_model.model.transformer.h.0.mlp.c_proj.lora_B.default.weight',
 'pretrained_model.base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight',
 'pretrained_model.base_model.model.transformer.h.1.attn.c_attn.lora_B

Load Reward Model

In [19]:
# The classifier weights and its tokenizer come from the same Hub repository.
reward_model_id = "OpenAssistant/reward-model-deberta-v3-base"
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_id)
reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_id)

In [25]:
inspect_trainable_params(reward_model)

Trainable params: 184,422,913 / 184,422,913 (100.00%)
Example trainable params: ['deberta.embeddings.word_embeddings.weight', 'deberta.embeddings.LayerNorm.weight', 'deberta.embeddings.LayerNorm.bias', 'deberta.encoder.layer.0.attention.self.query_proj.weight', 'deberta.encoder.layer.0.attention.self.query_proj.bias', 'deberta.encoder.layer.0.attention.self.key_proj.weight', 'deberta.encoder.layer.0.attention.self.key_proj.bias', 'deberta.encoder.layer.0.attention.self.value_proj.weight', 'deberta.encoder.layer.0.attention.self.value_proj.bias', 'deberta.encoder.layer.0.attention.output.dense.weight', 'deberta.encoder.layer.0.attention.output.dense.bias', 'deberta.encoder.layer.0.attention.output.LayerNorm.weight', 'deberta.encoder.layer.0.attention.output.LayerNorm.bias', 'deberta.encoder.layer.0.intermediate.dense.weight', 'deberta.encoder.layer.0.intermediate.dense.bias', 'deberta.encoder.layer.0.output.dense.weight', 'deberta.encoder.layer.0.output.dense.bias', 'deberta.encoder.lay

['deberta.embeddings.word_embeddings.weight',
 'deberta.embeddings.LayerNorm.weight',
 'deberta.embeddings.LayerNorm.bias',
 'deberta.encoder.layer.0.attention.self.query_proj.weight',
 'deberta.encoder.layer.0.attention.self.query_proj.bias',
 'deberta.encoder.layer.0.attention.self.key_proj.weight',
 'deberta.encoder.layer.0.attention.self.key_proj.bias',
 'deberta.encoder.layer.0.attention.self.value_proj.weight',
 'deberta.encoder.layer.0.attention.self.value_proj.bias',
 'deberta.encoder.layer.0.attention.output.dense.weight',
 'deberta.encoder.layer.0.attention.output.dense.bias',
 'deberta.encoder.layer.0.attention.output.LayerNorm.weight',
 'deberta.encoder.layer.0.attention.output.LayerNorm.bias',
 'deberta.encoder.layer.0.intermediate.dense.weight',
 'deberta.encoder.layer.0.intermediate.dense.bias',
 'deberta.encoder.layer.0.output.dense.weight',
 'deberta.encoder.layer.0.output.dense.bias',
 'deberta.encoder.layer.0.output.LayerNorm.weight',
 'deberta.encoder.layer.0.output

In [20]:
reward_model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [21]:
reward_model.eval()

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [22]:
# The reward model is only used for scoring, so switch off gradients on every weight.
for reward_weight in reward_model.parameters():
    reward_weight.requires_grad_(False)

## Prompt Dataset

In [23]:
from datasets import load_dataset

# Load the "train" split of the tatsu-lab/alpaca dataset from the Hub.
dataset = load_dataset("tatsu-lab/alpaca", split="train")

# Display the first record to see which fields are available.
dataset[0]

{'instruction': 'Give three tips for staying healthy.',
 'input': '',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}

In [24]:
dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})

In [25]:
# Drop the answer columns ("output", "text"); only the prompt fields are kept.
# Filtering on `column_names` makes the cell idempotent: re-running it after the
# columns are already gone no longer raises.
dataset = dataset.remove_columns(
    [column for column in ("output", "text") if column in dataset.column_names]
)

In [26]:
dataset[0]

{'instruction': 'Give three tips for staying healthy.', 'input': ''}

In [27]:
def tokenize(sample):
    """Attach the formatted prompt and its fixed-length token ids to one record.

    Adds ``query`` (the "Human: ... Assistant: " prompt string) and
    ``input_ids`` (that prompt encoded by the notebook-level ``tokenizer``,
    padded/truncated to 128 tokens), then returns the same record.
    """
    prompt = f"Human: {sample['instruction']} {sample['input']} Assistant: "
    token_ids = tokenizer.encode(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    sample["query"] = prompt
    sample["input_ids"] = token_ids
    return sample


# Map record by record; the raw text columns are dropped once the prompt exists.
tokenized_dataset = dataset.map(
    tokenize,
    remove_columns=["instruction", "input"],
    batched=False,
)

In [28]:
tokenized_dataset[0]['query']

'Human: Give three tips for staying healthy.  Assistant: '

## PPO

In [29]:
from trl import PPOTrainer, PPOConfig
import numpy as np

In [40]:
# PPO hyperparameters, grouped by the part of the algorithm they appear to affect.
config = PPOConfig(
    model_name="gpt2",
    log_with="wandb",
    # Batching
    batch_size=4,
    mini_batch_size=2,
    gradient_accumulation_steps=1,
    ppo_epochs=2,
    # Optimiser and loss shaping
    learning_rate=5e-7,
    cliprange=0.05,
    cliprange_value=0.1,
    vf_coef=0.1,
    # KL-related settings
    # NOTE(review): in trl 0.x `target_kl` looks like the early-stopping threshold,
    # while the adaptive KL controller's goal is the separate `target` field —
    # confirm which one was intended here.
    init_kl_coef=0.5,
    adap_kl_ctrl=True,
    target_kl=2.0,
    # Advantage estimation
    gamma=0.99,
    lam=0.95,
)



In [41]:
# Wire the policy, the reference model, the prompt dataset and the tokenizer into the trainer.
ppo_trainer = PPOTrainer(
    config=config,
    model=policy_model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=tokenized_dataset,
)



0,1
env/reward_mean,▂▁▁▃▄▃▃▄▄▃▄▄▃▂▃▂▅▄█▃▄▂▃▃▂▃▃▃▇▂▆▆▁
env/reward_std,▄▅▄▄▄▅▅▅▆▃▃▅▅▅▄▅█▅▆▃▆▃▅▅▅▃▅▅▅▅▁▅▅
objective/entropy,▅▃▅▃█▄▆▅▅▃▅▄▄▅▂█▇▃▄▄▆▄▄▇▂▃▅▆▄▁▄▇▅
objective/kl,▇█▄▅▂▃▇▅▄▄▅▁▄▆▇▅▄▃▄▄▂▆▃▂▃▃▃▄▁▃▂█▃
objective/kl_coef,▁▂▃▃▄▃▃▃▄▄▅▆▅▅▆▆▇▇▆▇█▇█▇▆▆▅▄▅▄▃▂▃
ppo/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ppo/loss/policy,▂▅▁▂▂▂▂▂▂▂▂▁▂▂▃▂▂▂▂▂▂▂▃▂▂▂▁▂▂█▂▂▂
ppo/loss/total,▇▄▇▄▇▂▅▅▆▂▆▅▅▃▅▇▄▄▂▄▁▇█▃▆▃▂▄▇█▆▄▃
ppo/loss/value,▇▃█▅▇▃▅▅▇▃▆▆▅▃▅█▄▅▂▄▁▇█▄▆▄▂▅█▆▆▄▃
ppo/mean_non_score_reward,▃▁▅▅▇▆▃▅▆▅▅█▆▄▂▅▆▆▅▅▇▄▇▇▆▆▇▆█▆▇▃▆

0,1
env/reward_mean,-0.0
env/reward_std,1
objective/entropy,187.76193
objective/kl,4.40438
objective/kl_coef,0.10002
ppo/learning_rate,0.0
ppo/loss/policy,0.00049
ppo/loss/total,0.98053
ppo/loss/value,9.80045
ppo/mean_non_score_reward,-0.0079


In [32]:
def ensure_dir(path):
    """Create ``path`` as a directory, including missing parents; no-op if it already exists."""
    directory = Path(path)
    directory.mkdir(exist_ok=True, parents=True)


# Where checkpoints go and what the checkpoint sub-folder is called.
checkpoint_prefix = "checkpoint"
save_dir = "./ppo-model"

In [33]:
def save_training_state(model=None, output_dir=None, prefix=None):
    """Save the policy weights under ``<output_dir>/<prefix>/adapter``.

    Args:
        model: object exposing ``save_pretrained(path)``. Defaults to the
            notebook-level ``policy_model``.
        output_dir: root folder for checkpoints. Defaults to the
            notebook-level ``save_dir``.
        prefix: name of the checkpoint sub-folder. Defaults to the
            notebook-level ``checkpoint_prefix``.
    """
    # Use a distinct local name: assigning to `save_dir` inside the function
    # would make it a local variable and raise UnboundLocalError when it is read.
    root_dir = Path(save_dir if output_dir is None else output_dir)
    ckpt_dir = root_dir / f"{checkpoint_prefix if prefix is None else prefix}"
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    model_to_save = policy_model if model is None else model
    peft_save_dir = ckpt_dir / "adapter"
    model_to_save.save_pretrained(peft_save_dir)

    print(f"Saved checkpoint to {ckpt_dir}")

In [42]:
# Sampling configuration used for the PPO rollouts.
generation_kwargs = dict(
    do_sample=True,
    temperature=0.8,         # temperature for more stable sampling
    top_k=0.0,
    top_p=1.0,
    repetition_penalty=1.2,  # discourage repetition
    min_length=-1,
    max_new_tokens=128,      # kept small to prevent OOM
    pad_token_id=tokenizer.eos_token_id,
)

In [35]:
import warnings
import logging

warnings.filterwarnings("ignore", message=".*right-padding was detected.*")

In [43]:
from tqdm import tqdm
from torch.cuda.amp import autocast

# One PPO update per dataloader batch. tqdm wraps the dataloader itself (not an
# `enumerate` object) so the progress bar can show the total number of batches.
for batch in tqdm(ppo_trainer.dataloader):

    # `batch["input_ids"]` arrives as a list of tensors that has to be stacked and
    # transposed to get one row per sample (assumes the default collate produced
    # one tensor per token position — TODO confirm).
    query_matrix = torch.stack(batch["input_ids"]).transpose(0, 1)

    # Strip padding so every query holds only its real prompt tokens.
    # pad_token is the EOS token here, so this assumes prompts contain no EOS.
    query_tensors = [row[row != tokenizer.pad_token_id] for row in query_matrix]

    # Each returned sequence is the prompt followed by the sampled continuation.
    full_sequences = ppo_trainer.generate(query_tensors, **generation_kwargs)

    # Keep only the continuation; fall back to a lone EOS if nothing was generated.
    response_only_tensors = []
    for query, full_sequence in zip(query_tensors, full_sequences):
        response_only = full_sequence[len(query):]
        if response_only.numel() == 0:
            print("Something is wrong, the response length is 0")
            response_only = torch.tensor([tokenizer.eos_token_id], device=full_sequence.device)
        response_only_tensors.append(response_only)

    batch["response"] = [tokenizer.decode(r) for r in response_only_tensors]

    # Score prompt + response with the reward model.
    # NOTE(review): the text is fed as one concatenated string; check the reward
    # model's card for the input format it expects (e.g. a question/answer pair).
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    with torch.no_grad():
        reward_inputs = reward_tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=384,
            return_tensors="pt",
        ).to(device)
        rewards = reward_model(**reward_inputs).logits.squeeze(-1)

    if torch.isnan(rewards).any():
        rewards = torch.nan_to_num(rewards, nan=0.0)
    # Per-batch standardisation, then clamp to keep outliers bounded.
    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
    rewards = rewards.clamp(-3, 3)
    rewards = list(rewards)  # the trainer expects one scalar tensor per sample

    # Score the responses against the same unpadded prompts they were sampled
    # from. Passing the max_length-padded queries would condition the policy and
    # reference log-probs on a different context than generation used.
    stats = ppo_trainer.step(query_tensors, response_only_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

# trl 0.11's PPOTrainer exposes `save_pretrained`; `save_model` belongs to the
# transformers Trainer API and is not available on this class.
ppo_trainer.save_pretrained("ppo_model")
save_training_state()

13it [01:13,  5.69s/it]


KeyboardInterrupt: 