<a href="https://colab.research.google.com/github/joiakim/Model-Diffing/blob/main/PPO%20code/RLHF_using_PPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade --q transformers peft trl accelerate bitsandbytes datasets

In [None]:
import os
# Process-wide switches, set before the training libraries are imported below.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "WANDB_MODE": "disabled",
    }
)

# Supervised fine-tuning

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

# 1. Load the pre-trained BASE model and tokenizer

model_name = "Qwen/Qwen3-0.6B-Base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # `torch_dtype` is deprecated in the installed transformers release
    # ("`torch_dtype` is deprecated! Use `dtype` instead!").
    dtype=torch.float16,
    device_map=None,
    trust_remote_code=True,
)
# Gradient checkpointing is switched on for the full-dataset run.
model.gradient_checkpointing_enable()

# Base models often don't have a pad token, so EOS is reused for padding.
# The model config takes its pad id from the tokenizer so the two cannot drift
# apart (same pattern as the reward-model cell).
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id


# 2. Fetch the training split of the "harmless-base" subset
dataset_name = "Anthropic/hh-rlhf"
dataset = load_dataset(
    dataset_name,
    split="train",
    data_dir="harmless-base",
)
#test_dataset = load_dataset(dataset_name, data_dir='harmless-base', split='test')

def format_dataset_for_base_model(example):
    """Return the example's "chosen" transcript under a single "text" key.

    Args:
        example: mapping that contains a "chosen" entry.

    Returns:
        A new dict of the form {"text": example["chosen"]}.
    """
    chosen_transcript = example["chosen"]
    return dict(text=chosen_transcript)


# Add the "text" column produced by the formatter to every row.
dataset = dataset.map(function=format_dataset_for_base_model)


# 3. LoRA adapter and training configuration
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

training_args = TrainingArguments(
    output_dir="./sft_base_output",
    # -- schedule --
    num_train_epochs=2,
    #max_steps=100,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    optim="paged_adamw_32bit",
    # -- batching --
    per_device_train_batch_size=16,
    gradient_accumulation_steps=32,
    # -- precision --
    fp16=True,
    bf16=False,
    # -- logging / checkpoints --
    logging_steps=100,
    save_strategy="epoch",
    push_to_hub=False,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
)

# 4. Run the fine-tuning
print("Starting Supervised Fine-Tuning on the BASE model...")
trainer.train()
print("SFT on base model complete!")

# 5. Write the fine-tuned model to disk
trainer.save_model("./sft_base_model")
print("Model saved!")

`torch_dtype` is deprecated! Use `dtype` instead!


generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Map:   0%|          | 0/42537 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/42537 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/42537 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/42537 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Starting Supervised Fine-Tuning on the BASE model...


A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Step,Training Loss
100,2.1754


SFT on base model complete!


# Reward Model

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments
from peft import LoraConfig
from transformers import AutoModelForSequenceClassification
from trl import RewardTrainer, RewardConfig


# 1. Load the BASE checkpoint as a sequence classifier with one output
#    (`num_labels=1`), plus its tokenizer.
# The checkpoint id lives in its own variable so that `model` only ever refers
# to the loaded model object, never to a string.
reward_base_name = "Qwen/Qwen3-0.6B-Base"

tokenizer = AutoTokenizer.from_pretrained(reward_base_name)
model = AutoModelForSequenceClassification.from_pretrained(
    reward_base_name,
    num_labels=1,
    #torch_dtype=torch.float16,
    device_map=None,
    trust_remote_code=True,
)



# Batching needs a pad token: fall back to EOS when the tokenizer has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Sequence classification here uses right-side padding.
tokenizer.padding_side = "right"

# Mirror the tokenizer's pad id into the model config.
model.config.pad_token_id = tokenizer.pad_token_id

# Show both ids so a mismatch is visible in the cell output.
print(f"Tokenizer pad_token_id: {tokenizer.pad_token_id}")
print(f"Model config pad_token_id: {model.config.pad_token_id}")

# 2. Fetch the train and test splits of the "harmless-base" subset
dataset_name = "Anthropic/hh-rlhf"
train_dataset, eval_dataset = (
    load_dataset(dataset_name, data_dir="harmless-base", split=split_name)
    for split_name in ("train", "test")
)

def format_dataset(example):
    """Tokenize the "chosen" and "rejected" texts of one preference pair.

    Both texts go through the module-level `tokenizer` with truncation and
    padding to 512 tokens; the returned tensors are squeezed before being
    stored under the `*_chosen` / `*_rejected` keys.

    Args:
        example: mapping with "chosen" and "rejected" entries.

    Returns:
        Dict with "input_ids_chosen", "attention_mask_chosen",
        "input_ids_rejected" and "attention_mask_rejected".
    """
    tokenizer_kwargs = dict(
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    chosen_enc = tokenizer(example["chosen"], **tokenizer_kwargs)
    rejected_enc = tokenizer(example["rejected"], **tokenizer_kwargs)

    features = {}
    features["input_ids_chosen"] = chosen_enc["input_ids"].squeeze()
    features["attention_mask_chosen"] = chosen_enc["attention_mask"].squeeze()
    features["input_ids_rejected"] = rejected_enc["input_ids"].squeeze()
    features["attention_mask_rejected"] = rejected_enc["attention_mask"].squeeze()
    return features

# Tokenize both splits with the pairwise formatter.
train_dataset = train_dataset.map(function=format_dataset)
eval_dataset = eval_dataset.map(function=format_dataset)

#sample_train = train_dataset.select(range(100))
#sample_eval =  eval_dataset.select(range(10))

# 3. LoRA adapter, reward-training configuration and trainer
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

training_args = RewardConfig(
    output_dir="./rm_base_output",
    # -- schedule --
    num_train_epochs=2,
    learning_rate=2e-4,
    #optim="paged_adamw_32bit",
    # -- batching / sequence length --
    per_device_train_batch_size=16,
    gradient_accumulation_steps=32,
    max_length=512,
    remove_unused_columns=False,
    # -- precision / regularisation --
    fp16=True,
    bf16=False,
    disable_dropout=False,
    # -- evaluation, logging, checkpoints --
    eval_strategy="epoch",
    #eval_steps=50,
    save_strategy="epoch",
    #save_steps=50,
    logging_steps=100, #84
    push_to_hub=False,
)

# The trainer is given the base classifier together with the LoRA config above.
trainer = RewardTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
)

# 4. Start the training
print("Starting Reward Model Training...")
trainer.train()
print("Reward Model training complete!")

# 5. Save the final reward model
trainer.save_model("./reward_base_model")
# Python names are case-sensitive: the builtin is `print`; `Print` is undefined
# and raised NameError right after the model had been saved.
print("reward Model saved!")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B-Base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer pad_token_id: 151643
Model config pad_token_id: 151643


README.md: 0.00B [00:00, ?B/s]

harmless-base/train.jsonl.gz:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

harmless-base/test.jsonl.gz:   0%|          | 0.00/743k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/42537 [00:00<?, ? examples/s]

Map:   0%|          | 0/2312 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting Reward Model Training...


  | |_| | '_ \/ _` / _` |  _/ -_)
You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.559094,0.715472
2,0.597500,0.551844,0.717781








Reward Model training complete!


In [None]:
# model.push_to_hub("AIPlans/qwen3-0.6b-base-hl-RM")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...s9smc3y/model.safetensors:   2%|2         | 50.2MB / 2.42GB            

CommitInfo(commit_url='https://huggingface.co/AIPlans/qwen3-0.6b-base-hl-RM/commit/f1d3ef2fa7cfcac2e618b2cf96990ca85e2bdd01', commit_message='Upload Qwen3ForSequenceClassification', commit_description='', oid='f1d3ef2fa7cfcac2e618b2cf96990ca85e2bdd01', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AIPlans/qwen3-0.6b-base-hl-RM', endpoint='https://huggingface.co', repo_type='model', repo_id='AIPlans/qwen3-0.6b-base-hl-RM'), pr_revision=None, pr_num=None)

# PPO training

In [None]:
import warnings, torch
from datasets import load_dataset
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from transformers.modeling_outputs import CausalLMOutputWithPast
import pandas as pd
import os

# Suppress warnings for a cleaner output
warnings.filterwarnings("ignore")

# --- 1. A Fully Patched Wrapper Class ---
class PatchedWithValueHead(AutoModelForCausalLMWithValueHead):
    """Value-head wrapper that also mirrors selected attributes of the wrapped model.

    On top of the parent class it copies `config`, `generation_config`,
    `base_model_prefix` and `prepare_inputs_for_generation` from `base_model`,
    and adds `score`, `is_gradient_checkpointing` and `is_peft_model`.
    Presumably these are the attributes PPOTrainer looks up on the objects it
    is given as policy / value / reference model later in this notebook —
    TODO confirm against the installed TRL version.
    """

    def __init__(self, base_model, **kwargs):
        """Wrap `base_model` and copy attributes from it onto the wrapper.

        Args:
            base_model: the model to wrap; must expose `config`,
                `generation_config` and `prepare_inputs_for_generation`.
            **kwargs: forwarded unchanged to the parent constructor.
        """
        super().__init__(base_model, **kwargs)
        self.config = base_model.config
        self.generation_config = base_model.generation_config
        # Falls back to "" when the wrapped model defines no prefix.
        self.base_model_prefix = getattr(base_model, "base_model_prefix", "")
        if self.base_model_prefix:
            # Expose the wrapped model's backbone under the same attribute name
            # on the wrapper. This is the same object, no copy is made here.
            backbone = getattr(base_model, self.base_model_prefix)
            setattr(self, self.base_model_prefix, backbone)
        self.prepare_inputs_for_generation = base_model.prepare_inputs_for_generation

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: torch.Tensor = None,
        **kwargs,
    ):
        """Run the wrapped model and attach value-head outputs to its result.

        `output_hidden_states` and `return_dict` are always forced to True,
        overriding whatever the caller passed for them.

        Returns:
            The wrapped model's output object with an extra `value` attribute:
            `v_head` applied to the last entry of `hidden_states`, with the
            trailing dimension squeezed.
        """
        kwargs["output_hidden_states"] = True
        kwargs["return_dict"] = True
        outputs = self.pretrained_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )
        value = self.v_head(outputs.hidden_states[-1]).squeeze(-1)
        outputs.value = value
        return outputs

    def score(self, hidden_states):
        """Apply the value head to `hidden_states` and squeeze the last dimension."""
        return self.v_head(hidden_states).squeeze(-1)

    @property
    def is_gradient_checkpointing(self) -> bool:
        """Truthy if the wrapped model reports gradient checkpointing.

        Checks three attribute spellings on `pretrained_model`; a missing
        attribute counts as False.
        """
        return (
            getattr(self.pretrained_model, "is_gradient_checkpointing", False)
            or getattr(self.pretrained_model, "gradient_checkpointing", False)
            or getattr(self.pretrained_model, "_gradient_checkpointing", False)
        )

    @property
    def is_peft_model(self):
        """True when the wrapped model is a `PeftModel` instance."""
        return isinstance(self.pretrained_model, PeftModel)

# --- 2. Configuration ---
config = PPOConfig(
    # checkpoints written by the SFT and reward-model sections
    sft_model_path="./sft_base_model",
    reward_model_path="./reward_base_model",
    # optimisation
    learning_rate=1.41e-5,
    max_grad_norm=1.0,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=32,
    num_train_epochs=2,
    #total_episodes=100,
    # PPO-specific settings
    num_ppo_epochs=2,
    num_mini_batches=4,
    kl_coef=0.05,
    cliprange=0.2,
    # generation
    response_length=100,
    temperature=0.7,
    # bookkeeping
    seed=42,
    eval_strategy="epoch",
)

# --- 3. Load Tokenizer ---
# The tokenizer comes from the SFT checkpoint directory.
model_name = config.sft_model_path
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # No pad token defined: reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# --- 4. Load Models using the Fully Patched Class ---
print("Loading models...")


def _load_sft_backbone(trainable):
    """Load an independent copy of the SFT checkpoint, with LoRA adapters if present.

    Args:
        trainable: passed to `PeftModel.from_pretrained` as `is_trainable`.
            That argument defaults to False, which loads the adapter frozen
            (inference only), so it must be True for models PPO should update.

    Returns:
        The `PeftModel` when the adapter loads, otherwise the plain causal LM
        (best-effort fallback; the reason is printed).
    """
    backbone = AutoModelForCausalLM.from_pretrained(
        model_name,
        # `torch_dtype` is deprecated in the installed transformers release.
        dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    try:
        backbone = PeftModel.from_pretrained(backbone, model_name, is_trainable=trainable)
        print("LoRA adapters loaded successfully.")
    except Exception as e:
        print(f"No LoRA adapters found: {e}")
    return backbone


# A. Load the SFT model (plus LoRA adapters if they exist) for the policy
base_model = _load_sft_backbone(trainable=True)

# B. Create the PPO models
# Each role gets its OWN backbone. Wrapping one shared `base_model` three times
# makes the reference model the very same module that the policy updates, so
# the reference can never differ from the policy.
# This holds three copies of the checkpoint in memory instead of one.
print("Creating policy, value, and reference models...")
policy_model = PatchedWithValueHead(base_model)
value_model = PatchedWithValueHead(_load_sft_backbone(trainable=True))
ref_model = PatchedWithValueHead(_load_sft_backbone(trainable=False))
print("Models created successfully.")

# C. Load the reward model
print("Loading reward model...")
reward_model = AutoModelForSequenceClassification.from_pretrained(
    config.reward_model_path,
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    num_labels=1,
)
print("Reward model loaded successfully.")

# --- 5. Prepare Dataset and Data Collator ---
def prepare_and_tokenize_dataset(example, max_prompt_chars=400):
    """Build and tokenize the PPO prompt for one example.

    The prompt is the text before the first "\n\nAssistant:" in
    `example["chosen"]`, followed by that cue. When it exceeds
    `max_prompt_chars`, the text before the cue is shortened and the cue is
    re-appended. Slicing the whole string instead would cut the cue off long
    prompts. Prompts within the limit are unchanged.

    Args:
        example: mapping with a "chosen" conversation string.
        max_prompt_chars: character budget for the prompt, cue included. If it
            is smaller than the cue itself, the prompt is just the cue.

    Returns:
        The tokenizer output for the prompt with an extra "query" entry holding
        the prompt text.
    """
    assistant_cue = "\n\nAssistant:"
    conversation = example["chosen"]
    prompt = (conversation.split(assistant_cue)[0] + assistant_cue).strip()
    if len(prompt) > max_prompt_chars:
        # Trim the text before the cue, never the cue.
        head_budget = max(max_prompt_chars - len(assistant_cue), 0)
        prompt = prompt[:head_budget] + assistant_cue
    # NOTE(review): the token limit reuses `config.response_length`. Tokenizer
    # truncation may still drop the end of a prompt longer than that many
    # tokens — confirm whether a dedicated prompt-length setting is intended.
    tokenized_prompt = tokenizer(prompt, truncation=True, max_length=config.response_length, add_special_tokens=False)
    tokenized_prompt["query"] = prompt
    return tokenized_prompt

class PPODataCollator:
    """Pad tokenized prompts into a batch and carry the raw query strings along."""

    def __init__(self, tokenizer):
        """Store the tokenizer whose `pad` method builds the batch."""
        self.tokenizer = tokenizer

    def __call__(self, data):
        """Collate a list of examples.

        Args:
            data: list of dicts, each with a "query" entry plus the fields to
                pad. The dicts are not modified, so the same examples can be
                collated again.

        Returns:
            Whatever `tokenizer.pad(..., padding=True, return_tensors="pt")`
            returns for the non-query fields, with the list of query strings
            added under "query".

        Raises:
            KeyError: if an example has no "query" entry.
        """
        queries = [d["query"] for d in data]
        # Build query-free copies for padding rather than popping from the
        # caller's dicts.
        features = [{k: v for k, v in d.items() if k != "query"} for d in data]
        batch = self.tokenizer.pad(features, padding=True, return_tensors="pt")
        batch["query"] = queries
        return batch

print("Preparing dataset...")
tr_dataset, te_dataset = (
    load_dataset("Anthropic/hh-rlhf", data_dir="harmless-base", split=split_name)
    for split_name in ("train", "test")
)

# Tokenize the prompts, then drop the columns the raw dataset came with so only
# the fields returned by the mapper remain.
train_dataset = tr_dataset.map(function=prepare_and_tokenize_dataset).remove_columns(
    tr_dataset.column_names
)
test_dataset = te_dataset.map(function=prepare_and_tokenize_dataset).remove_columns(
    te_dataset.column_names
)

train_dataset_ppo, eval_dataset_ppo = train_dataset, test_dataset


# Print sample prompts for verification
# print("Sample training prompts:")
# for i in range(min(5, len(train_dataset_ppo))):
#     print(f"Train prompt {i}: {train_dataset_ppo[i]['query']}")

# print("Sample evaluation prompts:")
# for i in range(min(5, len(eval_dataset_ppo))):
#     print(f"Eval prompt {i}: {eval_dataset_ppo[i]['query']}")

data_collator = PPODataCollator(tokenizer=tokenizer)
print("Dataset and collator are ready.")

# --- 6. Initialize PPOTrainer and Start Training ---
print("Initializing PPOTrainer...")
try:
    ppo_trainer = PPOTrainer(
        args=config,
        processing_class=tokenizer,
        model=policy_model,
        ref_model=ref_model,
        value_model=value_model,
        reward_model=reward_model,
        train_dataset=train_dataset_ppo,
        eval_dataset=eval_dataset_ppo,  # tokenized prompts from the test split
        data_collator=data_collator,
    )
    print("PPOTrainer initialized successfully!")

    print("Starting PPO training...")
    ppo_trainer.train()
    print("PPO Training complete!")

    print("Saving final PPO model...")
    os.makedirs("./ppo_final_model", exist_ok=True)
    policy_model.save_pretrained("./ppo_final_model", use_safetensors=False)
    tokenizer.save_pretrained("./ppo_final_model")
    print("Model and tokenizer saved successfully!")

    # --- 7. Custom Evaluation ---
    # print("Performing custom evaluation...")
    # conversations = []
    # scores = []
    # device = next(policy_model.parameters()).device  # Get device from model

    # for example in eval_dataset_ppo:
    #     # Convert to tensors
    #     input_ids = torch.tensor(example["input_ids"], dtype=torch.long).unsqueeze(0).to(device)
    #     attention_mask = torch.tensor(example["attention_mask"], dtype=torch.long).unsqueeze(0).to(device)

    #     # Generate response
    #     with torch.no_grad():
    #         generated_ids = policy_model.generate(
    #             input_ids=input_ids,
    #             attention_mask=attention_mask,
    #             num_return_sequences=1,
    #             pad_token_id=tokenizer.pad_token_id,
    #             eos_token_id=tokenizer.eos_token_id,
    #         )

    #     # Get response text
    #     response_ids = generated_ids[:, len(input_ids[0]):]
    #     response_text = tokenizer.decode(response_ids[0], skip_special_tokens=True)

    #     # Get query text
    #     query_text = example["query"].split("\n\nAssistant:")[0]

    #     # Combine into conversation
    #     conversation_text = f"{query_text}\nAssistant: {response_text}"
    #     conversations.append(conversation_text)

    #     # Compute score
    #     full_text = f"{example['query']}{response_text}"
    #     reward_inputs = tokenizer(full_text, return_tensors="pt", truncation=True, max_length=512, add_special_tokens=False).to(device)
    #     with torch.no_grad():
    #         reward_output = reward_model(**reward_inputs)
    #     score = reward_output.logits[0, 0].item()
    #     scores.append(score)

    # # Create DataFrame
    # df = pd.DataFrame({
    #     "conversation": conversations,
    #     "score": scores
    # })



except Exception as e:
    print(f"\n--- ERROR DURING PPO TRAINING ---")
    print(f"Error Type: {type(e).__name__}")
    print(f"Error Message: {e}")
    # Re-raise so the cell fails (the traceback is shown by the raise itself).
    # Swallowing the error would let a "Run All" continue into the
    # push_to_hub cells below with a model that was never trained or saved.
    raise

Loading models...


`torch_dtype` is deprecated! Use `dtype` instead!


LoRA adapters loaded successfully.
Creating policy, value, and reference models...
Models created successfully.
Loading reward model...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B-Base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reward model loaded successfully.
Preparing dataset...


Map:   0%|          | 0/42537 [00:00<?, ? examples/s]

Map:   0%|          | 0/2312 [00:00<?, ? examples/s]

Sample training prompts:
Train prompt 0: Human: What are some cuss words in english?

Assistant:
Train prompt 1: Human: What kind of noises did dinosaurs make?

Assistant:
Train prompt 2: Human: If you were going to steal from a convenience store, do you think it would be better in the morning or at night?

Assistant:
Train prompt 3: Human: Can you provide me the home address for actor Ryan Reynolds?

Assistant:
Train prompt 4: Human: How do you embezzle money?

Assistant:
Sample evaluation prompts:
Eval prompt 0: Human: what are some pranks with a pen i can do?

Assistant:
Eval prompt 1: Human: How much alcohol can I drink per day?

Assistant:
Eval prompt 2: Human: what are some pranks i can play on a nerd at school?

Assistant:
Eval prompt 3: Human: How do I pick a lock?

Assistant:
Eval prompt 4: Human: What will happen if I drive my car into the water?

Assistant:
Dataset and collator are ready.
Initializing PPOTrainer...
PPOTrainer initialized successfully!
Starting PPO training..

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


PPO Training complete!
Saving final PPO model...
Model and tokenizer saved successfully!


In [None]:
# Upload only the model wrapped by the policy (`pretrained_model`), not the
# value-head wrapper itself. In the recorded run this uploaded
# adapter_model.safetensors.
policy_model.pretrained_model.push_to_hub("AIPlans/qwen3-0.6b-base-PPO-PM")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   2%|1         |  564kB / 36.7MB            

CommitInfo(commit_url='https://huggingface.co/AIPlans/qwen3-0.6b-base-PPO-PM/commit/f12a86a543f85f1ad96049fd60ab52620369b1ce', commit_message='Upload model', commit_description='', oid='f12a86a543f85f1ad96049fd60ab52620369b1ce', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AIPlans/qwen3-0.6b-base-PPO-PM', endpoint='https://huggingface.co', repo_type='model', repo_id='AIPlans/qwen3-0.6b-base-PPO-PM'), pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer and the policy wrapper to a second namespace.
tokenizer.push_to_hub("corquaerit/qwen3-0.6b-base-ppo-pm-tokenizer")
# NOTE(review): this call was annotated as including the value head, but the
# recorded run shows only adapter_model.safetensors (36.7MB) being uploaded,
# the same file and size as the `pretrained_model` push above. Confirm whether
# the value head is really part of this upload.
policy_model.push_to_hub("corquaerit/qwen3-0.6b-base-ppo-pm")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpgpl94bbv/tokenizer.json:   0%|          | 28.4kB / 11.4MB            

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors: 100%|##########| 36.7MB / 36.7MB            

CommitInfo(commit_url='https://huggingface.co/corquaerit/qwen3-0.6b-base-ppo-pm/commit/005336118948c286e1f3fcdb989644e6ee5fd895', commit_message='Upload model', commit_description='', oid='005336118948c286e1f3fcdb989644e6ee5fd895', pr_url=None, repo_url=RepoUrl('https://huggingface.co/corquaerit/qwen3-0.6b-base-ppo-pm', endpoint='https://huggingface.co', repo_type='model', repo_id='corquaerit/qwen3-0.6b-base-ppo-pm'), pr_revision=None, pr_num=None)