<a href="https://colab.research.google.com/github/PanoEvJ/summarization_RLHF/blob/main/rlhf_PPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U git+https://github.com/lvwerra/trl.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U git+https://github.com/huggingface/peft.git

!pip install -q transformers==4.30
!pip install -q -U sentencepiece
!pip install -q huggingface_hub
!pip install -q tdqm torch>=0.3.0
!pip install -q -U bitsandbytes
!pip install -q -U wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.7 MB/s[0m eta [36

In [3]:
import os
import random
import wandb

from dataclasses import dataclass, field
from typing import Optional

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset as TorchDataset
from datasets import load_dataset, Dataset
from peft import (AutoPeftModelForCausalLM,
                  LoraConfig,
                  PeftConfig,
                  PeftModel,
                  TaskType
)
from tqdm import tqdm
from transformers import (AutoModelForSeq2SeqLM,
                          AutoModelForSequenceClassification,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          AutoTokenizer,
                          T5ForConditionalGeneration,
                          T5Tokenizer
)
from trl import (SFTTrainer,
                 PPOConfig,
                 PPOTrainer,
                 AutoModelForSeq2SeqLMWithValueHead,
                 create_reference_model,
                 set_seed
)
from trl.core import LengthSampler

In [None]:
from datasets import get_dataset_split_names

dataset_name = 'CarperAI/openai_summarize_comparisons'
get_dataset_split_names(dataset_name)

dataset = load_dataset('CarperAI/openai_summarize_comparisons', split='train').shuffle(seed=42)
ppo_dataset = Dataset(dataset['prompt'])

In [None]:
len(dataset)

In [None]:
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     bnb_8bit_use_double_quant=True,
#     bnb_8bit_quant_type="nf8",
#     bnb_8bit_compute_dtype=torch.bfloat16,
# )

In [None]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    # target_modules=["q", "k", "v"],
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    inference_mode=False,
)

In [None]:
ref_model_id = 't5-base'
policy_model_id = 'PanoEvJ/T5_base_SFT_summarization'

ref_model = AutoModelForSeq2SeqLM.from_pretrained(ref_model_id,
                                                  device_map="auto",
                                                  load_in_8bit=True,
                                                  # quantization_config=bnb_config,
                                                 )
policy_model = AutoModelForSeq2SeqLM.from_pretrained(policy_model_id,
                                                     device_map="auto",
                                                     load_in_8bit=True,
                                                    #  quantization_config=bnb_config,
                                                    #  peft_config=peft_config
                                                    )

In [None]:
# policy_model = PeftModel(policy_model, peft_config)

In [None]:
# see the available modules by printint out the model
print(policy_model)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(ref_model)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(ref_model_id,
                                          model_max_length=512,
                                          truncation=True,
                                          padding=True)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
reward_model = AutoModelForSequenceClassification.from_pretrained("JuanKO/rlhf")
reward_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
ppo_config = PPOConfig(
    steps=512,
    model_name=policy_model,
    learning_rate=1e-4,
    batch_size=128,
    mini_batch_size=16,
    gradient_accumulation_steps=1,
    optimize_cuda_cache=True,
    ppo_epochs=8,
    target_kl=0.1
)

In [81]:
ppo_trainer = PPOTrainer(
    config= ppo_config,
    model=policy_model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
)
dummy_dataloader = ppo_trainer.dataloader

ValueError: ignored

In [None]:
for query_tensor, response_tensor in dummy_dataloader:
    # define a reward for response
    # (this could be any reward such as human feedback or output from another model)
    reward = [torch.tensor(1.0), torch.tensor(0.0)]
    # train model
    train_stats = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
    break

In [None]:
    question_tensors = batch["input_ids"]

    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )
    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # Compute reward score (using the sentiment analysis pipeline)
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"] - script_args.reward_baseline) for output in pipe_outputs]

In [None]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    if epoch >= config.total_ppo_epochs:
        break


In [None]:
# Run PPO step
stats = ppo_trainer.step(question_tensors, response_tensors, rewards)

In [None]:
# Log stats to Wandb
ppo_trainer.log_stats(stats, batch, rewards)

In [None]:
from google.colab import drive

drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
model = AutoModelForSequenceClassification.from_pretrained("./rewards_model/")
tokenizer = AutoTokenizer.from_pretrained("./rewards_model/")

NameError: ignored