#  **Reward Modeling**

In [1]:
#!pip install tyro
#!pip install transformers
#!pip install torch
#!pip install datasets
#!pip install accelerate
#!pip install peft
#!pip install trl
#!pip install peft

## Import necessary modules and packages

In [None]:
# ML tasks
import torch
import tyro

# Data manipulation
import pandas as pd

# Data handling and modeling
from sklearn.model_selection import train_test_split
from dataclasses import dataclass, field
from typing import Optional
from __future__ import annotations
from accelerate import Accelerator


# TRL library for RL
from trl.core import LengthSampler
from trl import (
    RewardConfig,
    RewardTrainer,
    is_xpu_available,
    AutoModelForCausalLMWithValueHead,
    PPOConfig,
    PPOTrainer
)

# Libraries for NLP
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainerCallback,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    GPT2Tokenizer,
    pipeline
)

## Reward Model

In [None]:
@dataclass
class ScriptArguments:
    model_name: str = "facebook/opt-350m"
    """the model name"""
    dataset_name: str = "Anthropic/hh-rlhf"
    """the dataset name"""
    dataset_text_field: str = "text"
    """the text field of the dataset"""
    eval_split: str = "none"
    """the dataset split to evaluate on; default to 'none' (no evaluation)"""
    load_in_8bit: bool = False
    """load the model in 8 bits precision"""
    load_in_4bit: bool = False
    """load the model in 4 bits precision"""
    trust_remote_code: bool = True
    """Enable `trust_remote_code`"""
    reward_config: RewardConfig = field(
        default_factory=lambda: RewardConfig(
            output_dir="output",
            per_device_train_batch_size=8,  #Choosing th batch size as small as possible to be able to train the model
            num_train_epochs=1,
            gradient_accumulation_steps=16,
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={"use_reentrant": False},
            learning_rate=1.41e-5,
            report_to="tensorboard",
            remove_unused_columns=False,
            optim="adamw_torch",
            logging_steps=500,
            evaluation_strategy="no",
            max_length=512,
        )
    )


quantization_config = BitsAndBytesConfig(load_in_8bit=ScriptArguments.load_in_8bit, load_in_4bit=ScriptArguments.load_in_4bit)

# Step 1: Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    'facebook/opt-350m',
    num_labels=1,
)


# Step 2: Load the dataset and pre-process it
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
train_dataset = load_dataset("Anthropic/hh-rlhf", split="train[:1%]") #Reducing the size of the training set to be able to train the model


# Tokenize chosen/rejected pairs of inputs
# Adapt this section to your needs for custom datasets
def preprocess_function(examples):
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }
    for chosen, rejected in zip(examples["chosen"], examples["rejected"]):
        tokenized_chosen = tokenizer(chosen)
        tokenized_rejected = tokenizer(rejected)

        new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"])
        new_examples["attention_mask_chosen"].append(tokenized_chosen["attention_mask"])
        new_examples["input_ids_rejected"].append(tokenized_rejected["input_ids"])
        new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])

    return new_examples


# Step 3 :Preprocess the dataset and filter out examples that are longer than ScriptArguments.max_length
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)

train_dataset = train_dataset.filter(
    lambda x: len(x["input_ids_chosen"]) <= 512
    and len(x["input_ids_rejected"]) <= 512
)


# Step 4: Define the LoraConfig
peft_config = LoraConfig(
            r=16,
            lora_alpha=16,
            bias="none",
            task_type="SEQ_CLS",
            modules_to_save=["scores"],
        )

eval_dataset=None


# Step 5: Define the Trainer
trainer = RewardTrainer(
    model=model,
    tokenizer=tokenizer,
    args= RewardConfig(
            output_dir="output",
            per_device_train_batch_size=8,
            num_train_epochs=1,
            gradient_accumulation_steps=16,
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={"use_reentrant": False},
            learning_rate=1.41e-5,
            report_to="tensorboard",
            remove_unused_columns=False,
            optim="adamw_torch",
            logging_steps=500,
            evaluation_strategy="no",
            max_length=512,
        ),
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config
)



Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=4):   0%|          | 0/1608 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1608 [00:00<?, ? examples/s]

## Training Reward Model

In [None]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Could not estimate the number of tokens of the input, floating-point operations will not be computed


In [None]:
# 0 Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



# 1 Load a pretrained model

model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

reward_model = "lvwerra/distilbert-imdb"
reward_pipe = pipeline("sentiment-analysis", reward_model , device=device)



# 2 Initialize trainer

ppo_config = {"batch_size" : 1}
config = PPOConfig(**ppo_config)
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)



# 3 Define the query dataset

def build_dataset(tokenizer, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    # load imdb with datasets
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds


dataset = build_dataset(tokenizer)

output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)



# 4 get a batch from the dataset

bs = 16
output_data = dict()
dataset.set_format("pandas")
df_batch = dataset[:].sample(bs)
output_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()



# 5 Generate model response

generation_kwargs = {
    "min_length" : -1,
    "top_k" : 0.0,
    "top_p" : 1.0,
    "do_sample" : True,
    "pad_token_id" : tokenizer.eos_token_id,
    "max_new_tokens" : 20,
}

rewards, response_tensors = [], []

for query in output_data['query']:
  query_txt = query
  query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(model.pretrained_model.device)

  response_tensor = ppo_trainer.generate([item for item in query_tensor],return_prompt=False, **generation_kwargs)
  response_txt = tokenizer.decode(response_tensor[0])

  response_tensors.append(response_txt)

  text = [query_txt + response_txt]

  # Utilize sentiment analysis pipeline
  sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
  pipe_outputs = sentiment_pipe(text)

  # Conpute the rewards
  reward = [torch.tensor(output["score"]) for output in pipe_outputs]
  rewards.append(reward)


output_data["response (RLHF)"] = response_tensors
output_data["scores (RLHF)"] = rewards



# 6 Store results in a dataframe and display them

df_ppo_results = pd.DataFrame(output_data)
df_ppo_results

df_ppo_results


Unnamed: 0,query,response (RLHF),scores (RLHF)
0,Ossessione is,Involves you to continue replaying a high lev...,[tensor(0.9749)]
1,...this,shows you that you are able to reproduce your...,[tensor(0.9876)]
2,"The film,","based on popular sources, is based on a story...",[tensor(0.9276)]
3,Now I don't hate,"this talk, Alan, who you may know.)\n\nIn the...",[tensor(0.6281)]
4,I never thought I see,such a mood or arrogant man sitting there wit...,[tensor(0.9593)]
5,What can,"you say about it?"" A police agent says when E...",[tensor(0.5198)]
6,I have to,"to wait, 'how convenient is it?',"" she added,...",[tensor(0.5821)]
7,I really loved,it to six to seven then when we ran into him ...,[tensor(0.9797)]
8,I've,"seen the same,"" said Birtley of Kelly's recru...",[tensor(0.8294)]
9,I would have rated,it 5 out of 5-stars. Thank You. (5 of 5 stars...,[tensor(0.9565)]
