# **Optimization with Proximal Policy Optimization**

In [None]:
#!pip install tyro
#!pip install transformers
#!pip install torch
#!pip install datasets
#!pip install accelerate
#!pip install peft
#!pip install trl
#!pip install peft

## Import necessary modules and packages

In [None]:
# `from __future__` imports must be the first statement of the cell/module
from __future__ import annotations

# Standard library
from dataclasses import dataclass, field
from typing import Optional

# ML tasks
import torch
import tyro

# Data manipulation
import pandas as pd

# Data handling and modeling
from sklearn.model_selection import train_test_split
from accelerate import Accelerator


# TRL library for RL
from trl.core import LengthSampler
from trl import (
    RewardConfig,
    RewardTrainer,
    is_xpu_available,
    AutoModelForCausalLMWithValueHead,
    PPOConfig,
    PPOTrainer
)

# Libraries for NLP
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainerCallback,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    GPT2Tokenizer,
    pipeline
)

## Simple PPO Implementation: One Query


In [None]:
# 0 Set the device
# `device` is needed by the reward pipeline and by the model placement below,
# so it is defined in this cell; fall back to CPU when CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



# 1 Load a pretrained model

# The policy to optimise, plus a second copy loaded from the same checkpoint
# that is handed to the trainer as the reference model.
model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token as padding token

reward_model = "lvwerra/distilbert-imdb"
reward_pipe = pipeline("sentiment-analysis", reward_model, device=device)

# Move both models to the selected device instead of calling .cuda()
# unconditionally, which fails on CPU-only machines.
model.to(device)
model_ref.to(device)



# 2 Initialize trainer

# mini_batch_size is given explicitly so it matches the batch size of one.
# NOTE(review): some TRL releases default mini_batch_size to a value larger
# than 1, which PPOConfig rejects for batch_size=1 - confirm against the
# installed version.
ppo_config = {"mini_batch_size": 1, "batch_size": 1}
config = PPOConfig(**ppo_config)
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)



# 3 Encode a query

query_txt = "I love ice"
query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(model.pretrained_model.device)



# 4 Generate model response

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 20,
}

# The trainer is given a list of 1-D query tensors (one per row of
# `query_tensor`); return_prompt=False asks for the continuation only.
response_tensor = ppo_trainer.generate(list(query_tensor), return_prompt=False, **generation_kwargs)
response_txt = tokenizer.decode(response_tensor[0])



# 5 Define a reward for response

text = [query_txt + response_txt]

# Use sentiment analysis pipeline.
# `reward_pipe` already wraps the same checkpoint on the same device, so it is
# reused instead of loading the classifier a second time.
sentiment_pipe = reward_pipe
pipe_outputs = sentiment_pipe(text)

# Extract sentiment score.
# The pipeline reports the confidence of the *predicted* label, so the raw
# score is high for confidently negative text too. Convert it to the
# probability of the positive class so that a higher reward means a more
# positive response.
# NOTE(review): assumes a two-class model with labels "POSITIVE"/"NEGATIVE" -
# confirm against the checkpoint's label mapping.
reward = [
    torch.tensor(output["score"] if output["label"] == "POSITIVE" else 1.0 - output["score"])
    for output in pipe_outputs
]


rest_ppo = dict()
rest_ppo["query"] = query_txt
rest_ppo["response (RLHF)"] = response_txt
rest_ppo["scores (RLHF)"] = reward



# 6 store results in a dataframe
df_ppo_results = pd.DataFrame(rest_ppo)
df_ppo_results

## Apply the method to several queries

In [None]:
# 0 Set the device
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



# 1 Load a pretrained model

# The policy to optimise, plus a second copy loaded from the same checkpoint
# that is handed to the trainer as the reference model.
model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token as padding token

# Sentiment classifier used to score generated text
reward_model = "lvwerra/distilbert-imdb"
reward_pipe = pipeline("sentiment-analysis", reward_model, device=device)



# 2 Initialize trainer

# mini_batch_size is given explicitly so it matches the batch size of one.
# NOTE(review): some TRL releases default mini_batch_size to a value larger
# than 1, which PPOConfig rejects for batch_size=1 - confirm against the
# installed version.
ppo_config = {"mini_batch_size": 1, "batch_size": 1}
config = PPOConfig(**ppo_config)
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)



# 3 Define the query dataset

def build_dataset(tokenizer, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    """Build a dataset of short tokenized queries cut from review texts.

    Loads the ``train`` split of ``dataset_name``, renames its ``text`` column
    to ``review``, keeps only rows whose review is longer than 200 characters,
    and adds two columns to every row: ``input_ids`` (the leading tokens of
    the review) and ``query`` (those tokens decoded back to text).

    Args:
        tokenizer: Object providing ``encode`` and ``decode``.
        dataset_name: Name passed to ``load_dataset``.
        input_min_text_length: First argument of the ``LengthSampler`` that
            picks how many leading tokens are kept.
        input_max_text_length: Second argument of that ``LengthSampler``.

    Returns:
        The processed dataset with its output format set to ``"torch"``.
    """
    # A fresh length is drawn from this sampler for every row
    draw_query_length = LengthSampler(input_min_text_length, input_max_text_length)

    def _add_query(sample):
        # Encode the full review, then keep only the sampled number of tokens
        review_ids = tokenizer.encode(sample["review"])
        sample["input_ids"] = review_ids[: draw_query_length()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    reviews = load_dataset(dataset_name, split="train")
    reviews = reviews.rename_columns({"text": "review"})
    # Drop short reviews (200 characters or fewer)
    reviews = reviews.filter(lambda row: len(row["review"]) > 200, batched=False)
    reviews = reviews.map(_add_query, batched=False)
    reviews.set_format(type="torch")
    return reviews


# Build the tokenized query dataset with the default settings
dataset = build_dataset(tokenizer)

# Sampler configured with the response-length bounds
output_min_length, output_max_length = 4, 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)



# 4 Draw a random batch of queries from the dataset

bs = 16

# Switch to pandas output so the whole dataset can be sampled as a DataFrame
dataset.set_format("pandas")
df_batch = dataset[:].sample(n=bs)

# Collect the sampled query texts and their token ids
output_data = {"query": df_batch["query"].tolist()}
query_tensors = df_batch["input_ids"].tolist()



# 5 Generate model response

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 20,
}

# Build the sentiment pipeline once, outside the loop: `reward_pipe` already
# wraps the same checkpoint on the same device, so reuse it instead of
# reloading the classifier for every query.
sentiment_pipe = reward_pipe

# NOTE: `response_tensors` collects the decoded response *texts* (one string
# per query); the name is kept for compatibility with code that may read it.
rewards, response_tensors = [], []

for query_txt in output_data["query"]:
    query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(model.pretrained_model.device)

    # The trainer is given a list of 1-D query tensors; return_prompt=False
    # asks for the continuation only.
    response_tensor = ppo_trainer.generate(list(query_tensor), return_prompt=False, **generation_kwargs)
    response_txt = tokenizer.decode(response_tensor[0])

    response_tensors.append(response_txt)

    # Score the query together with its generated continuation
    text = [query_txt + response_txt]
    pipe_outputs = sentiment_pipe(text)

    # Compute the rewards.
    # The pipeline reports the confidence of the *predicted* label, so the raw
    # score is high for confidently negative text too. Convert it to the
    # probability of the positive class so that a higher reward means a more
    # positive response.
    # NOTE(review): assumes a two-class model with labels
    # "POSITIVE"/"NEGATIVE" - confirm against the checkpoint's label mapping.
    reward = [
        torch.tensor(output["score"] if output["label"] == "POSITIVE" else 1.0 - output["score"])
        for output in pipe_outputs
    ]
    rewards.append(reward)


output_data["response (RLHF)"] = response_tensors
output_data["scores (RLHF)"] = rewards



# 6 Store results in a dataframe and display them

df_ppo_results = pd.DataFrame(output_data)
df_ppo_results