# **Optimization with Proximal Policy Optimization**

In [1]:
#!pip install tyro
#!pip install transformers
#!pip install torch
#!pip install datasets
#!pip install accelerate
#!pip install peft
#!pip install trl
#!pip install peft

Collecting tyro
  Downloading tyro-0.6.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m970.9 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting docstring-parser>=0.14.1 (from tyro)
  Downloading docstring_parser-0.15-py3-none-any.whl (36 kB)
Collecting shtab>=1.5.6 (from tyro)
  Downloading shtab-1.6.5-py3-none-any.whl (13 kB)
Installing collected packages: shtab, docstring-parser, tyro
Successfully installed docstring-parser-0.15 shtab-1.6.5 tyro-0.6.2
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [

## Import necessary modules and packages

In [2]:
# ML tasks
import torch
import tyro

# Data manipulation
import pandas as pd

# Data handling and modeling
from sklearn.model_selection import train_test_split
from dataclasses import dataclass, field
from typing import Optional
from __future__ import annotations
from accelerate import Accelerator


# TRL library for RL
from trl.core import LengthSampler
from trl import (
    RewardConfig,
    RewardTrainer,
    is_xpu_available,
    AutoModelForCausalLMWithValueHead,
    PPOConfig,
    PPOTrainer
)

# Libraries for NLP
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainerCallback,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    GPT2Tokenizer,
    pipeline
)

## PPO Simple Implementation :  One query


In [None]:
# 0 Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1 Load a pretrained model

model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

reward_model = "lvwerra/distilbert-imdb"
reward_pipe = pipeline("sentiment-analysis", reward_model , device=device)

model.cuda()
model_ref.cuda()



# 2 Initialize trainer

ppo_config = {"batch_size" : 1}
config = PPOConfig(**ppo_config)
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)



# 3 Encode a query

query_txt = "I love ice"
query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(model.pretrained_model.device)



# 4 Generate model response

generation_kwargs = {
    "min_length" : -1,
    "top_k" : 0.0,
    "top_p" : 1.0,
    "do_sample" : True,
    "pad_token_id" : tokenizer.eos_token_id,
    "max_new_tokens" : 20,
}

response_tensor = ppo_trainer.generate([item for item in query_tensor],return_prompt=False, **generation_kwargs)
response_txt = tokenizer.decode(response_tensor[0])



# 5 Define a reward for response

text = [query_txt + response_txt]

# Use sentiment analysis pipeline
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
pipe_outputs = sentiment_pipe(text)

# Extract sentiment score
reward = [torch.tensor(output["score"]) for output in pipe_outputs]


rest_ppo = dict()
rest_ppo["query"] = query_txt
rest_ppo["response (RLHF)"] = response_txt
rest_ppo["scores (RLHF)"] = reward



# 6 store results in a dataframe
df_ppo_results = pd.DataFrame(rest_ppo)
df_ppo_results

## Apply the method on several querys

In [3]:
# 0 Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



# 1 Load a pretrained model

model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

reward_model = "lvwerra/distilbert-imdb"
reward_pipe = pipeline("sentiment-analysis", reward_model , device=device)



# 2 Initialize trainer

ppo_config = {"batch_size" : 1}
config = PPOConfig(**ppo_config)
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)



# 3 Define the query dataset

def build_dataset(tokenizer, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    # load imdb with datasets
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds


dataset = build_dataset(tokenizer)

output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)



# 4 get a batch from the dataset

bs = 16
output_data = dict()
dataset.set_format("pandas")
df_batch = dataset[:].sample(bs)
output_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()



# 5 Generate model response

generation_kwargs = {
    "min_length" : -1,
    "top_k" : 0.0,
    "top_p" : 1.0,
    "do_sample" : True,
    "pad_token_id" : tokenizer.eos_token_id,
    "max_new_tokens" : 20,
}

rewards, response_tensors = [], []

for query in output_data['query']:
  query_txt = query
  query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(model.pretrained_model.device)

  response_tensor = ppo_trainer.generate([item for item in query_tensor],return_prompt=False, **generation_kwargs)
  response_txt = tokenizer.decode(response_tensor[0])

  response_tensors.append(response_txt)

  text = [query_txt + response_txt]

  # Utilize sentiment analysis pipeline
  sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
  pipe_outputs = sentiment_pipe(text)

  # Conpute the rewards
  reward = [torch.tensor(output["score"]) for output in pipe_outputs]
  rewards.append(reward)


output_data["response (RLHF)"] = response_tensors
output_data["scores (RLHF)"] = rewards



# 6 Store results in a dataframe and display them

df_ppo_results = pd.DataFrame(output_data)
df_ppo_results

df_ppo_results

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating unsupervised split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/24895 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors


Unnamed: 0,query,response (RLHF),scores (RLHF)
0,I found Horrorvision almost unw,ieldy. A cursory look around featured a sturdy...,[tensor(0.9488)]
1,I want to state first,", eventually that I'm very confident when I ta...",[tensor(0.9534)]
2,I have,"learned something very important, from my exp...",[tensor(0.9835)]
3,"Intelligent, wry","aura mage, like Icecrown. ] --- magma blue my...",[tensor(0.8610)]
4,I saw the German version of the,"playoff game here. So it was only natural, th...",[tensor(0.8664)]
5,"Great actors, an o",mbre nurse and upstanding police officers and ...,[tensor(0.9501)]
6,A terrible storyline,to the failures of the neoconservative Destru...,[tensor(0.9951)]
7,...that Jamie Foxx would,be tweeting six hours before then when at 2pm...,[tensor(0.5658)]
8,'Inter,"sectionalistical Analysis,"" IL-11-ClinicalTria...",[tensor(0.7776)]
9,I just don't see,"why there should be cat-dust.""\n\nKaley, 57, ...",[tensor(0.9519)]


## Dataset to csv

In [5]:
df_ppo_results.to_csv('df_ppo_results.csv', index=False, sep=';')