<a href="https://colab.research.google.com/github/AashiDutt/RLHF-projects/blob/main/RLHF_Finetuning_Toxicity_Evaluator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install peft
!pip install TRL

Collecting peft
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.25.0 peft-0.7.1
Collecting TRL
  Downloading trl-0.7.7-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from TRL)
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tyro>=0.5.11 (from TRL)
  Downloading tyro-0.6.2-py3-none-any.whl (78 kB)
[

In [None]:
!pip install evaluate # HF library

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification,AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# TRL : Transformer Reinforcement learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import evaluate
import torch

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Identifiers reused by later cells: the dialogue-summarization dataset on the
# Hugging Face Hub and the FLAN-T5 checkpoint name.
huggingface_dataset_name = "knkarthick/dialogsum"
model_name = "google/flan-t5-base"

# Load the dataset and show the resulting DatasetDict as the cell output.
dataset_original = load_dataset(path=huggingface_dataset_name)
dataset_original

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [None]:
def build_dataset(model_name, dataset_name, input_min_text_length, input_max_text_length):
  """Build the prompt-wrapped, tokenized train/test splits used for PPO.

  Loads the "train" split of `dataset_name`, keeps the rows whose "dialogue"
  satisfies `input_min_text_length < len(dialogue) <= input_max_text_length`,
  wraps every dialogue in a summarization instruction and encodes it with the
  tokenizer of `model_name`.

  Each resulting row gains two fields:
    * "input_ids": the encoded prompt
    * "query": the prompt decoded back from "input_ids"

  Returns:
    The value of `train_test_split(test_size=0.2, shuffle=False, seed=42)`
    applied to the processed dataset.
  """
  def dialogue_length_in_range(row):
    # Lower bound is exclusive, upper bound is inclusive.
    return input_min_text_length < len(row["dialogue"]) <= input_max_text_length

  def tokenize(sample):
    # Wrap the dialogue with the instruction; the whitespace is part of the prompt.
    prompt = f"\n    Summarize the following conversation.\n    {sample['dialogue']}\n    Summary:\n    "
    token_ids = tokenizer.encode(prompt)
    sample["input_ids"] = token_ids
    # "query" is the field name the PPO library expects.
    sample["query"] = tokenizer.decode(token_ids)
    return sample

  raw_train = load_dataset(dataset_name, split="train")
  in_range = raw_train.filter(dialogue_length_in_range, batched=False)
  tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

  # Tokenize row by row, then expose the columns as torch tensors.
  tokenized = in_range.map(tokenize, batched=False)
  tokenized.set_format(type="torch")

  # Unshuffled 80/20 split into "train" and "test".
  return tokenized.train_test_split(test_size=0.2, shuffle=False, seed=42)


# Keep dialogues whose length is above 200 and at most 1000, then tokenize and split.
dataset = build_dataset(
    model_name=model_name,
    dataset_name=huggingface_dataset_name,
    input_min_text_length=200,
    input_max_text_length=1000,
)




Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/10022 [00:00<?, ? examples/s]

In [None]:
def print_number_of_trainable_model_parameters(model):
  """Summarize how many of `model`'s parameters are trainable.

  Despite the name, nothing is printed: the summary is returned as a string so
  that callers can embed it in their own message.

  Args:
    model: any object whose `named_parameters()` yields `(name, param)` pairs,
      where each `param` provides `numel()` and a `requires_grad` flag.

  Returns:
    A multi-line string (starting with a newline) listing the trainable
    parameter count, the total parameter count and the trainable share as a
    percentage with two decimals.
  """
  trainable_model_params = 0
  all_model_params = 0

  for _, param in model.named_parameters():
    param_count = param.numel()
    all_model_params += param_count
    if param.requires_grad:
      trainable_model_params += param_count

  # A model without parameters would otherwise divide by zero.
  if all_model_params:
    trainable_percentage = 100 * trainable_model_params / all_model_params
  else:
    trainable_percentage = 0.0

  return (
      f"\ntrainable model parameters: {trainable_model_params}"
      f"\nall model parameters: {all_model_params}"
      f"\npercentage of trainable model parameters: {trainable_percentage:.2f}%"
  )

In [None]:
# ------------------------------------------------DO NOT RUN THIS CELL WITHOUT S3 CHECKPOINT------------------------------------------------

# LoRA: Low-Rank Adaptation of LLMs; r --> rank
lora_config = LoraConfig(
    r = 32,
    lora_alpha = 32,
    target_modules = ['q', 'v'],
    lora_dropout = 0.05,
    bias = "none",
    # The enum member is SEQ_2_SEQ_LM; the previous spelling (SEO_2_SEO_LM)
    # does not exist on TaskType and raised AttributeError.
    task_type = TaskType.SEQ_2_SEQ_LM,
)

# Base FLAN-T5 model in bfloat16; the adapter weights are loaded on top of it.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype = torch.bfloat16)

# "S3 CHECKPOINT" is a placeholder for the location of the fine-tuned PEFT adapter.
peft_model = PeftModel.from_pretrained(model, "S3 CHECKPOINT", lora_config = lora_config, torch_dtype = torch.bfloat16, device_map = "auto", is_trainable = True)

print(f'PEFT model parameters to be updated:\n{print_number_of_trainable_model_parameters(peft_model)}\n')

In [None]:
# ------------------------------------------------DO NOT RUN THIS CELL WITHOUT S3 CHECKPOINT------------------------------------------------

# Wrap the PEFT model in TRL's seq2seq class that adds a value head.
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
    peft_model,
    torch_dtype=torch.bfloat16,
    is_trainable=True,
)

# Report the trainable-parameter summary, then show the value head module itself.
print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)


In [None]:
# Reward model: a binary hate-speech classifier. Its tokenizer and weights are
# loaded from the same checkpoint name.
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"

toxicity_tokenizer = AutoTokenizer.from_pretrained(
    toxicity_model_name,
    device_map="auto",
)
toxicity_model = AutoModelForSequenceClassification.from_pretrained(
    toxicity_model_name,
    device_map="auto",
)

# Show which class index corresponds to which label.
print(toxicity_model.config.id2label)

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

{0: 'nothate', 1: 'hate'}


In [None]:
# Example: score a harmless sentence with the reward model.
non_toxic_text = "I want to kiss you"

toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors="pt")["input_ids"]
# The raw logits (not the probabilities) are what gets passed to PPO.
logits = toxicity_model(input_ids=toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Softmax over the class dimension turns the logits into probabilities.
probabilities = torch.softmax(logits, dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# The reward is the logit of the "not hate" class (index 0).
not_hate_index = 0
nothate_reward = logits[:, not_hate_index].tolist()
print(f'reward (high): {nothate_reward}')


logits [not hate, hate]: [4.657958030700684, -4.078615188598633]
probabilities [not hate, hate]: [0.9998394250869751, 0.000160577503265813]
reward (high): [4.657958030700684]


In [None]:
# Example: score a toxic sentence with the reward model.
toxic_text = "You are disgusting and terrible and i dang hate you"
# Tokenize the toxic sentence; the previous version tokenized `non_toxic_text`
# here, so this cell just repeated the numbers of the non-toxic example.
toxicity_input_ids = toxicity_tokenizer(toxic_text, return_tensors = "pt").input_ids
logits = toxicity_model(input_ids = toxicity_input_ids).logits # logits are passed to PPO
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# pass logits to softmax to get probabilities
probabilities = logits.softmax(dim = -1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# get logits for "not hate" -- this is the reward; it is expected to be lower
# than for the non-toxic example (`not_hate_index` comes from that earlier cell)
nothate_reward = (logits[:, not_hate_index]).tolist()
print(f'reward (low): {nothate_reward}')


logits [not hate, hate]: [4.657958030700684, -4.078615188598633]
probabilities [not hate, hate]: [0.9998394250869751, 0.000160577503265813]
reward (high): [4.657958030700684]


In [None]:
# Run the reward model through a pipeline: GPU 0 when CUDA is available, CPU otherwise.
device = 0 if torch.cuda.is_available() else "cpu"
sentiment_pipe = pipeline("sentiment-analysis", model = toxicity_model_name, device = device)

# Pipeline call options returning raw logits for every label.
reward_logits_kwargs = {
    "top_k": None,
    "function_to_apply": "none",
    "batch_size": 16
}

# Same options, but with softmax applied so the scores are probabilities.
reward_probabilities_kwargs = {
    "top_k": None,
    "function_to_apply": "softmax",
    "batch_size": 16
}

print("Reward model output for non-toxic text:")
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))
print("\nReward model output for toxic text:")
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
# Second line shows probabilities; it previously reused the logits options and
# printed the logits twice.
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

Reward model output for non-toxic text:
[{'label': 'nothate', 'score': 4.657958030700684}, {'label': 'hate', 'score': -4.078615188598633}]
[{'label': 'nothate', 'score': 0.9998394250869751}, {'label': 'hate', 'score': 0.00016057751781772822}]

Reward model output for toxic text:
[{'label': 'nothate', 'score': 0.7720897197723389}, {'label': 'hate', 'score': -0.9174708724021912}]
[{'label': 'nothate', 'score': 0.7720897197723389}, {'label': 'hate', 'score': -0.9174708724021912}]


In [None]:
# Toxicity measurement backed by the same classifier as the reward model;
# the "hate" label is the one counted as toxic. It is used to check whether
# fine-tuning reduces the toxicity of generated responses.
toxicity_evaluator = evaluate.load(
    "toxicity",
    toxicity_model_name,
    module_type="measurement",
    toxic_label="hate",
)


Downloading builder script:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

In [None]:
# Score both example sentences with the toxicity evaluator, one at a time.
for score_header, sample_text in (
    ("Toxicity score for non toxic text: ", non_toxic_text),
    ("\nToxicity score for toxic text: ", toxic_text),
):
    toxicity_score = toxicity_evaluator.compute(predictions=[sample_text])
    print(score_header)
    print(toxicity_score["toxicity"])

Toxicity score for non toxic text: 
[0.00016057751781772822]

Toxicity score for toxic text: 
[0.15583361685276031]


In [None]:
# Helper Function for toxicity calculation with mean and std

def evaluate_toxicity(model, toxicity_evaluator, tokenizer, dataset, num_samples):
  """Estimate the toxicity of `model`'s generations on the first samples of `dataset`.

  For each of the first `num_samples` samples, the sample's "query" is
  tokenized, a completion of up to 100 new tokens is sampled from `model`, and
  the text `query + " " + completion` is scored with `toxicity_evaluator`.

  Args:
    model: object exposing `generate(input_ids=..., generation_config=...)`.
    toxicity_evaluator: object whose `compute(predictions=[...])` returns a
      dict with a "toxicity" list.
    tokenizer: tokenizer used both to encode the query and to decode the output.
    dataset: iterable of samples, each providing a "query" entry.
    num_samples: number of samples to score.

  Returns:
    `(mean, std)` of the collected toxicity scores. With no scored samples
    (empty dataset or `num_samples <= 0`) both values are NaN, as produced by
    numpy on an empty list.
  """
  max_new_tokens = 100
  # The sampling settings are the same for every sample, so build them once.
  # top_k is an integer setting; 0 is the conventional "no top-k filtering" value
  # (the previous float 0.0 only worked by accident of comparison).
  generation_config = GenerationConfig(max_new_tokens = max_new_tokens, top_k = 0, top_p = 1.0, do_sample = True)
  toxicities = []

  for i, sample in tqdm(enumerate(dataset), total = num_samples):
    # `>=` scores exactly `num_samples` samples; the previous `>` scored one extra.
    if i >= num_samples:
      break

    input_text = sample["query"]
    input_ids = tokenizer(input_text, return_tensors = "pt", padding = True).input_ids
    response_token_ids = model.generate(input_ids = input_ids, generation_config = generation_config)
    generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens = True)

    # Score the prompt together with the generated text.
    toxicity_score = toxicity_evaluator.compute(predictions = [(input_text + " " + generated_text)])
    toxicities.extend(toxicity_score["toxicity"])

  mean = np.mean(toxicities)
  std = np.std(toxicities)

  return mean, std


In [None]:
# ------------------------------------------------DO NOT RUN WITHOUT AWS S3 ------------------------------------------------
# Tokenizer of the summarization model, reused by the PPO trainer later on.
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map = "auto")

# Baseline toxicity of the PPO model on the held-out split, before any PPO step.
mean_before_detox, std_before_detox = evaluate_toxicity(model = ppo_model,
                                                        toxicity_evaluator = toxicity_evaluator,
                                                        tokenizer = tokenizer,
                                                        dataset= dataset["test"],
                                                        num_samples = 10
                                                )
# Interpolate the two values separately; formatting them as one tuple printed
# "[(mean, std)]" instead of the intended "[mean, std]".
print(f'toxicity [mean, std] before detox: [{mean_before_detox}, {std_before_detox}]')

In [None]:
# ------------------------------------------------DO NOT RUN WITHOUT AWS S3 ------------------------------------------------
# Performing fine-tuning to detox the hate summaries

# Reference copy of the PPO model, created before training starts.
ref_model = create_reference_model(ppo_model)
# Fixed the misspelled "paarmeters" in the printed label.
print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

In [None]:
# ------------------------------------------------DO NOT RUN WITHOUT AWS S3 ------------------------------------------------

# PPO hyperparameters
learning_rate = 1.41e-5
max_ppo_epochs = 1
mini_batch_size = 4
batch_size = 16

config = PPOConfig(
    model_name = model_name,
    learning_rate = learning_rate ,
    # The PPOConfig field is `ppo_epochs`; the previous `ppo_eepochs` was a typo
    # and is rejected as an unexpected keyword argument.
    ppo_epochs = max_ppo_epochs,
    mini_batch_size = mini_batch_size,
    batch_size = batch_size
)

def collator(data):
  """Turn a list of per-sample dicts into a single dict of per-key lists.

  Args:
    data: list of dicts that share the keys of the first element.

  Returns:
    A dict mapping every key of `data[0]` to the list of that key's values
    across all samples, in sample order. An empty `data` yields `{}`.
  """
  # Nothing to collate; also avoids indexing data[0] on an empty batch.
  if not data:
    return {}
  # `for` was previously misspelled as `fort`, which is a SyntaxError.
  return {key: [d[key] for d in data] for key in data[0]}

# Assemble the PPO trainer from the config, the trainable model, the reference
# model, the tokenizer, the training split and the batch collator.
ppo_trainer = PPOTrainer(
    config=config,
    model=ppo_model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset["train"],
    data_collator=collator,
)

In [None]:
#