# Fine-Tune with Reinforcement Learning (PPO) and PEFT to Generate Less-Toxic Summaries


* We mark **TODO** in the notebook cells to indicate the places where you need to complete the missing code. You can refer to the exercises in the course repository for code examples.

In [None]:
# Install necessary packages
%pip install --upgrade transformers huggingface_hub peft accelerate bitsandbytes datasets trl==0.11.4 ipywidgets evaluate tqdm

In [None]:
%env HF_HOME=/opt/notebooks/.cache/huggingface

In [None]:
# Log in to the Hugging Face Hub: copy/paste your access token into the input box shown by this cell.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig, Trainer, TrainingArguments
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead, SFTTrainer, SFTConfig
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"using device: {device}")

## Load FLAN-T5 Model, Prepare Reward Model and Toxicity Evaluator

In [None]:
# Checkpoint name; passed to build_dataset and AutoTokenizer.from_pretrained in later cells.
model_name="google/flan-t5-base"
# Dataset identifier handed to load_dataset here and inside build_dataset.
huggingface_dataset_name = "knkarthick/dialogsum"

# No split is requested here; build_dataset below separately loads only the "train" split.
dataset_original = load_dataset(huggingface_dataset_name)

# Bare expression: shown as the cell output.
dataset_original

In [None]:
def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length, 
                  input_max_text_length):
    """Prepare tokenized summarization prompts and split them into train/test.

    Args:
        model_name: Checkpoint name passed to AutoTokenizer.from_pretrained.
        dataset_name: Dataset identifier passed to load_dataset.
        input_min_text_length: Exclusive lower bound on dialogue length, in characters.
        input_max_text_length: Inclusive upper bound on dialogue length, in characters.

    Returns:
        The result of train_test_split(test_size=0.2, shuffle=False, seed=42)
        applied to the filtered, tokenized "train" split. Every sample gains an
        "input_ids" entry and a "query" entry.
    """

    def has_acceptable_length(example):
        # Length is measured in characters of the raw dialogue, not in tokens.
        num_chars = len(example["dialogue"])
        return input_min_text_length < num_chars <= input_max_text_length

    # Only the "train" split is loaded; it is re-split at the end of this function.
    filtered = load_dataset(dataset_name, split="train").filter(has_acceptable_length, batched=False)

    # NOTE(review): device_map is normally a model-loading option; presumably it
    # has no effect on a tokenizer - confirm.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):
        # Surround the dialogue with the summarization instruction.
        prompt = (
            "\nSummarize the following conversation.\n\n"
            f"{sample['dialogue']}"
            "\n\nSummary:\n"
        )
        token_ids = tokenizer.encode(prompt)
        sample["input_ids"] = token_ids

        # The decoded prompt must be stored under the key "query", which is a
        # requirement of the PPO library.
        sample["query"] = tokenizer.decode(token_ids)
        return sample

    # Tokenize sample by sample, then expose the columns as torch tensors.
    tokenized = filtered.map(tokenize, batched=False)
    tokenized.set_format(type="torch")

    # Unshuffled 80/20 train/test split.
    return tokenized.train_test_split(test_size=0.2, shuffle=False, seed=42)

# Keep dialogues longer than 200 and at most 1000 characters, then build the
# tokenized train/test splits.
dataset = build_dataset(
    model_name=model_name,
    dataset_name=huggingface_dataset_name,
    input_min_text_length=200,
    input_max_text_length=1000,
)

print(dataset)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Despite its name, this function does not print anything; it returns the
    summary text so callers can embed it in their own output.

    Args:
        model: Object exposing named_parameters(), which yields (name, param)
            pairs where each param provides numel() and requires_grad.

    Returns:
        A multi-line string with the trainable parameter count, the total
        parameter count, and the trainable percentage (two decimals). A model
        without any parameters reports 0.00% instead of raising
        ZeroDivisionError.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # Guard the division so a parameter-less model does not crash the report.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return f"\ntrainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {trainable_percentage:.2f}%"

## Model Fine-Tuning

In [None]:
# LoRA adapter settings; passed to get_peft_model below.
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],  # names of the modules that receive adapters
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

#TODO: create tokenizer using AutoTokenizer class
#NOTE: you need to set device_map argument properly to choose device
# tokenizer = ...

#TODO: create model using AutoModelForSeq2SeqLM class
# model = ...

# create PEFT model for fine-tuning
# (`model` comes from the TODO above; this line raises NameError until that TODO is completed)
peft_model = get_peft_model(model, lora_config)

print(f'PEFT model parameters to be updated:\n{print_number_of_trainable_model_parameters(peft_model)}\n')

def process_dataset(batch):
    """Tokenize a batch of dialogues and reference summaries for fine-tuning.

    The model input is the instruction-wrapped dialogue only. The reference
    summary is tokenized separately as the labels, so the target text never
    appears in the input the model is asked to summarize.

    Relies on the module-level `tokenizer`.

    Args:
        batch: Mapping with "dialogue" and "summary" entries, each a list of
            strings (the batched form passed by `map(..., batched=True)`).

    Returns:
        The same batch with "input_ids" (tokenized prompts) and "labels"
        (tokenized summaries) added.
    """
    # Build one prompt per dialogue; the prompt ends right after "Summary:" so
    # that the summary is what the model has to produce.
    prompt = [f'Summarize the following conversation:\n{dialogue}\n\nSummary:\n' for dialogue in batch['dialogue']]
    batch['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    batch['labels'] = tokenizer(batch["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return batch

# Apply process_dataset in batches to the dataset loaded earlier.
processed_dataset = dataset_original.map(process_dataset, batched=True)

# Not referenced by any other visible line; presumably meant for the trainer
# configuration in the TODO below - confirm.
output_dir = "peft-dialogue-finetuned"

#TODO: create trainer using SFTTrainer class
# trainer = SFTTrainer(...)

# (`trainer` comes from the TODO above.)
trainer.train()

# The same path is read back in the next cell to build the PPO model.
peft_model_path="./peft-dialogue-summary-checkpoint"

# Save both the trained model and the tokenizer to that path.
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)


In [None]:
# Same path the previous cell saved to; repeated so this cell can run on its own
# once the checkpoint exists on disk.
peft_model_path="./peft-dialogue-summary-checkpoint"

# Load the saved checkpoint wrapped with a value head (exposed as `v_head`, printed below).
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model_path,                                                               
                                                               torch_dtype=torch.bfloat16,
                                                               device_map="auto",
                                                               is_trainable=True)

print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)

## Setup Reward Model

![](img/hf_facebook_hatespeec_reward_model.png)

In [None]:
# Checkpoint used as the reward model; also passed to the pipeline in a later cell.
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"

#TODO: create toxicity_tokenizer
#toxicity_tokenizer = ...

#TODO: create toxicity_model using AutoModelForSequenceClassification class
# toxicity_model = ...

# Show the mapping from class index to label name.
# (`toxicity_model` comes from the TODO above.)
print(toxicity_model.config.id2label)

![](img/rlhf_reward_model_binary_classifier.png)

In [None]:
# Sanity check of the reward model on a clearly non-toxic sentence.
# `logits`, `probabilities`, `not_hate_index` and `nothate_reward` must be
# defined by the TODOs below before the print statements can run.
non_toxic_text = "You are a great person and I like you"

# Token ids are moved to the device selected earlier in the notebook.
toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors="pt").input_ids.to(device)

#TODO: perform model inference on the input tokens
#TODO: and capture the logits (the outputs from the last level of the neural network)
#NOTE: please refer to the Toxicity_Detector_by_Meta.ipynb notebook (https://github.com/ACANETS/genai-labs/blob/main/Toxicity_Detector_by_Meta.ipynb)
# logits = ...
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

#TODO: Print the probabilities for [not hate, hate]
#TODO: please refer to the Toxicity_Detector_by_Meta.ipynb notebook (https://github.com/ACANETS/genai-labs/blob/main/Toxicity_Detector_by_Meta.ipynb)
# probabilities = ...
print(f'probabilities [not hate, hate]: {probabilities}')

# get the logits for "not hate" - this is the reward!
# TODO: please refer to the Toxicity_Detector_by_Meta.ipynb notebook (https://github.com/ACANETS/genai-labs/blob/main/Toxicity_Detector_by_Meta.ipynb)
# not_hate_index = ...
# nothate_reward = ...
print(f'reward (high): {nothate_reward}')

In [None]:
# Sanity check of the reward model on a clearly toxic sentence. This mirrors the
# non-toxic example; here the "not hate" logit, and therefore the reward, is
# expected to be low.
toxic_text = "You are disgusting and terrible and i damn hate you"

#TODO: tokenize the toxic text
# toxicity_input_ids = ...

#TODO: perform model inference on the input tokens
#TODO: and capture the logits (the outputs from the last level of the neural network)
#NOTE: please refer to the Toxicity_Detector_by_Meta.ipynb notebook (https://github.com/ACANETS/genai-labs/blob/main/Toxicity_Detector_by_Meta.ipynb)
# logits = ...
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

#TODO: Print the probabilities for [not hate, hate]
#TODO: please refer to the Toxicity_Detector_by_Meta.ipynb notebook (https://github.com/ACANETS/genai-labs/blob/main/Toxicity_Detector_by_Meta.ipynb)
# probabilities = ...
print(f'probabilities [not hate, hate]: {probabilities}')

# get the logits for "not hate" - this is the reward!
# TODO: please refer to the Toxicity_Detector_by_Meta.ipynb notebook (https://github.com/ACANETS/genai-labs/blob/main/Toxicity_Detector_by_Meta.ipynb)
# not_hate_index = ...
# nothate_reward = ...
# The label reads "low" because this is the reward for the toxic example.
print(f'reward (low): {nothate_reward}')

In [None]:
# Wrap the reward model in a pipeline so that text can be scored directly.
# The model is loaded by name here; the tokenizer created earlier is reused.
sentiment_pipe = pipeline("sentiment-analysis", 
                          model=toxicity_model_name,
                          tokenizer=toxicity_tokenizer,
                          max_length=512,
                          truncation=True,
                          device=device)
# Call-time options for retrieving raw logits.
# NOTE(review): with top_k=None the per-class results may be ordered by score
# rather than by class index - confirm before indexing the output positionally.
reward_logits_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # Set to "none" to retrieve raw logits.
    "batch_size": 16
}

# Call-time options for retrieving probabilities instead of logits.
reward_probabilities_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "softmax", # Set to "softmax" to apply softmax and retrieve probabilities.
    "batch_size": 16
}

# Score both example sentences, first as logits and then as probabilities.
print("Reward model output for non-toxic text:")
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))
print("\nReward model output for toxic text:")
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

In [None]:
# Repeats the non-toxic logits/probabilities calls made in the cell that creates sentiment_pipe.
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))

In [None]:
# Repeats the toxic logits/probabilities calls made in the cell that creates sentiment_pipe.
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

## Evaluate Toxicity

In [None]:
import evaluate

#TODO: create toxicity_evaluator using evaluate.load()
#NOTE: please refer to exercise Toxicity_Detector_by_Meta.ipynb
# toxicity_evaluator = ...

In [None]:
# Sanity-check the evaluator on the two example sentences defined earlier.
# Each call scores a single-element list of predictions.
toxicity_score = toxicity_evaluator.compute(predictions=[non_toxic_text])
print("Toxicity score for non-toxic text:")
print(toxicity_score["toxicity"])

toxicity_score = toxicity_evaluator.compute(predictions=[toxic_text])
print("\nToxicity score for toxic text:")
print(toxicity_score["toxicity"])

In [None]:
def evaluate_toxicity(model, 
                      toxicity_evaluator, 
                      tokenizer, 
                      dataset, 
                      num_samples):
    """Estimate the toxicity of summaries sampled from `model`.

    For each of the first `num_samples` dataset samples, one summary is sampled
    for sample["query"], and the prompt followed by the generated text is
    scored with `toxicity_evaluator`.

    Args:
        model: Model whose generate() is called with input_ids and a generation config.
        toxicity_evaluator: Object whose compute(predictions=[...]) returns a
            mapping with a "toxicity" list.
        tokenizer: Tokenizer used to encode the prompt and decode the response.
        dataset: Iterable of samples, each providing a "query" string.
        num_samples: Number of samples to evaluate.

    Returns:
        The tuple (mean, std) of the collected toxicity scores. Both values
        must be computed in the TODO at the end of this function.
    """

    max_new_tokens=100

    # The sampling settings do not depend on the sample, so the config is built once.
    # top_k=0.0 / top_p=1.0 are the same values the PPO generation_kwargs use;
    # presumably they disable top-k and nucleus filtering - confirm.
    generation_config = GenerationConfig(max_new_tokens=max_new_tokens,
                                         top_k=0.0,
                                         top_p=1.0,
                                         do_sample=True)

    toxicities = []
    for i, sample in tqdm(enumerate(dataset)):
        # Stop once exactly num_samples samples have been evaluated.
        if i >= num_samples:
            break

        input_text = sample["query"]

        input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids.to(device)

        response_token_ids = model.generate(input_ids=input_ids,
                                            generation_config=generation_config)
        
        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)
        
        # Score the prompt together with the generated text.
        toxicity_score = toxicity_evaluator.compute(predictions=[(input_text + " " + generated_text)])

        toxicities.extend(toxicity_score["toxicity"])

    # TODO: Compute mean & std using numpy functions.
    # mean = ...
    # std = ...
        
    return mean, std

In [None]:
# Tokenizer for model_name, created with the same call that build_dataset uses.
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")


# Baseline: toxicity of ppo_model on the test split, measured before the PPO training loop runs.
mean_before_detoxification, std_before_detoxification = evaluate_toxicity(model=ppo_model, 
                                                                          toxicity_evaluator=toxicity_evaluator, 
                                                                          tokenizer=tokenizer, 
                                                                          dataset=dataset["test"], 
                                                                          num_samples=10)

print(f'toxicity [mean, std] before detox: [{mean_before_detoxification}, {std_before_detoxification}]')

## Perform Fine-Tuning to Detoxify the Summaries
Optimize a RL policy against the reward model using Proximal Policy Optimization (PPO).

In [None]:
#TODO: create a reference model to be used as a frozen model
# ref_model = ...

# (`ref_model` comes from the TODO above.)
print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

![](img/rlhf_kl_divergence.png)

In [None]:
# PPOConfig and PPOTrainer are already imported in the notebook's import cell; this repeats that import.
from trl import PPOConfig, PPOTrainer

# PPO hyperparameters, collected here so they can be tuned in one place.
learning_rate=1.41e-5
max_ppo_epochs=1
mini_batch_size=4
batch_size=32

# Configuration object for the PPO trainer that the TODO later in this cell creates.
config = PPOConfig(
    model_name=model_name,    
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

def collator(data):
    """Turn a list of sample dicts into one dict of lists.

    The keys of the first sample decide which fields are collected; every
    sample is expected to provide each of those keys.
    """
    first_sample = data[0]
    return {key: [sample[key] for sample in data] for key in first_sample}

#TODO: create ppo_trainer using PPOTrainer class
# ppo_trainer = ...

### Fine-Tune the Model

In [None]:
# Bounds handed to LengthSampler; one response length is drawn per prompt below.
output_min_length = 100
output_max_length = 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Sampling settings for generation; "max_new_tokens" is added per prompt inside the loop.
generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

reward_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # You want the raw logits without softmax.
    "batch_size": 32
}

max_ppo_steps = 10

# Label name of the "not hate" class. The reward is looked up by label rather
# than by list position: with top_k=None the pipeline may return the per-class
# results ordered by score, in which case position 0 is simply the
# highest-scoring class and not necessarily "not hate".
not_hate_label = sentiment_pipe.model.config.id2label[not_hate_index]

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break   

    prompt_tensors = batch["input_ids"]

    # Get response from FLAN-T5/PEFT LLM.
    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        # Draw a fresh response length for every prompt.
        max_new_tokens = output_length_sampler()        
            
        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)
        
        # Keep at most the last max_new_tokens generated tokens.
        summary_tensors.append(summary.squeeze()[-max_new_tokens:])
        
    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Compute reward outputs.
    query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]    
    rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)

    # The reward is the raw logit of the "not hate" class, selected by its label.
    reward_tensors = [
        torch.tensor(next(item["score"] for item in reward if item["label"] == not_hate_label))
        for reward in rewards
    ]

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)
    
    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    # Separator line of 99 dashes between steps.
    print('-' * 99)

## Evaluate the Model Quantitatively

In [None]:
# Same evaluation as the baseline (same model object, test split and num_samples),
# run again after the PPO training loop.
mean_after_detoxification, std_after_detoxification = evaluate_toxicity(model=ppo_model, 
                                                                        toxicity_evaluator=toxicity_evaluator, 
                                                                        tokenizer=tokenizer, 
                                                                        dataset=dataset["test"], 
                                                                        num_samples=10)
print(f'toxicity [mean, std] after detox: [{mean_after_detoxification}, {std_after_detoxification}]')

In [None]:
# Relative change of each statistic with respect to its value before detoxification.
mean_improvement = (
    (mean_before_detoxification - mean_after_detoxification) / mean_before_detoxification
)
std_improvement = (
    (std_before_detoxification - std_after_detoxification) / std_before_detoxification
)

# Report both changes as percentages with two decimals.
print('Percentage improvement of toxicity score after detoxification:')
print(f'mean: {mean_improvement*100:.2f}%')
print(f'std: {std_improvement*100:.2f}%')

## Evaluate the Model Qualitatively

In [None]:
# Choose a few samples in the dataset as prompts to the reference model and the ppo model.
# Check their completions and compare the reward values given by the toxicity evaluator.
# NOTE: This section is not graded.