In [None]:
!pip install --disable-pip-version-check torch==1.13.1 torchdata==0.5.1 --quiet
!pip install transformers datasets evaluate rouge_score peft --quiet
!pip install git+https://github.com/lvwerra/trl.git@25fa1bd

In [None]:
import evaluate
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType
from tqdm import tqdm
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    GenerationConfig
)
from trl import (
    PPOTrainer,
    PPOConfig,
    AutoModelForSeq2SeqLMWithValueHead,
    create_reference_model
)
from trl.core import LengthSampler
tqdm.pandas()

# 1) Data Loading

In [None]:
def build_dataset(
    model_name,
    dataset_name,
    input_min_text_length,
    input_max_text_length):
    """
    Load the "train" split of a dialogue dataset, wrap every dialogue in a
    summarization prompt, tokenize it and split the result into train/test.

    Parameters:
    - model_name (str): Checkpoint name handed to `AutoTokenizer.from_pretrained`.
    - dataset_name (str): Dataset name handed to `load_dataset`.
    - input_min_text_length (int): A dialogue is kept only if it has strictly
      more characters than this.
    - input_max_text_length (int): A dialogue is kept only if it has at most
      this many characters.

    Returns:
    - The output of `train_test_split` (20% test, no shuffling) on the
      filtered dataset, which carries the added "input_ids" and "query" columns.
    """
    def has_allowed_length(sample):
        # Length is measured on the raw dialogue string, not on tokens.
        return input_min_text_length < len(sample["dialogue"]) <= input_max_text_length

    def tokenize(sample):
        prompt = f"""Summarize the following conversation.\
        {sample["dialogue"]}\
        Summary:
        """
        token_ids = tokenizer.encode(prompt)
        sample["input_ids"] = token_ids

        # The PPO library looks for the decoded prompt under the "query" key.
        sample["query"] = tokenizer.decode(token_ids)
        return sample

    raw_train = load_dataset(dataset_name, split="train")
    in_range = raw_train.filter(has_allowed_length, batched=False)
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    # Prompt + tokenize each remaining dialogue, then expose torch tensors.
    tokenized = in_range.map(tokenize, batched=False)
    tokenized.set_format(type="torch")
    return tokenized.train_test_split(test_size=0.2, shuffle=False, seed=42)

### build dataset
# Keep dialogues with more than 200 and at most 1000 characters.
huggingface_dataset_name = "knkarthick/dialogsum"
model_name = "google/flan-t5-small"
dataset = build_dataset(model_name, huggingface_dataset_name, 200, 1000)
print(dataset)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/10022 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 8017
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 2005
    })
})


# 2) Model Loading

During PPO, only a few parameters will be updated. Specifically, the parameters of the ValueHead.

The number of trainable parameters of the ValueHead can be computed as `(n + 1) * m`
where `n` is the number of input units (here `n`=512, matching the `in_features=512` printed below) and `m` is the number of output units (you have `m`=1), i.e. 513 parameters.

## 2.1 PPO Model

In [None]:
# Base checkpoint for the policy (same value as the `model_name` used to build the dataset).
model_name = "google/flan-t5-small"

# LoRA settings: rank-32 adapters on the modules named "q" and "v".
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Plain seq2seq base model, loaded in bfloat16.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16
)

# Attach the fine-tuned adapter checkpoint on top of the base model.
# NOTE(review): `lora_config` is forwarded as an extra keyword here; confirm that
# `PeftModel.from_pretrained` actually consumes it rather than reading the
# adapter configuration from the checkpoint.
peft_checkpoint = 'Yasbok/Flan-t5-fine-tune-PEFT-Lora'
peft_model = PeftModel.from_pretrained(
    model,
    peft_checkpoint,
    lora_config=lora_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    is_trainable=True
)

# PPO model: the PEFT policy wrapped with a value head. The head printed at the
# end of this cell is Linear(512 -> 1) with bias, i.e. 513 value-head parameters.
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
    peft_model,
    torch_dtype=torch.bfloat16,
    is_trainable=True
)
# Copy of the PPO model which will not be fine-tuned - the reference model.
ref_model = create_reference_model(ppo_model)

print(ppo_model.v_head)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=512, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


## 2.2 Reward Model

Reinforcement Learning (RL) is one type of machine learning where agents take actions in an environment aimed at maximizing their cumulative rewards. The agent's behavior is defined by the policy. And the goal of reinforcement learning is for the agent to learn an optimal, or nearly-optimal, policy that maximizes the reward function.

The original policy is based on the instruct PEFT model - this is the LLM before detoxification. Then you could ask human labelers to give feedback on the outputs' toxicity. However, it can be expensive to use them for the entire fine-tuning process. A practical way to avoid that is to use a reward model encouraging the agent to detoxify the dialogue summaries. The intuitive approach would be to do some form of sentiment analysis across two classes (nothate and hate) and give a higher reward if there is a higher chance of getting class nothate as an output.

In [None]:
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = AutoTokenizer.from_pretrained(
    toxicity_model_name, device_map="auto"
)
toxicity_model = AutoModelForSequenceClassification.from_pretrained(
    toxicity_model_name, device_map="auto"
)
print(toxicity_model.config.id2label)

# Sanity check: score a single sentence with the reward model.
toxic_text = "#Person 1# tells Tommy that the movie was terrible, dumb and stupid."
toxicity_input_ids = toxicity_tokenizer(toxic_text, return_tensors="pt").input_ids
logits = toxicity_model(toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits[0].tolist()}')

# Softmax over the class dimension turns the logits into probabilities.
probabilities = torch.softmax(logits, dim=-1)[0].tolist()
print(f'probabilities [not hate, hate]: {probabilities}')

# The reward is the raw logit of the first class (printed as "not hate" above).
not_hate_index = 0
nothate_reward = logits[:, not_hate_index].tolist()
print(f'reward (low): {nothate_reward}')

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

{0: 'nothate', 1: 'hate'}
logits [not hate, hate]: [-0.6921197175979614, 0.3722734749317169]
probabilities [not hate, hate]: [0.25647082924842834, 0.743529200553894]
reward (low): [-0.6921197175979614]


In [None]:
### or we can use huggingface pipeline
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not fail on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=toxicity_model_name,
    device=device
)

reward_logits_kwargs = {
    # Return the scores of all classes.
    "top_k": None,
    # Set to "none" to retrieve raw logits.
    "function_to_apply": "none",
    "batch_size": 16
}

reward_probabilities_kwargs = {
    # Return the scores of all classes.
    "top_k": None,
    # Set to "softmax" to apply softmax and retrieve probabilities.
    "function_to_apply": "softmax",
    "batch_size": 16
}

# The sentence scored here is `toxic_text`, so label the output accordingly.
# Each result is a list of {"label", "score"} dicts; read a class score by its
# "label" rather than by its position in the list.
print("Reward model output:")
print("For toxic text")
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

Reward model output:
For non-toxic text
[{'label': 'hate', 'score': 0.3722734749317169}, {'label': 'nothate', 'score': -0.6921197175979614}]
[{'label': 'hate', 'score': 0.743529200553894}, {'label': 'nothate', 'score': 0.25647082924842834}]


[Hugging Face Evaluate Library](https://huggingface.co/spaces/evaluate-measurement/toxicity)

In [None]:
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"

# Load the "toxicity" measurement on top of the same classifier; the "hate"
# label is the one reported as the toxicity score.
toxicity_evaluator = evaluate.load(
    "toxicity", toxicity_model_name, module_type="measurement", toxic_label="hate"
)

# Score the example sentence; the result holds one value per prediction.
toxicity_score = toxicity_evaluator.compute(predictions=[toxic_text])
print("\nToxicity score for toxic text:")
print(toxicity_score["toxicity"])

Downloading builder script:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.



Toxicity score for toxic text:
[0.743529200553894]


In [None]:
def evaluate_toxicity(
    model,
    toxicity_evaluator,
    tokenizer,
    dataset,
    num_samples):

    """
    Sample a completion for the first `num_samples` queries of `dataset` and
    summarize the toxicity of the resulting "query + completion" texts.

    Parameters:
    - model (trl model): Model to be evaluated.
    - toxicity_evaluator (evaluate_modules toxicity metrics): Toxicity evaluator.
    - tokenizer (transformers tokenizer): Tokenizer to be used.
    - dataset (dataset): Input dataset for the evaluation; each sample must
      provide a "query" string.
    - num_samples (int): Maximum number of samples for the evaluation.

    Returns:
    - (mean, std): Mean and standard deviation of the toxicity scores, or
      (nan, nan) when no sample was evaluated.
    """

    max_new_tokens = 100
    # Built once because the sampling settings are the same for every sample.
    # `top_k=0` disables top-k filtering (the original `tok_k` keyword was a
    # typo and therefore never reached the sampler).
    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        top_k=0,
        top_p=1.0,
        do_sample=True)

    toxicities = []
    scored_text = None
    for i, sample in tqdm(enumerate(dataset)):
        # Stop before generating, so that exactly `num_samples` samples are scored.
        if i >= num_samples:
            break

        input_text = sample["query"]
        input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids
        response_token_ids = model.generate(
            input_ids=input_ids, generation_config=generation_config)
        generated_text = tokenizer.decode(
            response_token_ids[0], skip_special_tokens=True)

        # Toxicity is measured on the prompt and the completion together.
        scored_text = input_text + " " + generated_text
        toxicity_score = toxicity_evaluator.compute(predictions=[scored_text])
        toxicities.extend(toxicity_score["toxicity"])

    if not toxicities:
        return float("nan"), float("nan")

    # Show the last evaluated text as a quick visual check.
    print(scored_text)

    # Compute mean & std using np.
    mean = np.mean(toxicities)
    std = np.std(toxicities)
    return mean, std

### mean before RL
# Baseline: toxicity of the frozen reference model on 10 test queries.
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")
mean_before, std_before = evaluate_toxicity(
    ref_model, toxicity_evaluator, tokenizer, dataset["test"], 10
)
print(mean_before, std_before)

10it [02:21, 14.18s/it]

Summarize the following conversation. #Person1#: Excuse me, could you tell me how to get to the Cross Bakery building? #Person2#: The Cross Bakery building? Oh sure. You're actually walking in the opposite direction. #Person1#: Oh, you're kidding! I thought I was heading east. #Person2#: No, east is the other direction. To get to the Bakery, you need to turn around and go three blocks to Broadway. When you get to the intersection of Broadway and Elm, you hang a left. Go straight down that street for half a block and then you'll see the building on your left. #Person1#: Okay, let me see if I've got that. I need to go down Elm until I hit Broadway, then I make a left and the building is on my left hand side. Is that right? #Person2#: Yeah, you've got it. Do you want me to show you the way? #Person1#: Thanks for the offer, but I think I've got it. Hopefully, I won't get lost again on my way there! Summary: </s> positive
0.011468970407308503 0.013527138737984034





# 3) Fine Tuning

The fine-tuning loop consists of the following main steps:

1. Get the query responses from the policy LLM (PEFT model).
2. Get sentiments for query/responses from hate speech RoBERTa model.
3. Optimize policy with PPO using the (query, response, reward) triplet.

Metrics:

1. objective/kl: minimize kl divergence,
2. ppo/returns/mean: maximize mean returns,
3. ppo/policy/advantages_mean: maximize advantages.

In [None]:
def collator(data):
    """Regroup a list of per-sample dicts into one dict of per-key lists.

    The keys (and their order) are taken from the first sample; every sample
    must provide all of them.
    """
    first_sample = data[0]
    return {key: [sample[key] for sample in data] for key in first_sample}

# PPO hyperparameters.
learning_rate = 1.4e-5
max_ppo_epochs = 1
mini_batch_size = 4
batch_size = 16

config = PPOConfig(
    model_name=model_name,
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size,
)

# The trainer updates `ppo_model` and compares it against the frozen `ref_model`.
ppo_trainer = PPOTrainer(
    config=config,
    model=ppo_model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset["train"],
    data_collator=collator,
)

In [None]:
output_min_length = 100
output_max_length = 400
### random sampling on the model generation length
output_length_sampler = LengthSampler(
    output_min_length, output_max_length
)

generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

reward_kwargs = {
    # Return all scores.
    "top_k": None,
    # raw logits without softmax.
    "function_to_apply": "none",
    "batch_size": 16
}

# The pipeline returns one {"label", "score"} dict per class. In the recorded
# output of the reward-model cell "hate" is listed before "nothate", so the
# reward has to be looked up by label; position 0 is not reliably "nothate".
not_hate_label = "nothate"

max_ppo_steps = 10
for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break

    prompt_tensors = batch["input_ids"]
    summary_tensors = []
    for prompt_tensor in prompt_tensors:
        # Each prompt gets its own randomly sampled generation length.
        max_new_tokens = output_length_sampler()
        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)
        summary_tensors.append(summary.squeeze()[-max_new_tokens:])

    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Compute reward outputs: the raw "nothate" logit of each query/response pair.
    query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]
    rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)
    reward_tensors = [
        torch.tensor(
            next(item["score"] for item in reward if item["label"] == not_hate_label)
        )
        for reward in rewards
    ]

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)

    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    # Separator line between steps (99 dashes, as before).
    print('-' * 99)


# 4) Evaluation

## 4.1 Quantitative

In [None]:
# Toxicity of the PPO-tuned model on the same 10 test queries as the baseline.
mean_after, std_after = evaluate_toxicity(
    ppo_model, toxicity_evaluator, tokenizer, dataset["test"], 10
)
print(mean_after, std_after)

# Relative reduction with respect to the statistics measured before PPO.
mean_improvement = (mean_before - mean_after) / mean_before
std_improvement = (std_before - std_after) / std_before

print('Percentage improvement of toxicity score after detoxification:')
print(f'mean: {mean_improvement * 100:.2f}%')
print(f'std: {std_improvement * 100:.2f}%')

## 4.2 Qualitative

Compare DataFrame

In [None]:
batch_size = 20
compare_results = {}

df_batch = dataset["test"][0:batch_size]
compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]
summary_tensors_ref = []
summary_tensors = []


def _nothate_score(label_scores):
    """Return the score of the "nothate" class from one pipeline result.

    The result is a list of {"label", "score"} dicts; the class is selected by
    its label because position 0 is not reliably "nothate" (the recorded
    output of the reward-model cell lists "hate" first).
    """
    return next(item["score"] for item in label_scores if item["label"] == "nothate")


# Get response from ppo and base model. Iterate over the rows actually
# returned, which may be fewer than `batch_size` on a small test split.
num_prompts = len(compare_results["query"])
for i in tqdm(range(num_prompts)):
    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len
    # Both models receive the same prompt and the same generation length.
    prompt_ids = torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device)

    summary = ref_model.generate(input_ids=prompt_ids, **generation_kwargs).squeeze()[-gen_len:]
    summary_tensors_ref.append(summary)

    summary = ppo_model.generate(input_ids=prompt_ids, **generation_kwargs).squeeze()[-gen_len:]
    summary_tensors.append(summary)

# Decode responses.
compare_results["response_before"] = [tokenizer.decode(s) for s in summary_tensors_ref]
compare_results["response_after"] = [tokenizer.decode(s) for s in summary_tensors]

# Sentiment analysis of query/response pairs before/after.
texts_before = [d + s for d, s in zip(compare_results["query"], compare_results["response_before"])]
rewards_before = sentiment_pipe(texts_before, **reward_kwargs)
compare_results["reward_before"] = [_nothate_score(reward) for reward in rewards_before]

texts_after = [d + s for d, s in zip(compare_results["query"], compare_results["response_after"])]
rewards_after = sentiment_pipe(texts_after, **reward_kwargs)
compare_results["reward_after"] = [_nothate_score(reward) for reward in rewards_after]

In [None]:
pd.set_option('display.max_colwidth', 500)

# One row per query; `reward_diff` > 0 means the reward went up after PPO.
df_compare_results = pd.DataFrame(compare_results).assign(
    reward_diff=lambda frame: frame['reward_after'] - frame['reward_before']
)

# Largest reward gains first.
df_compare_results_sorted = (
    df_compare_results
    .sort_values(by=['reward_diff'], ascending=False)
    .reset_index(drop=True)
)
print(df_compare_results_sorted)

# 5) PPO from Scratch

In [None]:
# Load a pretrained BERT encoder through the `transformers` module namespace.
# NOTE(review): `bert_model` is not referenced by any of the visible cells - confirm it is still needed.
bert_model = transformers.BertModel.from_pretrained('bert-base-uncased')

class PolicyNetwork(nn.Module):
    """Two-layer MLP that maps a state vector to action probabilities."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        x = torch.relu(self.fc1(x))
        return torch.softmax(self.fc2(x), dim=-1)


# Define the PPO agent
class PPOAgent:
    """Minimal PPO agent using the clipped surrogate objective.

    Parameters:
    - input_size (int): Size of a state vector.
    - output_size (int): Number of discrete actions.
    - hidden_size (int): Width of the policy network's hidden layer.
    - lr (float): Learning rate of the Adam optimizer.
    """

    def __init__(self, input_size, output_size, hidden_size=64, lr=0.001):
        self.policy_network = PolicyNetwork(input_size, hidden_size, output_size)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=lr)
        self.gamma = 0.99
        self.epsilon = 0.2

    def select_action(self, state):
        """Sample an action for a single state.

        `state` may be a flat vector (the usual shape of one environment
        observation) or a batch holding exactly one row.

        Returns:
        - (action, prob): the sampled action index as an int and its
          probability as a detached 0-dim tensor.
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        if state.dim() == 1:
            # Add the batch dimension so the probabilities are always (1, n_actions).
            state = state.unsqueeze(0)
        # The stored probability is only used as the fixed "old" policy value,
        # so no gradient is needed here.
        with torch.no_grad():
            action_prob = self.policy_network(state)
        action = torch.multinomial(action_prob, 1).item()
        return action, action_prob[0, action]

    def update(self, states, actions, old_probs, advantages):
        """Run one optimizer step on the clipped PPO objective.

        Parameters:
        - states: sequence of N state vectors.
        - actions: sequence of N action indices.
        - old_probs: sequence of N probabilities of those actions under the
          policy that collected them (floats or single-element tensors).
        - advantages: sequence of N advantage estimates.

        Returns:
        - float: the loss value that was minimized.
        """
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        old_probs = torch.tensor([float(p) for p in old_probs], dtype=torch.float32)
        advantages = torch.as_tensor(advantages, dtype=torch.float32)

        # Squeeze to shape (N,): dividing an (N, 1) tensor by an (N,) tensor
        # would broadcast to an (N, N) matrix that mixes unrelated samples.
        new_probs = self.policy_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        ratio = new_probs / old_probs
        clipped_ratio = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
        clip_loss = torch.min(ratio * advantages, clipped_ratio * advantages)
        loss = -torch.mean(clip_loss)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()


In [None]:
# NOTE(review): `gym` is not imported in any visible cell; add `import gym`
# (and install the package) in the imports cell before running this section.


def calculate_advantages(rewards, gamma=0.99):
    """Turn one episode's rewards into normalized discounted returns.

    No value baseline is available in this section, so the discounted
    reward-to-go of every step, normalized to zero mean and unit variance
    over the episode, is used as the advantage estimate.

    Parameters:
    - rewards: sequence of per-step rewards of one episode.
    - gamma (float): discount factor.

    Returns:
    - list of floats, one advantage per step.
    """
    returns = []
    running_return = 0.0
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        returns.append(running_return)
    returns.reverse()

    returns = torch.tensor(returns, dtype=torch.float32)
    if returns.numel() > 1:
        # Normalization needs at least two steps to have a defined spread.
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns.tolist()


# Environment setup (replace 'CartPole-v1' with your custom environment)
env = gym.make('CartPole-v1')
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# The agent is sized from the environment, so it is created after the env.
agent = PPOAgent(input_size, output_size)

# Training loop
num_episodes = 1000
for episode in range(num_episodes):
    # `reset` returns either the observation or an (observation, info) tuple,
    # depending on the gym version.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    total_reward = 0
    states, actions, rewards, old_probs = [], [], [], []

    while not done:
        action, action_prob = agent.select_action(state)

        # `step` returns 4 values in the older API and 5 (terminated/truncated
        # instead of done) in the newer one.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        old_probs.append(action_prob)

        state = next_state
        total_reward += reward

    # Calculate advantages
    advantages = calculate_advantages(rewards, gamma=agent.gamma)

    # Update the policy network using PPO
    agent.update(states, actions, old_probs, advantages)

    # Print episode information
    print(f"Episode {episode + 1}: Total Reward: {total_reward}")