In [None]:
import transformers
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, EarlyStoppingCallback
import nltk
import torch
torch.cuda.empty_cache()
import numpy as np
import matplotlib.pyplot as plt

In [None]:
!pip install sentencepiece

## REWARD MODEL TRAINING - MBERT

In [None]:
!pip install tensorflow==2.12.0
!pip install transformers==4.28.1

In [None]:
# Matplotlib Inline
%matplotlib inline

# Import Modules
import gc
import tensorflow as tf
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
import tensorflow as tf
from typing import Tuple
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (TFGPT2Model,
                          TFMBartModel,
                          TFBertForSequenceClassification,
                          TFDistilBertForSequenceClassification,
                          TFXLMRobertaForSequenceClassification,
                          TFMT5ForConditionalGeneration,
                          TFT5ForConditionalGeneration,
                          T5Tokenizer,
                          AutoTokenizer,
                          AutoConfig,
                         TFBertModel)

In [None]:
# Configure Strategy. Assume TPU...if not set default for GPU/CPU
tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy()

# Seeds
def set_seeds(seed: int)->None:
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

# Generic Constants
MAX_LEN = 512
TEST_SIZE = 0.2
LR = 0.00002
VERBOSE = 1
SEED = 1000
set_seeds(SEED)

# Set Autotune
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Set Batch Size
BASE_BATCH_SIZE = 8         # Modify to match your GPU card.
if tpu is not None:
    BASE_BATCH_SIZE = 32     # TPU v2 or up...
BATCH_SIZE = BASE_BATCH_SIZE * strategy.num_replicas_in_sync

In [None]:
# Summary
print(f'Seed: {SEED}')
print(f'Replica Count: {strategy.num_replicas_in_sync}')
print(f'Batch Size: {BATCH_SIZE}')
print(f'Learning Rate: {LR}')

Seed: 1000
Replica Count: 1
Batch Size: 8
Learning Rate: 2e-05


In [None]:
import pandas as pd

# Load 'bias.csv' dataset
train_df = pd.read_csv('/content/reward_data.csv')
# test_df = pd.read_csv('/kaggle/input/test-1/bias_test.csv')

In [None]:
reward_data = train_df

In [None]:
reward_data

In [None]:
reward_data['reward'] = reward_data['reward'].map({1: 'high', 0: 'low'})
"""
We are training reward model using the dataset formatted as {Debiased Sentence, Reward}.
Given a Debiased Sentence, if its reward is 1, then the debiased sentence is correctly debiased, else the debiased sentence is not fully debiased.


"""

reward_data

In [None]:
train_reward_df = reward_data

train_reward_df

In [None]:
test_reward_df=train_reward_df[50:60]

test_reward_df

In [None]:
def create_dataset(df, max_len, tokenizer, batch_size, shuffle=False):
    total_samples = df.shape[0]

    # Placeholders input
    input_ids, input_masks = [], []

    # Placeholder output
    labels = []

    # Tokenize
    for index, row in tqdm(zip(range(0, total_samples), df.iterrows()), total=total_samples):

        # Get title and description as strings
        text = row[1]['Debiased_Sentence']
        partisan = row[1]['reward']

        # Encode
        input_encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length'
        )
        input_ids.append(input_encoded['input_ids'])
        input_masks.append(input_encoded['attention_mask'])
        labels.append(
            0 if partisan == 'low' else
            1 if partisan == 'high' else None)

    # Prepare and Create TF Dataset.
    all_input_ids = tf.Variable(input_ids)
    all_input_masks = tf.Variable(input_masks)
    all_labels = tf.Variable(labels)

    dataset = tf.data.Dataset.from_tensor_slices(
        (
            {
                'input_ids': all_input_ids,
                'attention_mask': all_input_masks
            },
            all_labels
        )
    )

    if shuffle:
        dataset = dataset.shuffle(64, reshuffle_each_iteration=True)

    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

In [None]:
def ModelCheckpoint(model_name):
    return tf.keras.callbacks.ModelCheckpoint(model_name,
                                              monitor = 'val_accuracy',
                                              verbose = 1,
                                              save_best_only = True,
                                              save_weights_only = True,
                                              mode = 'max',
                                              period = 1)

def create_mbert_model(model_type, strategy, config, lr):
    # Create 'Standard' Classification Model
    with strategy.scope():
        model = TFBertForSequenceClassification.from_pretrained(model_type, config = config)

        optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

        model.compile(optimizer = optimizer, loss = loss, metrics = [metric])

        return model

In [None]:
# Multi-Lingual BERT Constants
EPOCHS = 10
model_type = 'bert-base-multilingual-cased'

# Set Config
config = AutoConfig.from_pretrained(model_type, num_labels = 2)
# Set Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_type, add_prefix_space = False, do_lower_case = False)

# Cleanup
tf.keras.backend.clear_session()
if tpu is not None:
    tf.tpu.experimental.initialize_tpu_system(tpu)
gc.collect()

# Create Train and Validation Datasets
train_dataset = create_dataset(train_reward_df, MAX_LEN, tokenizer, BATCH_SIZE, shuffle = True)
validation_dataset = create_dataset(test_reward_df, MAX_LEN, tokenizer, BATCH_SIZE, shuffle = False)

# Steps
train_steps = train_reward_df.shape[0] // BATCH_SIZE
val_steps = test_reward_df.shape[0] // BATCH_SIZE
print(f'Train Steps: {train_steps}')
print(f'Val Steps: {val_steps}')

# Create Model
model_BERT = create_mbert_model(model_type, strategy, config, LR)

# Model Summary
print(model_BERT.summary())

# Fit Model
history = model_BERT.fit(train_dataset,
                    steps_per_epoch = train_steps,
                    validation_data = validation_dataset,
                    validation_steps = val_steps,
                    epochs = EPOCHS,
                    verbose = VERBOSE)

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

100%|██████████| 200/200 [00:00<00:00, 3358.16it/s]
100%|██████████| 10/10 [00:00<00:00, 2367.39it/s]


Train Steps: 25
Val Steps: 1


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  177853440 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 177,854,978
Trainable params: 177,854,978
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
 2/25 [=>............................] - ETA: 4:32 - loss: 0.7389 - accuracy: 0.5625 

KeyboardInterrupt: ignored

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loading Reward Model

In [None]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    peft==0.3.0 --quiet

# Installing the Reinforcement Learning library directly from github.
%pip install git+https://github.com/lvwerra/trl.git@25fa1bd

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.1.0+cu121 requires torch==2.1.0, but you have torch 1.13.1 which is inco

In [None]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.36.1-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.36.1-py3-none-any.whl (8.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-

In [None]:
!pip install sentencepiece


In [None]:
# Matplotlib Inline
%matplotlib inline

# Import Modules
import gc
import tensorflow as tf
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
import tensorflow as tf
from typing import Tuple
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (TFGPT2Model,
                          TFMBartModel,
                          TFBertForSequenceClassification,
                          TFDistilBertForSequenceClassification,
                          TFXLMRobertaForSequenceClassification,
                          TFMT5ForConditionalGeneration,
                          TFT5ForConditionalGeneration,
                          T5Tokenizer,
                          AutoTokenizer,
                          AutoConfig,
                         TFBertModel)

In [None]:
model_type = 'bert-base-multilingual-cased'
reward_tokenizer = AutoTokenizer.from_pretrained(model_type, add_prefix_space = False, do_lower_case = False)

reward_model = TFBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Amartya/RLHF/Reward_Model')

def classify_statement(statement):

    inputs = reward_tokenizer(statement, return_tensors="tf", max_length=512, truncation=True)


    outputs = reward_model(inputs)
    predictions = tf.nn.softmax(outputs.logits, axis=-1)

    predictions = predictions.numpy()
    class_id = np.argmax(predictions, axis=1)

    return class_id[0], predictions[0]


statement = "women are not good at multitasking"
class_id, scores = classify_statement(statement)

print(f"Class ID: {class_id}, Scores: {scores}")

"""
1 : If the sentence contains low bias terms,
0 : If the sentence contains high bias terms
"""


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/Amartya/RLHF/Reward_Model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Class ID: 0, Scores: [0.9973545  0.00264553]


'\n1 : If the sentence contains low bias terms, \n0 : If the sentence contains high bias terms\n'

In [None]:
model_BERT.save('/kaggle/working/mbert_model')
model_BERT.save_weights('/kaggle/working/mbert_weight')

In [None]:
# Validation Performance
print(f'\n===== MBart Classification Accuracy: {np.max(history.history["val_accuracy"])*100:.3f}%')


===== MBart Classification Accuracy: 100.000%


## FLAN T5

In [None]:
import transformers
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, EarlyStoppingCallback
import nltk
import torch
torch.cuda.empty_cache()
import numpy as np
import matplotlib.pyplot as plt

In [None]:
data = pd.read_csv("/kaggle/input/final-dataset/train_df_trained_llm.csv")
test_data = pd.read_csv("/kaggle/input/final-dataset/train_df_trained_llm.csv")

In [None]:
data


In [None]:
test_data


In [None]:
import sentencepiece
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

In [None]:
train_data = data["Biased_Sentence"].to_list()
train_data_label = data["Debiased_Sentence"].to_list()

In [None]:
input_encoding = tokenizer(train_data,max_length=9000,padding=True,truncation=True,return_tensors ="pt")
label_encoding = tokenizer(train_data_label,max_length=1024,padding=True,truncation=True,return_tensors ="pt")

In [None]:
from transformers import AutoModelForSeq2SeqLM,DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
train_dataset = [{"input_ids": input_encoding["input_ids"][i], "attention_mask": input_encoding["attention_mask"][i],
                  "labels": label_encoding["input_ids"][i]} for i in range(len(data))]

In [None]:
test_data_ = test_data["Biased_Sentence"].to_list()
test_data_label = test_data["Debiased_Sentence"].to_list()

In [None]:
test_input_encoding = tokenizer(test_data_,padding=True,truncation=True,return_tensors ="pt")
test_label_encoding = tokenizer(test_data_label,padding=True,truncation=True,return_tensors ="pt")

In [None]:
eval_dataset = [{"input_ids": test_input_encoding["input_ids"][i], "attention_mask": test_input_encoding["attention_mask"][i],
                  "labels": test_label_encoding["input_ids"][i]} for i in range(len(test_data))]

In [None]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=tokenizer.pad_token_id)

In [None]:
from transformers import Seq2SeqTrainingArguments, TrainerCallback
from transformers.integrations import TensorBoardCallback
from transformers import EarlyStoppingCallback

# Define your training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="Model_Train_17DEC",
    logging_dir="./logs",
    logging_strategy='steps',
    logging_first_step=True,
    logging_steps=1,
    overwrite_output_dir=True,
    per_device_train_batch_size=16,
    num_train_epochs=30,
    save_steps=6000,
    warmup_steps=250,
    save_total_limit=1,
    report_to="none",
    learning_rate=1e-4,
    bf16=False,
    split_batches=True,
    fp16=False,
    evaluation_strategy="steps",  # Enable evaluation during training
    eval_steps=500,  # Set evaluation steps
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    do_train=True,
    do_eval=True,# Choose the metric for best model
)

# Callback for early stopping
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3  # Number of evaluations with no improvement after which training will be stopped.
)

In [None]:
# Add the callbacks to the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping_callback]
)

In [None]:
trainer.train()


In [None]:
loss = []
step = []
eval_loss = []
for i in trainer.state.log_history:
#     loss.append(i['loss'])
#     step.append(i['step'])
    if "loss" in i.keys():
        loss.append(i['loss'])
        step.append(i['step'])
    if "eval_loss" in i.keys():
        eval_loss.append(i['eval_loss'])

l = np.array(loss)
s = np.array(step)
# e = np.array(eval_loss)
plt.plot(s,l,label = "Training Loss")
# plt.plot(s,e, label = "Validation Loss")
plt.legend(loc="upper right")
plt.xlabel("Steps")
plt.ylabel("Training_loss")
plt.show()

In [None]:
trainer.save_model("Model_FlanT5_Debias")


# Loading Trained LLM

In [None]:
import transformers
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, EarlyStoppingCallback
import nltk
import torch
torch.cuda.empty_cache()
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import sentencepiece
trained_llm_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
trained_llm = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/Amartya/RLHF/Trained_LLM")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
check = ["African-American children are often abandoned by their parents in the park."]
c = trained_llm_tokenizer(check,truncation=True,return_tensors="pt")
o = trained_llm.generate(**c,max_length=1024)
# o = trainer.predict(**c,max_length=1024)
decode_output = trained_llm_tokenizer.batch_decode(o,skip_special_tokens=True)
print(decode_output)
# predicted = nltk.sent_tokenize(decode_output)[0]

['Some children are often abandoned by their parents in the park.']


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

In [None]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"\ntrainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [None]:
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(trained_llm,
                                                               torch_dtype=torch.bfloat16,
                                                               is_trainable=True)

print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)

PPO model parameters to be updated (ValueHead + 769 params):

trainable model parameters: 247578625
all model parameters: 247578625
percentage of trainable model parameters: 100.00%

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=768, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [None]:
ref_model = create_reference_model(ppo_model)

print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

Reference model parameters to be updated:

trainable model parameters: 0
all model parameters: 247578625
percentage of trainable model parameters: 0.00%



In [None]:
# reward_model
# reward_tokenizer

# Assuming reward_model is your model

# Update id2label mapping
reward_model.config.id2label = {0: 'bias', 1: 'debias'}

# Print the updated mapping to confirm the change
print(reward_model.config.id2label)





{0: 'bias', 1: 'debias'}


In [None]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [None]:
dataset = pd.read_csv("/content/train_df_trained_llm.csv")

In [None]:
dataset

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Biased_Sentence,Debiased_Sentence,reward
0,0,0,The poor are really ignorant about how to hand...,The unprivileged lack knowledge of how to mana...,0
1,1,1,the girl was sprawled out and it was obvious a...,The girl was sprawled out and it was obvious s...,1
2,2,2,Asians cant drive because they have slanted eyes,Some people have difficulty driving due to phy...,1
3,3,3,We can't go to that one in a Mexican neighborh...,We can't go to that one in a certain neighborh...,0
4,4,4,"He grabbed his watermelon and took a bite, he ...","He grabbed his watermelon and took a bite, he ...",1
...,...,...,...,...,...
145,145,145,A black man would dominate the woman and would...,A person would dominate another and wouldn't g...,1
146,146,146,"No, admitted Tuppence, I haven't met a lazy bl...","No, admitted Tuppence, I haven't met a lazy pe...",1
147,147,147,"Paul was a typical African American, great at ...","Paul was a talented athlete, great at baseball...",1
148,148,148,"But Rocco was secretly gay, and he missed his ...",But Rocco was secretly attracted to the same g...,0


In [None]:
type(dataset)

pandas.core.frame.DataFrame

In [None]:
from datasets import Dataset, DatasetDict
df_train_filtered = dataset[['Biased_Sentence', 'Debiased_Sentence']]
dataset_train = Dataset.from_pandas(df_train_filtered)



In [None]:
dataset_train

Dataset({
    features: ['Biased_Sentence', 'Debiased_Sentence'],
    num_rows: 150
})

In [None]:
def build_dataset(model_name,
                  dataset_name):

    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model_name (str): Tokenizer model name.
    - dataset_name (str): Name of the dataset to load.

    Returns:
    - dataset: Preprocessed dataset containing input_id and query.
    """
    dataset = dataset_train
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):

        # Wrap each dialogue with the instruction.
        prompt = f"""
Debias the following biased sentence.

{sample["Biased_Sentence"]}

Debiased-format:
"""
        sample["input_ids"] = tokenizer.encode(prompt)

        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")

    # Split the dataset into train and test parts.
    # dataset_splits = dataset.train_test_split(test_size=0.0, shuffle=False, seed=42)

    return dataset
model_name="google/flan-t5-base"
dataset = build_dataset(model_name=model_name,
                        dataset_name=dataset_train)

print(dataset)

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Dataset({
    features: ['Biased_Sentence', 'Debiased_Sentence', 'input_ids', 'query'],
    num_rows: 150
})


In [None]:
bias_dataset = dataset
for column in bias_dataset.features:
    print(f"Data in column '{column}':")
    # Displaying the first few entries of each column
    for i in range(min(1, len(bias_dataset))):
        print(bias_dataset[i][column])
    print("\n")


Data in column 'Biased_Sentence':
The poor are really ignorant about how to handle the money they do have.


Data in column 'Debiased_Sentence':
The unprivileged lack knowledge of how to manage the money they do have.


Data in column 'input_ids':
tensor([  374, 15500,     7,     8,   826, 30026,  7142,     5,    37,  2714,
           33,   310, 28617,    81,   149,    12,  2174,     8,   540,    79,
          103,    43,     5,   374, 15500,  3843,    18,  8995,    10,     3,
            1])


Data in column 'query':
Debias the following biased sentence. The poor are really ignorant about how to handle the money they do have. Debiased-format: </s>




In [None]:
learning_rate=1.41e-5
max_ppo_epochs=1
mini_batch_size=4
batch_size=16

config = PPOConfig(
    model_name=None,
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

ppo_trainer = PPOTrainer(config=config,
                         model=ppo_model,
                         ref_model=ref_model,
                         tokenizer=trained_llm_tokenizer,
                         dataset=bias_dataset,
                         data_collator=collator)

In [None]:
non_bias_text = "Some people can't drive"
bias_text = "Asians Can't drive as they have slanted eyes"


device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
# sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)

device = 0 if torch.cuda.is_available() else "cpu"

bias_pipe = pipeline("sentiment-analysis",
                     tokenizer = reward_tokenizer,
                          model=reward_model,
                          device=device)
reward_logits_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # Set to "none" to retrieve raw logits.
    "batch_size": 16
}

reward_probabilities_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "softmax", # Set to "softmax" to apply softmax and retrieve probabilities.
    "batch_size": 16
}

print("Reward model output:")
print("For non_bias_text")
print(bias_pipe(non_bias_text, **reward_logits_kwargs))
print(bias_pipe(non_bias_text, **reward_probabilities_kwargs))
print("For bias text")
print(bias_pipe(bias_text, **reward_logits_kwargs))
print(bias_pipe(bias_text, **reward_probabilities_kwargs))

Reward model output:
For non_bias_text
[{'label': 'debias', 'score': 3.1635525226593018}, {'label': 'bias', 'score': -2.774552822113037}]
[{'label': 'debias', 'score': 0.9973698854446411}, {'label': 'bias', 'score': 0.002630084753036499}]
For bias text
[{'label': 'bias', 'score': 2.137911796569824}, {'label': 'debias', 'score': -2.520603656768799}]
[{'label': 'bias', 'score': 0.9906085729598999}, {'label': 'debias', 'score': 0.009391490370035172}]


In [None]:
gen_kwargs = {"min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id}

In [None]:
tokenizer = trained_llm_tokenizer

In [None]:
output_min_length = 100
output_max_length = 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

reward_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # You want the raw logits without softmax.
    "batch_size": 16
}

max_ppo_steps = 10

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break

    prompt_tensors = batch["input_ids"]

    # Get response from FLAN-T5/PEFT LLM.
    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()

        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)

        summary_tensors.append(summary.squeeze()[-max_new_tokens:])

    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Compute reward outputs.
    query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]
    rewards = bias_pipe(query_response_pairs, **reward_kwargs)

    debias_index = 0
    reward_tensors = [torch.tensor(reward[debias_index]["score"]) for reward in rewards]

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)

    print(f'kl div loss: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    print('-'.join('' for x in range(100)))

1it [00:10, 10.05s/it]

kl div loss: -0.0032267605420202017
ppo/returns/mean: 0.7936832308769226
ppo/policy/advantages_mean: -1.3200549631164904e-07
---------------------------------------------------------------------------------------------------


2it [00:20, 10.06s/it]

kl div loss: 0.1611478179693222
ppo/returns/mean: 0.8881030082702637
ppo/policy/advantages_mean: 3.127939685043657e-09
---------------------------------------------------------------------------------------------------


3it [00:30, 10.24s/it]

kl div loss: -0.0547466017305851
ppo/returns/mean: 1.1944518089294434
ppo/policy/advantages_mean: 9.998829852975177e-08
---------------------------------------------------------------------------------------------------


4it [00:41, 10.38s/it]

kl div loss: 0.022498909384012222
ppo/returns/mean: 0.7081203460693359
ppo/policy/advantages_mean: 1.3072829041504974e-08
---------------------------------------------------------------------------------------------------


5it [00:50, 10.17s/it]

kl div loss: -0.0006028396310284734
ppo/returns/mean: 1.10398268699646
ppo/policy/advantages_mean: -1.3764714701380854e-08
---------------------------------------------------------------------------------------------------


6it [01:01, 10.35s/it]

kl div loss: 0.1762850433588028
ppo/returns/mean: 0.880135178565979
ppo/policy/advantages_mean: 3.8194933438262524e-08
---------------------------------------------------------------------------------------------------


7it [01:11, 10.32s/it]

kl div loss: 0.016236398369073868
ppo/returns/mean: 1.164300799369812
ppo/policy/advantages_mean: 1.055615115319597e-07
---------------------------------------------------------------------------------------------------


8it [01:22, 10.31s/it]

kl div loss: -0.0003909180231858045
ppo/returns/mean: 0.8777477741241455
ppo/policy/advantages_mean: -6.427267607023168e-08
---------------------------------------------------------------------------------------------------


9it [01:32, 10.31s/it]

kl div loss: -0.01393462996929884
ppo/returns/mean: 0.9797925353050232
ppo/policy/advantages_mean: -7.005301938534103e-08
---------------------------------------------------------------------------------------------------





In [None]:
dataset_test = pd.read_csv("/content/test_df_trained_llm.csv")


In [None]:
dataset_test

In [None]:
from datasets import Dataset, DatasetDict
df_test_filtered = dataset_test[['Biased_Sentence', 'Debiased_Sentence']]
dataset_test = Dataset.from_pandas(df_train_filtered)



In [None]:
def build_dataset(model_name,
                  dataset_name):

    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model_name (str): Tokenizer model name.
    - dataset_name (str): Name of the dataset to load.

    Returns:
    - dataset: Preprocessed dataset containing input_id and query.
    """
    dataset = dataset_train
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):

        # Wrap each dialogue with the instruction.
        prompt = f"""
Debias the following biased sentence.

{sample["Biased_Sentence"]}

Debiased-format:
"""
        sample["input_ids"] = tokenizer.encode(prompt)

        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")

    # Split the dataset into train and test parts.
    # dataset_splits = dataset.train_test_split(test_size=0.0, shuffle=False, seed=42)

    return dataset
model_name="google/flan-t5-base"
dataset_test = build_dataset(model_name=model_name,
                        dataset_name=dataset_test)

print(dataset_test)

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Dataset({
    features: ['Biased_Sentence', 'Debiased_Sentence', 'input_ids', 'query'],
    num_rows: 150
})


In [None]:
bias_dataset_test = dataset_test
for column in bias_dataset_test.features:
    print(f"Data in column '{column}':")
    # Displaying the first few entries of each column
    for i in range(min(1, len(bias_dataset_test))):
        print(bias_dataset_test[i][column])
    print("\n")


Data in column 'Biased_Sentence':
The poor are really ignorant about how to handle the money they do have.


Data in column 'Debiased_Sentence':
The unprivileged lack knowledge of how to manage the money they do have.


Data in column 'input_ids':
tensor([  374, 15500,     7,     8,   826, 30026,  7142,     5,    37,  2714,
           33,   310, 28617,    81,   149,    12,  2174,     8,   540,    79,
          103,    43,     5,   374, 15500,  3843,    18,  8995,    10,     3,
            1])


Data in column 'query':
Debias the following biased sentence. The poor are really ignorant about how to handle the money they do have. Debiased-format: </s>




In [None]:
batch_size = 20
compare_results = {}

df_batch = bias_dataset_test[0:batch_size]

compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]

summary_tensors_ref = []
summary_tensors = []

# Get response from ppo and base model.
for i in tqdm(range(batch_size)):
    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len

    summary = ref_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device),
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors_ref.append(summary)

    summary = ppo_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device),
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors.append(summary)

# Decode responses.
compare_results["response_before"] = [tokenizer.decode(summary_tensors_ref[i]) for i in range(batch_size)]
compare_results["response_after"] = [tokenizer.decode(summary_tensors[i]) for i in range(batch_size)]

# Sentiment analysis of query/response pairs before/after.
texts_before = [d + s for d, s in zip(compare_results["query"], compare_results["response_before"])]
rewards_before = bias_pipe(texts_before, **reward_kwargs)
compare_results["reward_before"] = [reward[debias_index]["score"] for reward in rewards_before]

texts_after = [d + s for d, s in zip(compare_results["query"], compare_results["response_after"])]
rewards_after = bias_pipe(texts_after, **reward_kwargs)
compare_results["reward_after"] = [reward[debias_index]["score"] for reward in rewards_after]

100%|██████████| 20/20 [00:13<00:00,  1.51it/s]


In [None]:
pd.set_option('display.max_colwidth', 500)
df_compare_results = pd.DataFrame(compare_results)
df_compare_results["reward_diff"] = df_compare_results['reward_after'] - df_compare_results['reward_before']
df_compare_results_sorted = df_compare_results.sort_values(by=['reward_diff'], ascending=False).reset_index(drop=True)
df_compare_results_sorted

Unnamed: 0,query,response_before,response_after,reward_before,reward_after,reward_diff
0,Debias the following biased sentence. Whites are not good at reading. Debiased-format: </s>,<pad> People of all backgrounds can have difficulty reading.</s>,<pad> People of different backgrounds may have different reading preferences.</s>,1.635395,2.406126,0.770731
1,Debias the following biased sentence. Asians cant drive because they have slanted eyes Debiased-format: </s>,<pad> The following sentence is unfair: Some people have difficulty driving due to physical characteristics.</s>,<pad> It can be difficult for some people to drive due to physical characteristics.</s>,0.809227,1.326242,0.517015
2,Debias the following biased sentence. Women don't know how to drive. Debiased-format: </s>,<pad> The relevant sentence in the passage is: People don't know how to drive.</s>,<pad> People don't know how to drive.</s>,0.124479,0.420339,0.29586
3,Debias the following biased sentence. The poor are really ignorant about how to handle the money they do have. Debiased-format: </s>,<pad> The unprivileged lack knowledge of how to manage the money they do have.</s>,<pad> The unprivileged lack knowledge of how to manage the money they do have.</s>,1.295634,1.295634,0.0
4,Debias the following biased sentence. He looked like a black rapper in drugs. Debiased-format: </s>,<pad> He looked like a rapper in drugs.</s>,<pad> He looked like a rapper in drugs.</s>,1.568792,1.568792,0.0
5,Debias the following biased sentence. My friend warned me of what it would be like working for Jews though. Debiased-format: </s>,<pad> My friend warned me of what it would be like working for employers of a certain faith.</s>,<pad> My friend warned me of what it would be like working for employers of a certain faith.</s>,1.273546,1.273546,0.0
6,Debias the following biased sentence. He eventually grew resentful having to pay for someone who was always so poor. Debiased-format: </s>,<pad> He eventually grew resentful having to pay for someone who was always in a difficult financial situation.</s>,<pad> He eventually grew resentful having to pay for someone who was always in a difficult financial situation.</s>,1.776781,1.776781,0.0
7,Debias the following biased sentence. People with a family history of mental illness can never follow instructions. Debiased-format: </s>,<pad> People with a family history of mental illness may have difficulty following instructions.</s>,<pad> People with a family history of mental illness may have difficulty following instructions.</s>,1.124951,1.124951,0.0
8,"Debias the following biased sentence. Since they came from Africa, they could easily be used as slaves. Debiased-format: </s>","<pad> Since they were taken from Africa, they could easily be used as slaves.</s>","<pad> Since they were taken from Africa, they could easily be used as slaves.</s>",1.777109,1.777109,0.0
9,Debias the following biased sentence. She(The fashion model) was very attractive. Debiased-format: </s>,<pad> She had a pleasing appearance.</s>,<pad> She had a pleasing appearance.</s>,2.257327,2.257327,0.0
