In [1]:
import numpy as np
import pandas as pd
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training, PromptEmbedding, PromptTuningConfig
import torch
# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

# I. Samsum dialogue dataset

In [2]:
train_data = pd.read_csv('samsum-train.csv')
validation_data = pd.read_csv('samsum-validation.csv')
test_data = pd.read_csv('samsum-test.csv')

In [3]:
print('Train shape:', train_data.shape)
print('Validation shape:', validation_data.shape)
print('Test shape:', test_data.shape)

Train shape: (14732, 3)
Validation shape: (818, 3)
Test shape: (819, 3)


In [5]:
train_data.describe()

Unnamed: 0,id,dialogue,summary
count,14732,14731,14732
unique,14732,14264,14730
top,13818513,"Jimmy: Hey, guess what? My car's completely ka...",Seth's pet Oreo that he got when he was 10 is ...
freq,1,4,2


In [6]:
validation_data.describe()

Unnamed: 0,id,dialogue,summary
count,818,818,818
unique,818,818,818
top,13817023,"A: Hi Tom, are you busy tomorrow’s afternoon?\...",A will go to the animal shelter tomorrow to ge...
freq,1,1,1


In [7]:
test_data.describe()

Unnamed: 0,id,dialogue,summary
count,819,819,819
unique,819,819,819
top,13862856,"Hannah: Hey, do you have Betty's number?\nAman...",Hannah needs Betty's number but Amanda doesn't...
freq,1,1,1


In [8]:
print('-'*50)
print(f'Dialogue')
print('-'*50)
print(train_data.iloc[0]['dialogue'])
print('-'*50)
print('Summary')
print('-'*50)
print(train_data.iloc[0]['summary'])

--------------------------------------------------
Dialogue
--------------------------------------------------
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)
--------------------------------------------------
Summary
--------------------------------------------------
Amanda baked cookies and will bring Jerry some tomorrow.


# II. Load model

In [4]:
model_name = 'google/flan-t5-small'
reference_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [5]:
# tokenizer to encode and decode text
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [13]:
# test the tokenizer on sample text
sample_text = 'Hello my name is Blavo'
encoded_text = tokenizer(sample_text, return_tensors='tf')['input_ids'][0]
decoded_text = tokenizer.decode(
        encoded_text, 
        skip_special_tokens=True
    )
print('Original text:', sample_text)
print('Encoded text:', encoded_text)
print('Decoded text:', decoded_text)

Original text: Hello my name is Blavo
Encoded text: tf.Tensor([8774   82  564   19 6942 1621    1], shape=(7,), dtype=int32)
Decoded text: Hello my name is Blavo


# III. Some text sumarization to see how the base original model is performing

In [14]:
indices = [1, 10]
for i, index in enumerate(indices):
    dialogue = test_data.iloc[index]['dialogue']
    summary = test_data.iloc[index]['summary']
    inputs = tokenizer(dialogue, return_tensors='pt')['input_ids']
    output = tokenizer.decode(
        reference_model.generate(
            inputs, 
            max_new_tokens=50,
        )[0], 
        skip_special_tokens=True
    )
    print('-'*50)
    print(f'Example {i+1}')
    print('-'*50)
    print(dialogue)
    print('Human summary:', summary)
    print('Model summary:', output)

--------------------------------------------------
Example 1
--------------------------------------------------
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)
Human summary: Eric and Rob are going to watch a stand-up on youtube.
Model summary: Rob will watch some stand-ups on YouTube.
--------------------------------------------------
Example 2
--------------------------------------------------
Wanda: Let's make a party!
Gina: Why?
Wanda: beacuse. I want some fun!
Gina: ok, what do u need?
Wanda: 1st I need too make a list
Gina: noted and then?
Wanda: well, could u take yours father car and go d

# IV. In-context learning

In [15]:
def make_prompt(example_indices, index_to_summarize):
    prompt = ''
    for index in example_indices:
        dialogue = test_data.iloc[index]['dialogue']
        summary = test_data.iloc[index]['summary']
        prompt += f"""
        Dialogue:
        {dialogue}
        what is going on?
        {summary}
        """
    dialogue = test_data.iloc[index_to_summarize]['dialogue']
    prompt += f"""
    Dialogue:
    {dialogue}
    What is going on?
    """ 
    return prompt

## 1. Zero shot inference with prompt template

In [16]:
def in_context_learning(indices, index):
    zero_shot_prompt = make_prompt(indices, index)
    inputs = tokenizer(zero_shot_prompt, return_tensors='pt')['input_ids']
    human_summary = test_data.iloc[index]['summary']
    output = tokenizer.decode(
        reference_model.generate(
            inputs, 
            max_new_tokens=50,
        )[0], 
        skip_special_tokens=True
    )
    print('Human sumary:', human_summary)
    print('Model summary:', output)

In [17]:
in_context_learning([], 10)

Human sumary: Wanda wants to throw a party. She asks Gina to borrow her father's car and go do groceries together. They set the date for Friday. 
Model summary: Wanda and Gina will make a party on Friday.


## 2. One shot inference with template prompt

In [18]:
in_context_learning([3], 10)

Human sumary: Wanda wants to throw a party. She asks Gina to borrow her father's car and go do groceries together. They set the date for Friday. 
Model summary: Wanda and Gina will make a party on Friday.


## 3. Few shot inference with template prompt

In [19]:
in_context_learning([6, 1], 10)

Token indices sequence length is longer than the specified maximum sequence length for this model (730 > 512). Running this sequence through the model will result in indexing errors


Human sumary: Wanda wants to throw a party. She asks Gina to borrow her father's car and go do groceries together. They set the date for Friday. 
Model summary: Wanda and Gina will make a party on Friday.


# V. Fine-tuning of the FLAN-T5 model on the samsum dataset 

In [6]:
# evaluate predictions
rouge = evaluate.load("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Display the total number of parameters and the number of trainable parameters
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

# tokenize the dataset
def tokenize_dataset(data):
    start_prompt = "Summarize the following conversation. \n\n"
    end_prompt = "\n\nSummary: "
    inputs = [start_prompt + str(dialogue) + end_prompt for dialogue in data["dialogue"]]
    data['input_ids'] = tokenizer(inputs, padding="max_length", truncation=True).input_ids

    labels = tokenizer(text_target=data["summary"], padding="max_length", truncation=True)

    data["labels"] = labels["input_ids"]
    return data

In [21]:
print('-'*50)
print('Reference model')
print('-'*50)
print(print_number_of_trainable_model_parameters(reference_model))

--------------------------------------------------
Reference model
--------------------------------------------------
trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


## Preprocess dataset

In [7]:
# convert to huggingface Dataset
train_data = Dataset.from_pandas(train_data)
validation_data = Dataset.from_pandas(validation_data)
test_data = Dataset.from_pandas(test_data)

In [8]:
# tokenize the dataset
tokenized_train_data = train_data.map(tokenize_dataset, batched=True)
tokenized_validation_data = validation_data.map(tokenize_dataset, batched=True)
tokenized_test_data = test_data.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [9]:
tokenized_train_data = tokenized_train_data.remove_columns(['id', 'dialogue', 'summary'])
tokenized_validation_data = tokenized_validation_data.remove_columns(['id', 'dialogue', 'summary'])
tokenized_test_data = tokenized_test_data.remove_columns(['id', 'dialogue', 'summary'])

In [10]:
tokenized_train_data.set_format(type='torch')
tokenized_validation_data.set_format(type='torch')
tokenized_test_data.set_format(type='torch')

## 1. Fine-tune by unfreezing some layers of the reference model

In [31]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

In [18]:
selective_model = reference_model

# freeze all weights
for param in selective_model.parameters():
    param.requires_grad=False

In [19]:
# Unfreeze the weights of the last decoder layer
for param in selective_model.decoder.block[-1].layer[-1].layer_norm.parameters():
    param.requires_grad = True

In [30]:
print('-'*50)
print('Selective model')
print('-'*50)
print(print_number_of_trainable_model_parameters(selective_model))

--------------------------------------------------
Selective model
--------------------------------------------------
trainable model parameters: 512
all model parameters: 76961152
percentage of trainable model parameters: 0.00%


In [31]:

training_args = Seq2SeqTrainingArguments(
    output_dir="dialogue-summary-training-selective-fine-tuning",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
)

selective_trainer = Seq2SeqTrainer(
    model=selective_model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_validation_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [32]:
selective_trainer.train()

  0%|          | 0/1 [00:00<?, ?it/s]

{'loss': 62.0058, 'learning_rate': 0.0, 'epoch': 0.0}
{'train_runtime': 176.4862, 'train_samples_per_second': 0.181, 'train_steps_per_second': 0.006, 'train_loss': 62.005760192871094, 'epoch': 0.0}


TrainOutput(global_step=1, training_loss=62.005760192871094, metrics={'train_runtime': 176.4862, 'train_samples_per_second': 0.181, 'train_steps_per_second': 0.006, 'train_loss': 62.005760192871094, 'epoch': 0.0})

In [23]:
selective_model_path="./selective-dialogue-summary-checkpoint-local"

selective_trainer.model.save_pretrained(selective_model_path)
tokenizer.save_pretrained(selective_model_path)

('./selective-dialogue-summary-checkpoint-local\\tokenizer_config.json',
 './selective-dialogue-summary-checkpoint-local\\special_tokens_map.json',
 './selective-dialogue-summary-checkpoint-local\\tokenizer.json')

## 2. PEFT

### Quantize the model

In [34]:
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [35]:
quantized_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=config)

In [36]:
# preprocess the quantized model for training
quantized_model = prepare_model_for_kbit_training(quantized_model, config)

In [37]:
print_number_of_trainable_model_parameters(quantized_model)

'trainable model parameters: 0\nall model parameters: 76961152\npercentage of trainable model parameters: 0.00%'

### 1. LoRA

In [26]:
lora_config = LoraConfig(
    r=4, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [27]:
original_model = quantized_model
peft_model = get_peft_model(original_model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 172032
all model parameters: 77133184
percentage of trainable model parameters: 0.22%


In [28]:
training_args = Seq2SeqTrainingArguments(
    output_dir="dialogue-summary-training-peft-LoRA-fine-tuning",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1,
)

peft_lora_trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_validation_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [29]:
peft_lora_trainer.train()

  0%|          | 0/1 [00:00<?, ?it/s]

{'loss': 62.1314, 'learning_rate': 0.0, 'epoch': 0.0}
{'train_runtime': 177.7116, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.006, 'train_loss': 62.13142776489258, 'epoch': 0.0}


TrainOutput(global_step=1, training_loss=62.13142776489258, metrics={'train_runtime': 177.7116, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.006, 'train_loss': 62.13142776489258, 'epoch': 0.0})

In [30]:
peft_lora_model_path="./peft-lora-dialogue-summary-checkpoint-local"

peft_lora_trainer.model.save_pretrained(peft_lora_model_path)
tokenizer.save_pretrained(peft_lora_model_path)

('./peft-lora-dialogue-summary-checkpoint-local\\tokenizer_config.json',
 './peft-lora-dialogue-summary-checkpoint-local\\special_tokens_map.json',
 './peft-lora-dialogue-summary-checkpoint-local\\tokenizer.json')

### 2. Prompt tuning

In [34]:
config = PromptTuningConfig(
    peft_type="PROMPT_TUNING",
    task_type="SEQ_2_SEQ_LM",
    num_virtual_tokens=8,
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text="Summarize the following conversation. \n\nDialogue: ",
    tokenizer_name_or_path=model_name,
)

In [36]:
peft_reference_model = quantized_model
peft_prompt_tuning = get_peft_model(peft_reference_model, config)

In [37]:
print(print_number_of_trainable_model_parameters(peft_prompt_tuning))

trainable model parameters: 8192
all model parameters: 76969344
percentage of trainable model parameters: 0.01%


In [38]:
training_args = Seq2SeqTrainingArguments(
    output_dir="dialogue-summary-training-peft-prompt-tuning",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1,
)

peft_prompt_tuning_trainer = Seq2SeqTrainer(
    model=peft_prompt_tuning,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_validation_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [41]:
peft_prompt_tuning_trainer.train()

  0%|          | 0/1 [00:00<?, ?it/s]

{'loss': 61.8854, 'learning_rate': 0.0, 'epoch': 0.0}
{'train_runtime': 147.6886, 'train_samples_per_second': 0.217, 'train_steps_per_second': 0.007, 'train_loss': 61.88544845581055, 'epoch': 0.0}


TrainOutput(global_step=1, training_loss=61.88544845581055, metrics={'train_runtime': 147.6886, 'train_samples_per_second': 0.217, 'train_steps_per_second': 0.007, 'train_loss': 61.88544845581055, 'epoch': 0.0})

In [42]:
peft_prompt_tuning_model_path="./peft-prompt_tuning-dialogue-summary-checkpoint-local"

peft_prompt_tuning_trainer.model.save_pretrained(peft_prompt_tuning_model_path)
tokenizer.save_pretrained(peft_prompt_tuning_model_path)

('./peft-prompt_tuning-dialogue-summary-checkpoint-local\\tokenizer_config.json',
 './peft-prompt_tuning-dialogue-summary-checkpoint-local\\special_tokens_map.json',
 './peft-prompt_tuning-dialogue-summary-checkpoint-local\\tokenizer.json')

## 3. RLHF (Reinforcement learning with Human Feedback)

In [34]:
# optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
# reference_model.compile(optimizer=optimizer)  
# metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=validation_dataset)
# callbacks = [metric_callback]
# # create a tensorflow dataset
# train_dataset = tf.data.Dataset.from_tensor_slices((tokenized_train_data['input_ids'], tokenized_train_data['labels']))
# validation_dataset = tf.data.Dataset.from_tensor_slices((tokenized_validation_data['input_ids'], tokenized_validation_data['labels']))

# train_dataset = train_dataset.batch(32).shuffle(buffer_size=1000).prefetch(tf.data.AUTOTUNE)
# validation_dataset = validation_dataset.batch(32).prefetch(tf.data.AUTOTUNE)