# Install

In [1]:
# %pip install torchdata
# %pip install datasets
# %pip install evaluate
# %pip install rouge_score
# %pip install loralib
# %pip install peft

In [12]:
# import os
# os.environ(TOKENIZERS_PARALLELISM=False)

# Imports

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer

import torch
import pandas as pd
import numpy as np
import time
import evaluate

# Summarize Dialogue data without prompt engineering

Here we will be generating a summary of a dialogue with LLM (FLAN-T5) from HuggingFace.
Here we gonna use DialogSum dataset from HuggingFace dataset.

## Load dataset

In [2]:
dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(dataset_name)

Print some dialogue with their base line summaries

In [3]:
exa_indices = [40, 200]
dash_line = '-'.join('' for x in range(100))

for i, index in enumerate(exa_indices):
    print(dash_line)
    print("Example: ", i + 1)
    print(dash_line)
    print("INPUT_DIALOGUE: ")
    print(dataset['test'][index]['dialogue'])
    print(dash_line)
    print("BASELINE HUMAN SUMMARY: ")
    print(dataset['test'][index]['summary'])

---------------------------------------------------------------------------------------------------
Example:  1
---------------------------------------------------------------------------------------------------
INPUT_DIALOGUE: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY: 
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
Example:  2
-----------------------------------------------------------------------------------------

## Load LLM

In [4]:
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# testing tokenizer
sentence = " I have always liked you!"
sentence_encoded = tokenizer(sentence, return_tensors='pt')
sentence_decoded = tokenizer.decode(sentence_encoded['input_ids'][0], skip_special_tokens=True)
print("ENCODED SENTENCE")
print(sentence_encoded['input_ids'][0])
print("\nSENTENCE DECODED")
print(sentence_decoded)

ENCODED SENTENCE
tensor([  27,   43,  373, 6528,   25,   55,    1])

SENTENCE DECODED
I have always liked you!


## Util Functions

In [6]:
def print_models_trainable_parameters(model):
    trainable_param = 0
    all_model_param = 0
    for _, param in model.named_parameters():
        all_model_param += param.numel()
        if param.requires_grad:
            trainable_param += param.numel()
    return f"trainable model parameters: {trainable_param}\nall model parameters: {all_model_param}\npercentage of trainable mdoel parameter: {100*trainable_param/all_model_param:.2f}%"
print(print_models_trainable_parameters(model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable mdoel parameter: 100.00%


From this we can see that here we have to train all the parameters. This is a full fine-tuning.<br>
Here we gonna do full fine-tuning and PEFT with LORa.<br>
Before moving into Full Fine-Tuning lets do a zero short first to test our FLAN-T5 performance before doing any
sort of fine-tuning.

## Zero Short In-Context Learning

In [7]:
index = 200
dialogue = dataset["test"][index]["dialogue"]
summary = dataset["test"][index]["summary"]

prompt = f"""
Summarize the following conversation.
{dialogue}

Summary: 
"""

In [8]:
inputs = tokenizer(prompt, return_tensors="pt")
outputs = tokenizer.decode(
    model.generate(inputs["input_ids"], max_new_tokens=200,)[0],
    skip_special_tokens=True
)

In [9]:
dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{outputs}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary: 

-------------------------------------------------------------------

As you can see its not able to give satisfied results.

## Data Preprocessing

Here we need to do add a instruction to the start of the dialogue for LLM to understand.
Example

Summarize the following conversation.

    Chris: This is his part of the conversation.
    Antje: This is her part of the conversation.

Summary:
Both Chris and Antje participated in the conversation.

In [10]:
def tokenize_fun(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example["input_ids"] = tokenizer(prompt, padding= "max_length",truncation=True,return_tensors='pt').input_ids
    example["labels"] = tokenizer(example["summary"], padding='max_length', truncation=True, return_tensors="pt").input_ids
    return example


tokenized_dataset = dataset.map(tokenize_fun, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["id", "topic", "dialogue", "summary"])

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [14]:
print("Shape of the dataset")
print(f"Training: {tokenized_dataset['train'].shape}")
print(f"Validation: {tokenized_dataset['validation'].shape}")
print(f"Test: {tokenized_dataset['test'].shape}")

Shape of the dataset
Training: (12460, 2)
Validation: (500, 2)
Test: (1500, 2)


In [15]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})


## PEFT ith LoRA

In [6]:
from peft import LoraConfig, get_peft_model, TaskType

In [7]:
lora_config = LoraConfig(
    r=32, # rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias = "none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [11]:
peft_model = get_peft_model(model,lora_config)
print(print_models_trainable_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable mdoel parameter: 1.41%


#### Train PEFT Adapter

In [26]:
output_dir = f"./peft-dialogue-summary-training-{str(int(time.time()))}"

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-5, # It's higher learning rate than full fine-tuning.
    num_train_epochs=5,
    logging_steps=1,
    max_steps=1
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_dataset["train"]
)

max_steps is given, it will override any value given in num_train_epochs


In [27]:
# Clear GPU cache
torch.cuda.empty_cache()
peft_trainer.train()

Step,Training Loss
1,34.7295


TrainOutput(global_step=1, training_loss=34.729515075683594, metrics={'train_runtime': 4.0867, 'train_samples_per_second': 0.979, 'train_steps_per_second': 0.245, 'total_flos': 2782515953664.0, 'train_loss': 34.729515075683594, 'epoch': 0.00032102728731942215})

In [28]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = input_ids.to(device)
peft_model = peft_model.to(device)
peft_model_outputs = peft_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1)
)
# instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
# instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

# peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
# print(dash_line)
# print(f'ORIGINAL MODEL:\n{original_model_text_output}')
# print(dash_line)
# print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
PEFT MODEL: #Person1: I'm not sure what you're looking for. #Person2: I'm not sure what exactly I'm looking for. #Person1: I'm not sure what you're talking about. #Person2: I'm not sure what exactly you're talking about. #Person1: I'm not sure what you're talking about. #Person2: I'm not sure what you're talking about. #Person1: I'm not sure what you're talking about. #Person2: I'm not sure what you're talking about. #Person1: I'm not sure what you're talking about. #Person1: I'm not sure what you're talking about. #Person2: I'm not sure what you're talking about.


## Full Fine-Tuning Process

In [13]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset["validation"]
)

max_steps is given, it will override any value given in num_train_epochs


In [14]:
# Clear GPU cache
torch.cuda.empty_cache()

In [29]:
# torch.cuda.memory_summary()




In [None]:
trainer.train()