## Fine-Tune T5 Model for Dialogue Summarization

In [1]:
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, GenerationConfig, TrainingArguments , Trainer




## 1 - Load Dataset and LLM

In [2]:
# Hugging Face Hub identifier of the dialogue-summarization dataset used in this notebook.
huggingface_dataset_name = "knkarthick/dialogsum"

# No split is requested, so all splits are loaded; later cells index dataset['test'].
dataset = load_dataset(huggingface_dataset_name)

In [3]:
# Tokenizer of the FLAN-T5 base checkpoint; every generation cell below encodes and decodes with it.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Baseline FLAN-T5 base model, loaded with bfloat16 weights; used as the reference in the comparisons below.
original_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)

In [5]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Despite its name this function prints nothing; it returns the summary
    and leaves printing to the caller.

    Args:
        model: Any object whose ``named_parameters()`` yields
            ``(name, parameter)`` pairs, where each parameter exposes
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share as a percentage with two
        decimals. A model without any parameters reports ``0.00%``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # Guard the ratio: a parameter-free model would otherwise raise ZeroDivisionError.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {trainable_percentage:.2f}%"
    )

# The helper only builds the summary string, so it is printed explicitly here.
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


## 2 - Test the Model with Zero-Shot Inference

In [6]:
# Pick one test example and keep its human-written summary for comparison.
example_index = 0
dialogue = dataset['test']['dialogue'][example_index]
summary = dataset['test']['summary'][example_index]

# Instruction wrapper around the dialogue, written with explicit newlines.
prompt = f"\nSummarize the following conversation.\n\n{dialogue}\n\nSummary:\n"

inputs = tokenizer(prompt, return_tensors='pt')['input_ids']

# Cap the generated summary at 50 new tokens.
generation_config = GenerationConfig(max_new_tokens=50)

# generate() returns a batch of sequences; this batch has exactly one entry.
generated_sequences = original_model.generate(inputs, generation_config=generation_config)
response = generated_sequences[0]

output = tokenizer.decode(response, skip_special_tokens=True)

separator = '------------------------------------------------'
print(
    f'INPUT PROMPT:\n{dialogue}',
    separator,
    f'BASELINE HUMAN SUMMARY:\n{summary}\n',
    separator,
    f'MODEL GENERATION - ZERO SHOT:\n{output}',
    sep='\n',
)

INPUT PROMPT:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue wit

The model struggles to summarize the dialogue compared to the baseline human summary.

## 3 - Perform Full Fine-Tuning

#### 3.1 - Preprocess the Dialog-Summary Dataset

Convert the dialog-summary (prompt-response) pairs into explicit instructions for the LLM.

In [7]:
# Display the loaded splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [8]:
def tokenize_function(example):
    """Turn a batch of dialogue/summary pairs into model inputs and labels.

    Intended for ``Dataset.map(..., batched=True)``: ``example["dialogue"]``
    and ``example["summary"]`` are read as lists of strings. Uses the
    notebook-level ``tokenizer``.

    Args:
        example: Batch mapping with at least the ``"dialogue"`` and
            ``"summary"`` columns.

    Returns:
        The same mapping with two added entries:
        ``"input_ids"`` - token ids of each dialogue wrapped in the
        summarization instruction, padded/truncated to the tokenizer's
        maximum length;
        ``"labels"`` - token ids of each summary, padded/truncated the same
        way, with padding positions set to ``-100``.
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    # NOTE(review): only input_ids are stored, so no attention mask accompanies
    # the padded inputs - consider adding one if this dataset is used for training.
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids

    labels = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    # Padding must not contribute to the training loss: -100 is the label value
    # the Transformers loss ignores, so replace every pad token id with it.
    labels[labels == tokenizer.pad_token_id] = -100
    example['labels'] = labels

    return example

# Tokenize every split in batches, then drop the raw text/metadata columns so
# only the columns produced by tokenize_function remain.
raw_columns = ['id', 'topic', 'dialogue', 'summary']
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(raw_columns)

#### 3.2 - Fine-Tune the Model with the Preprocessed Dataset

In [10]:
# Load an already fine-tuned checkpoint from the Hub (no training run is visible in this notebook).
fully_fine_tuned_model = T5ForConditionalGeneration.from_pretrained("truocpham/flan-dialogue-summary-checkpoint", torch_dtype=torch.bfloat16)

In [11]:
# Qualitative check of the fully fine-tuned model on a single test example.
dialogue = dataset['test']['dialogue'][200]
summary = dataset['test']['summary'][200]

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt').input_ids

# Greedy decoding (a single beam), at most 200 newly generated tokens.
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

# generate() returns a batch of sequences; only one prompt was passed, so take the first.
response = fully_fine_tuned_model.generate(
        inputs, 
        generation_config=generation_config
)[0]

output = tokenizer.decode(
    response, 
    skip_special_tokens=True
)

print(f'INPUT PROMPT:\n{dialogue}')
print('------------------------------------------------')
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print('------------------------------------------------')
# Label fixed: the original read "FINE-TUNEDMODEL" (missing space).
print(f'FULLY FINE-TUNED MODEL GENERATION:\n{output}')

INPUT PROMPT:
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.
------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

------------------------------------------------
FULLY F

#### 3.3 - Evaluate the Model with ROUGE Metric

In [12]:
# ROUGE metric object; its compute() is called below to score generated summaries against the human ones.
rouge = evaluate.load('rouge')

In [13]:
# First ten test dialogues and their human-written reference summaries.
dialogues = dataset['test']['dialogue'][:10]
human_baseline_summaries = dataset['test']['summary'][:10]

original_model_summaries = []
instruct_model_summaries = []

generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

# Each model is paired with the list that collects its summaries; the original
# model is queried first, then the fully fine-tuned one, for every dialogue.
models_and_outputs = (
    (original_model, original_model_summaries),
    (fully_fine_tuned_model, instruct_model_summaries),
)

for dialogue in dialogues:

    # The leading spaces inside the literals are part of the prompt text.
    prompt = (
        "\n"
        "        Summarize the following conversation.\n"
        "        \n"
        f"        {dialogue}\n"
        "        \n"
        "        Summary:\n"
        "    "
    )

    inputs = tokenizer(prompt, return_tensors='pt').input_ids

    for model, collected_summaries in models_and_outputs:
        # One prompt in, one sequence out: keep the first (only) sequence.
        generated_ids = model.generate(inputs, generation_config=generation_config)[0]
        collected_summaries.append(
            tokenizer.decode(generated_ids, skip_special_tokens=True)
        )

In [14]:
# Both models are scored against the same human references with identical options.
shared_rouge_options = dict(
    references=human_baseline_summaries,
    use_aggregator=True,
    use_stemmer=True,
)

original_model_results = rouge.compute(predictions=original_model_summaries, **shared_rouge_options)
instruct_model_results = rouge.compute(predictions=instruct_model_summaries, **shared_rouge_options)

for heading, scores in (
    ('ORIGINAL MODEL:', original_model_results),
    ('INSTRUCT MODEL:', instruct_model_results),
):
    print(heading)
    print(scores)

ORIGINAL MODEL:
{'rouge1': 0.25311151811151816, 'rouge2': 0.1118160782099401, 'rougeL': 0.22820105820105818, 'rougeLsum': 0.22311151811151814}
INSTRUCT MODEL:
{'rouge1': 0.4021097742688077, 'rouge2': 0.17532658321476857, 'rougeL': 0.2895240360670338, 'rougeLsum': 0.28700398181601194}


In [15]:
# The scores subtracted below are the original model's, so the header names the
# original model as the baseline (the human summaries are only the references).
print("Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL")

# Pair the two score dicts by metric name rather than by position, so a
# difference in key order can never subtract mismatched metrics.
improvement = np.array([
    instruct_model_results[key] - original_model_results[key]
    for key in instruct_model_results
])
for key, value in zip(instruct_model_results.keys(), improvement):
    # ROUGE scores are fractions; scale to percentage points for display.
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of INSTRUCT MODEL over HUMAN BASELINE
rouge1: 14.90%
rouge2: 6.35%
rougeL: 6.13%
rougeLsum: 6.39%


## 4 - Perform Parameter Efficient Fine-Tuning (PEFT/LoRA)

#### 4.1 - Setup the PEFT/LoRA model for Fine-Tuning

In [16]:
from peft import LoraConfig, get_peft_model, TaskType , PeftModel

In [17]:
# LoRA adapter settings for the sequence-to-sequence FLAN-T5 model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    target_modules=["q", "v"],        # module names that receive adapters
    r=32,                             # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [18]:
# Wrap the base model with the LoRA adapters described by lora_config.
# NOTE(review): PEFT documents that the model passed to get_peft_model may be
# modified in place, so original_model may carry adapter layers after this
# cell - confirm before reusing it as a baseline.
peft_model = get_peft_model(model=original_model, peft_config=lora_config)

# Report how many parameters remain trainable once the adapters are attached.
peft_parameter_summary = print_number_of_trainable_model_parameters(peft_model)
print(peft_parameter_summary)

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [20]:
# Hub identifier of the already trained LoRA adapter checkpoint.
instruct_model_name='intotheverse/peft-dialogue-summary-checkpoint'

# Load a fresh copy of the base model for the adapter. original_model was
# already passed to get_peft_model in an earlier cell (PEFT documents that the
# wrapped model may be modified in place), so reusing it here would stack the
# checkpoint on an already-adapted model and tie the baseline to the adapter.
peft_model_base = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)

# is_trainable=False: the adapter is loaded for inference only.
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       instruct_model_name, 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [21]:
# Qualitative check of the PEFT/LoRA model on a single test example.
example_index = 200
dialogue = dataset['test']['dialogue'][example_index]
summary = dataset['test']['summary'][example_index]

# Instruction wrapper around the dialogue, written with explicit newlines.
prompt = f"\nSummarize the following conversation.\n\n{dialogue}\n\nSummary:\n"

inputs = tokenizer(prompt, return_tensors='pt')['input_ids']

# Single-beam decoding, at most 200 newly generated tokens.
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

# The ids are passed by keyword to the PEFT wrapper; the batch holds one sequence.
generated_sequences = peft_model.generate(input_ids=inputs, generation_config=generation_config)
response = generated_sequences[0]

output = tokenizer.decode(response, skip_special_tokens=True)

separator = '------------------------------------------------'
print(
    f'INPUT PROMPT:\n{dialogue}',
    separator,
    f'BASELINE HUMAN SUMMARY:\n{summary}\n',
    separator,
    f'PEFT/LoRA FINE-TUNED MODEL GENERATION:\n{output}',
    sep='\n',
)

INPUT PROMPT:
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.
------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

------------------------------------------------
PEFT/Lo

In [22]:
# First ten test dialogues and their human-written reference summaries.
dialogues = dataset['test']['dialogue'][:10]
human_baseline_summaries = dataset['test']['summary'][:10]

generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

peft_instruct_model_summaries = []
for dialogue in dialogues:

    # The leading spaces inside the literals are part of the prompt text.
    prompt = (
        "\n"
        "        Summarize the following conversation.\n"
        "        \n"
        f"        {dialogue}\n"
        "        \n"
        "        Summary:\n"
        "    "
    )

    inputs = tokenizer(prompt, return_tensors='pt').input_ids

    # One prompt in, one sequence out: keep the first (only) sequence.
    peft_response = peft_model.generate(input_ids=inputs, generation_config=generation_config)[0]

    peft_instruct_model_summaries.append(
        tokenizer.decode(peft_response, skip_special_tokens=True)
    )

In [25]:
# Score the PEFT summaries against the same human references as before.
peft_instruct_model_results = rouge.compute(
    references=human_baseline_summaries,
    predictions=peft_instruct_model_summaries,
    use_stemmer=True,
    use_aggregator=True,
)

# Show the earlier original-model scores next to the PEFT scores.
for heading, scores in (
    ('ORIGINAL MODEL:', original_model_results),
    ('PEFT INSTRUCT MODEL:', peft_instruct_model_results),
):
    print(heading)
    print(scores)

ORIGINAL MODEL:
{'rouge1': 0.25311151811151816, 'rouge2': 0.1118160782099401, 'rougeL': 0.22820105820105818, 'rougeLsum': 0.22311151811151814}
PEFT INSTRUCT MODEL:
{'rouge1': 0.37185752037136655, 'rouge2': 0.12021993961310115, 'rougeL': 0.27742463494487635, 'rougeLsum': 0.2750722521667799}


In [26]:
# The scores subtracted below are the original model's, so the header names the
# original model as the baseline (the human summaries are only the references).
print("Absolute percentage improvement of PEFT INSTRUCT MODEL over ORIGINAL MODEL")

# Pair the two score dicts by metric name rather than by position, so a
# difference in key order can never subtract mismatched metrics.
improvement = np.array([
    peft_instruct_model_results[key] - original_model_results[key]
    for key in peft_instruct_model_results
])
for key, value in zip(peft_instruct_model_results.keys(), improvement):
    # ROUGE scores are fractions; scale to percentage points for display.
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT INSTRUCT MODEL over HUMAN BASELINE
rouge1: 11.87%
rouge2: 0.84%
rougeL: 4.92%
rougeLsum: 5.20%
