# Fine tuning a generative AI Model for Dialgue Summarization



%pip install --upgrade pip
%pip install --disable-pip-version-check \
   tokenizers==0.12.1 \
   torch==1.13.1+cu117 torchvision>=0.13.1+cu117 torchaudio>=0.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 --no-cache-dir \
   torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
hugginface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(hugginface_dataset_name)

dataset

Found cached dataset csv (C:/Users/david/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [3]:
model_name= 'google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params} \nall model parameters: {all_model_params}\npercentageof trainable model parameter: {((trainable_model_params/all_model_params)*100)}%"


print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856 
all model parameters: 247577856
percentageof trainable model parameter: 100.0%


In [5]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summaise the following conversation

{dialogue}

Summary:
"""

inputs = tokenizer(dialogue, return_tensors='pt')
output = tokenizer.decode(
        original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))

print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')


---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summaise the following conversation

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

---------------------------------------------------------------------

In [6]:
def tokenize_function(example):
    start_prompt = 'Summarise the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors='pt').input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors='pt').input_ids

    return example

 
# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenizer_function code is handling all data accross all split in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])


Loading cached processed dataset at C:\Users\david\.cache\huggingface\datasets\knkarthick___csv\knkarthick--dialogsum-c8fac5d84cd35861\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-6a3be39e9e86e63d.arrow


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Loading cached processed dataset at C:\Users\david\.cache\huggingface\datasets\knkarthick___csv\knkarthick--dialogsum-c8fac5d84cd35861\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b051e2995756c776.arrow


In [7]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Loading cached processed dataset at C:\Users\david\.cache\huggingface\datasets\knkarthick___csv\knkarthick--dialogsum-c8fac5d84cd35861\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-ce129ebb82cb5839.arrow


Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Loading cached processed dataset at C:\Users\david\.cache\huggingface\datasets\knkarthick___csv\knkarthick--dialogsum-c8fac5d84cd35861\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b6daf72a11c29c75.arrow


In [8]:
print(f"Shapes of the dataset:")
print(f"Training:{tokenized_datasets['train'].shape}")
print(f"Validation:{tokenized_datasets['validation'].shape}")
print(f"Test:{tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the dataset:
Training:(125, 2)
Validation:(5, 2)
Test:(15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
})


In [9]:
import torch
from tqdm import tqdm

Output directory for fine tunned LLM

In [10]:
output_dir = f"./flan-dialogue-summary-checkpoint"

Options to training on CUDA GPU

In [11]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

Verify Cuda is available

In [12]:
torch.cuda.is_available()

True



https://pytorch.org/docs/stable/generated/torch.zeros.html

In [13]:
torch.zeros(1).cuda()

tensor([0.], device='cuda:0')

In [14]:
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=100,
    per_device_train_batch_size=4,  # Set the batch size according to your GPU memory
    per_device_eval_batch_size=4,  # Set the batch size according to your GPU memory
    gradient_accumulation_steps=8,  # Accumulate gradients for larger effective batch size
    evaluation_strategy="steps",
    eval_steps=100,  # Evaluate every 100 steps
    save_strategy="steps",
    save_steps=100,  # Save checkpoint every 100 steps
    report_to="none",  # Disable logging
    disable_tqdm=True,  # Disable tqdm progress bar
    fp16=False,  # Enable mixed-precision training
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)


Train the LLM

In [15]:
#progress_bar = tqdm(total=training_args.max_steps, desc="Training", dynamic_ncols=True, no_deprecation_warning=True )
trainer.train(resume_from_checkpoint=None)
#progress_bar.close()

{'loss': 49.5312, 'learning_rate': 9.9e-06, 'epoch': 0.25}
{'loss': 50.375, 'learning_rate': 9.800000000000001e-06, 'epoch': 0.5}
{'loss': 49.5625, 'learning_rate': 9.7e-06, 'epoch': 0.75}
{'loss': 48.7188, 'learning_rate': 9.600000000000001e-06, 'epoch': 1.0}
{'loss': 50.0, 'learning_rate': 9.5e-06, 'epoch': 1.25}
{'loss': 49.0625, 'learning_rate': 9.4e-06, 'epoch': 1.5}
{'loss': 48.9688, 'learning_rate': 9.3e-06, 'epoch': 1.75}
{'loss': 48.8438, 'learning_rate': 9.200000000000002e-06, 'epoch': 2.0}
{'loss': 49.375, 'learning_rate': 9.100000000000001e-06, 'epoch': 2.25}
{'loss': 48.25, 'learning_rate': 9e-06, 'epoch': 2.5}
{'loss': 48.5938, 'learning_rate': 8.900000000000001e-06, 'epoch': 2.75}
{'loss': 49.1875, 'learning_rate': 8.8e-06, 'epoch': 3.0}
{'loss': 49.2188, 'learning_rate': 8.700000000000001e-06, 'epoch': 3.25}
{'loss': 48.2188, 'learning_rate': 8.6e-06, 'epoch': 3.5}
{'loss': 48.1875, 'learning_rate': 8.5e-06, 'epoch': 3.75}
{'loss': 48.8125, 'learning_rate': 8.4000000000

TrainOutput(global_step=100, training_loss=47.5525, metrics={'train_runtime': 332.3613, 'train_samples_per_second': 9.628, 'train_steps_per_second': 0.301, 'train_loss': 47.5525, 'epoch': 25.0})

Save the trained Model

In [16]:

trainer.save_model(output_dir)

Load the saved model

In [19]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./flan-dialogue-summary-checkpoint", torch_dtype=torch.bfloat16)
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

In [20]:
index = 200

dialogues = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summaise the following conversation

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Remove # if you have the ability to run both models
original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, do_sample=True, tempature=0.1, num_beams=1))
original_model_text_outputs = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, do_sample=True, tempature=0.1, num_beams=1))
instruct_model_text_outputs = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)


print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')

# Remove # if you have the ability to run both models
print(dash_line)
print(f'Original Model:\n{original_model_text_outputs}\n')

print(dash_line)
print(f'Instruct Model:\n{instruct_model_text_outputs}')
print(dash_line)


---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
Oringinal Model:
What is your system's hardware requirements?

---------------------------------------------------------------------------------------------------
Instruct Model:
#Person1#: You need to upgrade your system and some hardware, and possibly some software that will allow you to run your own business. We've tried other parts but are not sure which one is best for you.
---------------------------------------------------------------------------------------------------


In [21]:
rouge = evaluate.load('rouge')

In [22]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
    
Summaise the following conversation

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
original_model_text_outputs = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
original_model_summaries.append(original_model_text_outputs)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
instruct_model_text_outputs = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
instruct_model_summaries.append(instruct_model_text_outputs)


zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns= ['human_baseline_summary', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summary,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,"#Person1#: Happy birthday, Brian. #Person2#: I...","#Person1#: Happy birthday, Brian. #Person2#: I..."


In [26]:
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

instruct_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

print('ORIGINAL MODEL: ')
print(original_model_results)
print('INSTRUCT MODEL: ')
print(instruct_model_results)

ORIGINAL MODEL: 
{'rouge1': 0.05940594059405941, 'rouge2': 0.0, 'rougeL': 0.039603960396039604, 'rougeLsum': 0.039603960396039604}
INSTRUCT MODEL: 
{'rouge1': 0.05940594059405941, 'rouge2': 0.0, 'rougeL': 0.039603960396039604, 'rougeLsum': 0.039603960396039604}
