<a href="https://colab.research.google.com/github/AliGreo/Text-Based-Projects/blob/main/dialogue_summarization_flant5_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
%%capture
!pip install peft evaluate datasets trl bitsandbytes

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, GenerationConfig
import torch
import evaluate
from peft import LoraConfig
import pandas as pd
from datasets import load_dataset

In [3]:
dataset = load_dataset("knkarthick/dialogsum")

README.md:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/442k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [4]:
pd.DataFrame(dataset["train"])

Unnamed: 0,id,dialogue,summary,topic
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance
...,...,...,...,...
12455,train_12455,#Person1#: Excuse me. You are Mr. Green from M...,Tan Ling picks Mr. Green up who is easily reco...,pick up someone
12456,train_12456,#Person1#: Mister Ewing said we should show up...,#Person1# and #Person2# plan to take the under...,conference center
12457,train_12457,#Person1#: How can I help you today?\n#Person2...,#Person2# rents a small car for 5 days with th...,rent a car
12458,train_12458,#Person1#: You look a bit unhappy today. What'...,#Person2#'s mom lost her job. #Person2# hopes ...,job losing


In [5]:
model_name= 'google/flan-t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name,
                                              torch_dtype=torch.bfloat16,
                                              device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

## using The base Model to generate summaries.

In [8]:
prompt = f"""

summarise the following dialogue:{dataset["test"][0]["dialogue"]}\n

summary: \n\n

"""

ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)


output = model.generate(ids, max_new_tokens=5000)
print(output, "\n\n")
generated_summary = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"The original summary:\n {dataset['test'][0]['summary']}", "\n\n")
print(f"The generated summary:\n {generated_summary}")

tensor([[    0,    37, 22986,    19,    12,    36,  8308,    12,    66,  1652,
            57,    48,  3742,     5,     1]], device='cuda:0') 


The original summary:
 Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore. 


The generated summary:
 The memo is to be distributed to all employees by this afternoon.


## Fine Tune The Model for more accurate results.

In [9]:
import numpy as np

def tokenize_dataset(example):
    start_prompt = "summarize the following conversation:"
    end_prompt = "summary:\n"
    full_prompt = [start_prompt + ex + end_prompt for ex in example["dialogue"]]

    example["input_ids"] = tokenizer(full_prompt, max_length=512,padding="max_length", truncation=True, return_tensors="pt").input_ids
    example["labels"] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    example['labels'] = np.where(example['labels'] != -100, example['labels'], tokenizer.pad_token_id)

    return example

tokenized_dataset = dataset.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [10]:
tokenized_dataset = tokenized_dataset.remove_columns(['id', 'topic', 'dialogue', 'summary',])

In [11]:
del dataset, ids, output, generated_summary

In [26]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8, # Matrix Rank
    lora_alpha=16,
    target_modules=["q", "v", "k","o"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

peft_model = get_peft_model(model, lora_config)

print(peft_model.print_trainable_parameters())

trainable params: 1,769,472 || all params: 249,347,328 || trainable%: 0.7096
None


In [16]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir="flan-t5-base-finetuned-dialouge-summarization",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    optim="paged_adamw_8bit",
    fp16=True,
    num_train_epochs=1,
    logging_steps=500,
    push_to_hub= True,
    report_to="none",
    run_name="dialogue_summarization"
)

trainer = Seq2SeqTrainer(
    model=peft_model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer
)

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [28]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.0
1000,0.0
1500,0.0
2000,0.0
2500,0.0
3000,0.0


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


TrainOutput(global_step=3115, training_loss=0.0, metrics={'train_runtime': 2000.0103, 'train_samples_per_second': 6.23, 'train_steps_per_second': 1.557, 'total_flos': 8599806903582720.0, 'train_loss': 0.0, 'epoch': 1.0})

In [None]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/AlyGreo/flan-t5-base-finetuned-dialouge-summarization/commit/5092bf3f077e6df87a3e844e5569664a1eb62639', commit_message='End of training', commit_description='', oid='5092bf3f077e6df87a3e844e5569664a1eb62639', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AlyGreo/flan-t5-base-finetuned-dialouge-summarization', endpoint='https://huggingface.co', repo_type='model', repo_id='AlyGreo/flan-t5-base-finetuned-dialouge-summarization'), pr_revision=None, pr_num=None)

In [35]:
model_id = "/content/flan-t5-base-finetuned-dialouge-summarization"
mytokenizer = AutoTokenizer.from_pretrained(model_id)

finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_id,
                                                        torch_dtype=torch.bfloat16,
                                                        device_map="auto")

In [40]:
output = finetuned_model.generate(input_ids=torch.tensor(tokenized_dataset['test'][1000]['input_ids']).view(1, -1).to(finetuned_model.device),
                                  max_new_tokens=50)

print(mytokenizer.decode(output[0], skip_special_tokens=True))

She has a hard time finding her way to the computer


In [39]:
output = finetuned_model.generate(input_ids=torch.tensor(tokenized_dataset['test'][500]['input_ids']).view(1, -1).to(finetuned_model.device),
                          max_new_tokens=50)

print(mytokenizer.decode(output[0], skip_special_tokens=True))

The teetering on the side of the road, David and his brother are going on a four day drive to Salt Lake City.
