In [None]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

[0m

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through 

In [None]:
# Hugging Face Hub identifier of the dialogue-summarisation corpus used below.
huggingface_dataset_name = 'knkarthick/dialogsum'

# Fetch the dataset; the result bundles every split the repository provides.
dataset = load_dataset(path=huggingface_dataset_name)

# Bare last expression so the notebook renders the split/feature overview.
dataset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [None]:
# Show which container class load_dataset returned for this corpus.
type(dataset)

datasets.dataset_dict.DatasetDict

In [None]:
# Per-split shape of the dataset, displayed as the cell's output.
dataset.shape

{'train': (12460, 4), 'test': (1500, 4), 'validation': (500, 4)}

In [None]:
# One checkpoint name shared by the tokenizer and the model so they stay in sync.
model_name = 'google/flan-t5-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Request bfloat16 weights at load time.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

In [None]:
# Spot-check the dtype the weights were loaded with by looking at the first
# parameter tensor the model yields.
first_param = next(iter(original_model.parameters()))
dtype = first_param.dtype

print(dtype)


torch.bfloat16


In [None]:
import torch

# Report whether PyTorch can see a CUDA device.
# NOTE(review): this only checks CUDA availability; nothing in this cell moves
# the model, so the message does not prove where the model actually lives.
if not torch.cuda.is_available():
    print('T5 model is running on CPU')
else:
    # `device` holds the name of CUDA device 0 as a string, not a torch.device.
    device = torch.cuda.get_device_name(0)
    print(f'T5 model is running on GPU: {device}')


T5 model is running on GPU: Tesla T4


In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of ``model``'s parameters are trainable.

    Despite the ``print_`` prefix, this function does not print anything; it
    returns the report so the caller decides how to display it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        str: Three lines giving the trainable parameter count, the total
        parameter count, and the trainable share as a percentage with two
        decimals. A model without any parameters reports ``0.00%``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # Guard the ratio: a parameter-free model would otherwise raise
    # ZeroDivisionError instead of producing a report.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {trainable_percentage:.2f}%"
    )

# The helper returns its report as a string, so print it explicitly to show
# the trainable/total parameter counts for original_model.
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [None]:
# Pick one held-out example to eyeball the model's zero-shot behaviour.
index = 200
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

# Instruction-style prompt wrapped around the raw dialogue.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Tokenize, generate up to 200 new tokens, and decode the first (only) sequence.
inputs = tokenizer(prompt, return_tensors='pt')
generated_ids = original_model.generate(inputs["input_ids"], max_new_tokens=200)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# 99-character horizontal rule used to separate the printed sections.
dash_line = '-' * 99
print(
    dash_line,
    f'INPUT PROMPT:\n{prompt}',
    dash_line,
    f'BASELINE HUMAN SUMMARY:\n{summary}\n',
    dash_line,
    f'MODEL GENERATION - ZERO SHOT:\n{output}',
    sep='\n',
)

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

In [None]:
def tokenize_function(example):
    """Tokenize a batch of dialogues and summaries for seq2seq fine-tuning.

    Written for ``Dataset.map(..., batched=True)``: ``example`` is a batch whose
    ``"dialogue"`` and ``"summary"`` entries are lists of strings. Uses the
    notebook-level ``tokenizer``.

    The batch is updated in place and returned with these added columns:
        input_ids: Token ids of the instruction prompt built around each
            dialogue, padded/truncated to the tokenizer's maximum length.
        attention_mask: Mask produced by the tokenizer for ``input_ids`` so the
            model can ignore the padding positions.
        labels: Token ids of each summary, padded/truncated the same way, with
            every padding position replaced by ``-100`` so that padding does
            not contribute to the training loss.

    Args:
        example: Batch mapping with ``"dialogue"`` and ``"summary"`` lists.

    Returns:
        The same mapping with the three columns above added.
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    # Plain Python lists are enough here; `map` stores the result column-wise,
    # so building torch tensors first is unnecessary.
    model_inputs = tokenizer(prompt, padding="max_length", truncation=True)
    example['input_ids'] = model_inputs["input_ids"]
    example['attention_mask'] = model_inputs["attention_mask"]

    label_ids = tokenizer(example["summary"], padding="max_length", truncation=True)["input_ids"]

    # -100 is the label value the Hugging Face seq2seq loss skips; without this
    # the model is also trained to predict the padding that fills each label row.
    pad_token_id = tokenizer.pad_token_id
    example['labels'] = [
        [token_id if token_id != pad_token_id else -100 for token_id in sequence]
        for sequence in label_ids
    ]

    return example

# `dataset` holds three splits (train, validation, test). Mapping in batched
# mode runs tokenize_function over every split; the raw text/metadata columns
# are then dropped so only the columns added by tokenize_function remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(
    ['id', 'topic', 'dialogue', 'summary']
)



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings used for the parameter-efficient fine-tune.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    # Module names that receive adapters -- presumably T5's query/value
    # attention projections; verify against the model's module names.
    target_modules=["q", "v"],
    r=16,  # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Wrap the base model with the adapters described by lora_config.
# NOTE(review): get_peft_model may modify `original_model` in place rather than
# copying it -- confirm before reusing `original_model` as an untouched baseline.
peft_model = get_peft_model(original_model, lora_config)

# Report how many parameters remain trainable after wrapping.
trainable_report = print_number_of_trainable_model_parameters(peft_model)
print(trainable_report)

trainable model parameters: 1769472
all model parameters: 249347328
percentage of trainable model parameters: 0.71%


In [None]:
import os
import time
from transformers import Trainer, TrainingArguments

# SECURITY: never embed a Hub token in the notebook. A literal write token was
# previously committed in this cell; treat it as compromised and revoke it at
# https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable instead. When the
# variable is unset this is None and the credentials cached by
# `huggingface-cli login` are used.
hub_token = os.environ.get("HF_TOKEN")

peft_training_args = TrainingArguments(
    output_dir="/content/Flan-T5_Sum_peft_loRA",
    push_to_hub=True,
    # `hub_model_id` / `hub_token` replace the deprecated
    # `push_to_hub_model_id` / `push_to_hub_token` arguments. The repository
    # name (including its spelling) is kept so the existing Hub repo is reused.
    hub_model_id='Text_Summariztion_Flan-5T_PEFT_loRA',
    hub_token=hub_token,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    # Ignored while `max_steps` is positive: `max_steps` takes precedence, so
    # this run stops after 100 optimizer steps. Remove `max_steps` to train for
    # the full number of epochs.
    num_train_epochs=10,
    max_steps=100,
    logging_steps=1,
    evaluation_strategy="steps",  # Perform evaluation at each `eval_steps`
    eval_steps=10,  # Adjust this based on your preference
    save_steps=10,  # Save a checkpoint every 10 steps
    save_total_limit=3,  # Limit the total number of checkpoints to 3
)

# Fine-tune on the train split and evaluate on the validation split.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Ahmedhany216/Text_Summariztion_Flan-5T_PEFT_loRA into local empty directory.


In [None]:
# Start training with the arguments and datasets configured on peft_trainer.
# The call is the cell's last expression, so its return value is displayed.
peft_trainer.train()



Step,Training Loss,Validation Loss
10,22.125,19.405001
20,4.5,4.46325
30,3.9531,3.650375
40,2.7188,1.571875
50,1.4375,0.825125
60,0.9492,0.461523
70,0.6406,0.349687
80,0.5547,0.31652
90,0.459,0.26818
100,0.3828,0.25891


TrainOutput(global_step=100, training_loss=5.830546875, metrics={'train_runtime': 1081.6363, 'train_samples_per_second': 0.185, 'train_steps_per_second': 0.092, 'total_flos': 138038634086400.0, 'train_loss': 5.830546875, 'epoch': 0.02})

In [None]:
# Push the model to the Hugging Face Model Hub, using the Hub settings given in
# the trainer's TrainingArguments. Requires valid Hub credentials.
peft_trainer.push_to_hub()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/479M [00:00<?, ?B/s]

Upload file runs/Dec20_20-12-37_15b6fbf28874/events.out.tfevents.1703103166.15b6fbf28874.25331.0:   0%|       …

To https://huggingface.co/Ahmedhany216/Text_Summariztion_Flan-5T_PEFT_loRA
   59963ef..2b6088e  main -> main

   59963ef..2b6088e  main -> main

To https://huggingface.co/Ahmedhany216/Text_Summariztion_Flan-5T_PEFT_loRA
   2b6088e..f06c0ba  main -> main

   2b6088e..f06c0ba  main -> main



'https://huggingface.co/Ahmedhany216/Text_Summariztion_Flan-5T_PEFT_loRA/commit/2b6088ebebb876348a0f33aa5b1513b09623eb09'