In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import torch
import pandas as pd
import os
import time

import warnings
warnings.filterwarnings('ignore')




In [2]:
ft_summary = []

def reset_vram_tracking():
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.empty_cache()

def get_vram_usage():
    if torch.cuda.is_available():
        return round(torch.cuda.max_memory_reserved() / (1024 ** 3), 2)  # More accurate across forward/backward
    return "N/A"

def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def get_model_size(path):
    total = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total += os.path.getsize(fp)
    return round(total / (1024**2), 2)

def log_ft_summary(method_name, model, precision, model_dir, train_time, train_loss):
    vram = get_vram_usage()
    params = count_trainable_parameters(model)
    model_size = get_model_size(model_dir)
    ft_summary.append({
        "Method": method_name,
        "Precision": precision,
        "Trainable Parameters": params,
        "VRAM Used (GB)": vram,
        "Train Time (s)": train_time,
        "Model Size (MB)": model_size
    })

In [3]:
model_name = "google/flan-t5-base"
task = "summarization"
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
def preprocess(examples):
    inputs = ["summarize: " + article for article in examples["article"]]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], truncation=True, padding="max_length", max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [5]:
dataset = dataset.map(preprocess, batched=True)

In [6]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Full Parameter Fine Tuning

In [7]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./direct_finetune",
    per_device_train_batch_size=1,
    learning_rate=5e-5,
    num_train_epochs=1,
    logging_steps=5,
    save_total_limit=1,
    fp16=True,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

reset_vram_tracking()
start_time = time.time()
output = trainer.train()
train_loss = output.training_loss

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
5,0.0
10,0.0
15,0.0
20,0.0
25,0.0
30,0.0
35,0.0
40,0.0
45,0.0
50,0.0


In [8]:
train_time = round(time.time() - start_time, 2)
model.save_pretrained("model")

log_ft_summary("Full Parameter", model, "fp16", "model", train_time, train_loss)

# Fine-Tuning using PEFT & LoRA

In [11]:
from peft import get_peft_model, TaskType, LoraConfig

In [12]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)

In [13]:
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


In [14]:
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=peft_model)
)

reset_vram_tracking()
start_time = time.time()
output = trainer.train()
train_loss = output.training_loss

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
5,0.0
10,0.0
15,0.0
20,0.0
25,0.0
30,0.0
35,0.0
40,0.0
45,0.0
50,0.0


In [15]:
train_time = round(time.time() - start_time, 2)
peft_model.save_pretrained("model_lora")

log_ft_summary("PEFT + LoRA", peft_model, "fp16", "model_lora", train_time, train_loss)

# Fine-Tuning using QLoRA

In [16]:
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

In [17]:
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

In [18]:
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

In [19]:
model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, peft_config)

In [20]:
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=peft_model)
)

reset_vram_tracking()
start_time = time.time()
output = trainer.train()
train_loss = output.training_loss

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
5,0.0
10,0.0
15,0.0
20,0.0
25,0.0
30,0.0
35,0.0
40,0.0
45,0.0
50,0.0


In [21]:
train_time = round(time.time() - start_time, 2)
peft_model.save_pretrained("model_Qlora")

log_ft_summary("QLoRA", peft_model, "4-bit + fp16", "model_Qlora", train_time, train_loss)

# Comparision of above used Methods

In [22]:
df = pd.DataFrame(ft_summary)
print(df)

           Method     Precision  Trainable Parameters  VRAM Used (GB)  \
0  Full Parameter          fp16             247577856            2.81   
1     PEFT + LoRA          fp16                884736            2.81   
2           QLoRA  4-bit + fp16                884736            2.54   

   Train Time (s)  Model Size (MB)  
0          369.18           944.47  
1          349.46             3.40  
2          716.08             3.40  
