In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from trl import SFTTrainer, SFTConfig
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType

In [2]:
# Hub identifier of the dialogue-summarization dataset used by this notebook.
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(path=huggingface_dataset_name)

In [3]:
# LoRA adapter settings for the sequence-to-sequence model.
# target_modules is deliberately not set, so PEFT decides which layers get
# adapters (NOTE(review): presumably the architecture defaults -- confirm).
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,               # rank of the low-rank update
    lora_alpha=8,      # scaling factor applied to the update
    lora_dropout=0.1,  # dropout on the adapter path, for regularization
    bias="none",
)

In [4]:
# Checkpoint to fine-tune; the tokenizer and the model come from the same name.
model_name = 'google/flan-t5-small'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # load the weights as bfloat16
)

In [5]:
# Config overrides applied to the loaded model before training.
# NOTE(review): pretraining_tp looks like a Llama-style option; presumably it
# has no effect on a T5 model -- confirm before relying on it.
model.config.pretraining_tp = 1
# Turn off the decoding cache while the model is being trained.
model.config.use_cache = False

In [6]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite the name, nothing is printed: the summary is returned as a
    three-line string and the caller decides how to display it.

    Args:
        model: Any object whose ``named_parameters()`` yields
            ``(name, parameter)`` pairs, where each parameter offers
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        str: The trainable count, the total count, and the trainable share as
        a percentage with two decimals, one item per line.

    Raises:
        ZeroDivisionError: If the model exposes no parameters at all.
    """
    # Walk the parameters once, remembering each size and its trainable flag.
    sizes_and_flags = [
        (weights.numel(), weights.requires_grad)
        for _, weights in model.named_parameters()
    ]
    total = sum(size for size, _ in sizes_and_flags)
    trainable = sum(size for size, is_trainable in sizes_and_flags if is_trainable)
    share = 100 * trainable / total
    return (
        f"trainable model parameters: {trainable}\n"
        f"all model parameters: {total}\n"
        f"percentage of trainable model parameters: {share:.2f}%"
    )

In [7]:
# Baseline: parameter counts of the plain model, before it is wrapped with LoRA.
print(print_number_of_trainable_model_parameters(model=model))

trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


In [8]:
# Wrap the loaded model with the LoRA configuration defined earlier.
peft_model = get_peft_model(model=model, peft_config=lora_config)

In [9]:
# Same summary for the LoRA-wrapped model, to compare against the baseline.
print(print_number_of_trainable_model_parameters(model=peft_model))

trainable model parameters: 344064
all model parameters: 77305216
percentage of trainable model parameters: 0.45%


In [10]:
def tokenize_function(example):
    """Tokenize a batch of dialogues and reference summaries for seq2seq training.

    Written for ``Dataset.map(..., batched=True)``: ``example`` is a batch whose
    ``"dialogue"`` and ``"summary"`` entries are equally long lists of strings.
    Uses the notebook-level ``tokenizer``.

    Every dialogue is wrapped in a summarization instruction prompt before being
    tokenized. Prompts and summaries are tokenized with ``padding="max_length"``
    and ``truncation=True`` and no explicit ``max_length``, so the tokenizer's
    own default limit applies.

    Padding positions in the labels are replaced with -100, the value the loss
    ignores, so that training is not scored on padding tokens.

    Args:
        example: Batch mapping with ``"dialogue"`` and ``"summary"`` lists.

    Returns:
        The same mapping with two added entries: ``"input_ids"`` (token ids of
        the prompts) and ``"labels"`` (token ids of the summaries, with padding
        masked as -100).
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompts = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    # Plain Python lists are enough here: datasets stores the ids as lists
    # anyway, so there is no need to materialise tensors per batch.
    example['input_ids'] = tokenizer(prompts, padding="max_length", truncation=True).input_ids

    label_ids = tokenizer(example["summary"], padding="max_length", truncation=True).input_ids
    pad_id = tokenizer.pad_token_id
    # Mask label padding so the loss skips those positions.
    example['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in label_ids
    ]

    return example

In [11]:
# The dataset holds three splits (train, validation, test); mapping over the
# whole DatasetDict tokenizes all of them, one batch at a time.
# Afterwards the raw columns are dropped so only the tokenized ones remain.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['id', 'topic', 'dialogue', 'summary'])
)

In [12]:
# Keep only every 500th example of each split so the demo runs quickly.
# NOTE: this rebinds `tokenized_datasets`, so running the cell a second time
# subsamples the already-reduced data again.
tokenized_datasets = tokenized_datasets.filter(
    function=lambda _row, position: position % 500 == 0,
    with_indices=True,
)

# One print call, one item per line: the per-split shapes, then the full summary.
print(
    "Shapes of the datasets:",
    f"Training: {tokenized_datasets['train'].shape}",
    f"Validation: {tokenized_datasets['validation'].shape}",
    f"Test: {tokenized_datasets['test'].shape}",
    tokenized_datasets,
    sep="\n",
)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Shapes of the datasets:
Training: (25, 2)
Validation: (1, 2)
Test: (3, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 25
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 3
    })
})


In [13]:
# Training setup for the LoRA-wrapped seq2seq model.
#
# The generic `Trainer` is used rather than `SFTTrainer`: the dataset is already
# tokenized into `input_ids` (prompt) and `labels` (summary), and `Trainer`
# passes those columns to the model as they are. `SFTTrainer` is aimed at
# causal-LM fine-tuning, and its default collator builds the labels from
# `input_ids`, which would throw away the summary labels.
# NOTE(review): confirm the collator behaviour against the installed TRL version.
#
# There is no `max_seq_length` here: it is an SFTConfig option, and sequence
# length is already fixed by the tokenizer's padding/truncation.
output_dir = './SumSmart-training'
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step per device
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1,  # log after every optimizer step
    optim="adamw_hf",
    bf16=True,
)
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

  return torch._C._cuda_getDeviceCount() > 0


In [None]:
# Run the fine-tuning loop. As the last expression of the cell, the value
# returned by train() is what the notebook displays as the cell output.
peft_trainer.train()

  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


In [None]:
# Write the trainer's model and the tokenizer into the same local directory so
# both can later be loaded from one path.
# NOTE(review): for a PEFT-wrapped model this presumably stores only the adapter
# weights rather than the full base model -- confirm before relying on it.
peft_model_path="./SumSmart-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)