# Lightweight Fine-Tuning Project

* PEFT technique: LoRA (Low-Rank Adaptation), applied with the Hugging Face PEFT (Parameter-Efficient Fine-Tuning) library
* Model: GPT-2
* Evaluation approach: Train-Test split
* Fine-tuning dataset: Amazon Polarity

# Import Libraries

In [None]:
!pip install -q numpy pandas datasets transformers scikit-learn torch peft

In [None]:
import torch

import numpy as np
import pandas as pd

from datasets import load_dataset

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, Trainer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from peft import AutoPeftModelForSequenceClassification, LoraConfig, get_peft_model

In [None]:
!pip freeze > requirements.txt

# Auxiliary Functions

In [None]:
def dataset_info(dataset, subset, num_examples=1):
    """
    Print basic information about one split of a dataset.

    Reports the split size, its feature names, the smallest and largest
    ``len()`` of the "content" entries, the label distribution and the
    first ``num_examples`` records.

    Args:
        dataset: Mapping from split name to a split object that supports
            ``len()``, exposes ``features`` and ``num_rows``, and returns
            the "content", "label" and "title" columns by name.
        subset: Name of the split to describe (e.g. "train").
        num_examples: Number of leading records to print. Values larger
            than the split size are clamped to the split size.

    Returns:
        None
    """
    from collections import Counter

    split = dataset[subset]

    print(f"=== {subset.upper()} INFO ===")
    print("Size", len(split))
    print("Features:", list(split.features.keys()))

    # Look each column up once instead of once per use.
    contents = split["content"]
    label_column = list(split["label"])

    # default=0 keeps an empty split from raising in min()/max().
    lengths = [len(content) for content in contents]
    min_length = min(lengths, default=0)
    max_length = max(lengths, default=0)

    print(f"\nMin length of content: {min_length}")
    print(f"Max length of content: {max_length}\n")

    labels = set(label_column)

    print("Labels:", labels)

    # Single pass over the labels instead of one list.count() scan per label.
    frequencies = Counter(label_column)
    total = split.num_rows

    for key in labels:
        percentage = (frequencies[key] / total) * 100
        print(f"- Label {key}: {frequencies[key]} --- {percentage:.2f}%")

    print("\n=== EXAMPLE ===")

    # Never index past the end of the split.
    shown = min(num_examples, len(split))
    if shown > 0:
        titles = split["title"]
        for i in range(shown):
            print(f"Title: {titles[i]} --- Label: {label_column[i]}")
            print(f"{contents[i]}\n")

In [None]:
def tokenize(input):
    """
    Encode the "content" entry of ``input`` with the global ``tokenizer``.

    Args:
        input: Mapping whose "content" entry holds the text(s) to encode.

    Returns:
        The tokenizer output, produced with truncation and padding enabled.
    """
    texts = input["content"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [None]:
def compute_metrics(eval_pred):
    """
    Compute accuracy, precision, recall and F1 for an evaluation batch.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = eval_pred

    if isinstance(scores, np.ndarray):
        predicted = scores.argmax(axis=1)
    else:
        predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(labels, predicted) for name, scorer in scorers.items()}

# Load Dataset

In [None]:
# Load the Amazon Polarity dataset and keep a handle on each of its splits.
dataset = load_dataset("amazon_polarity")

train_ds, test_ds = dataset["train"], dataset["test"]

In [None]:
# Print size, features, content-length range, label balance and one example
# for the training split.
dataset_info(dataset, "train")

In [None]:
# Print size, features, content-length range, label balance and one example
# for the test split.
dataset_info(dataset, "test")

# Loading and Evaluating a Foundation Model

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load GPT-2 with a sequence-classification head and move it to the device.
model = AutoModelForSequenceClassification.from_pretrained("gpt2")
model = model.to(device)
model.config.pad_token_id = tokenizer.pad_token_id

# Freeze every parameter of the base model.
for param in model.base_model.parameters():
    param.requires_grad_(False)

In [None]:
# Tokenize both splits in batches with the same `tokenize` function.
preprocessed_train_ds, preprocessed_test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, test_ds)
)

In [None]:
# Expose the same five columns from both splits in "torch" format.
preprocessed_train_ds.set_format(
    columns=["label", "title", "content", "input_ids", "attention_mask"],
    type="torch",
)
preprocessed_test_ds.set_format(
    columns=["label", "title", "content", "input_ids", "attention_mask"],
    type="torch",
)

In [None]:
# Trainer around the foundation model; the next cell uses it for evaluation.
training_args = TrainingArguments(
    # Schedule
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluation and checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=preprocessed_train_ds,
    eval_dataset=preprocessed_test_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate the current trainer's model on the tokenized test split; the
# metrics come from `compute_metrics`.
evaluation_result = trainer.evaluate(preprocessed_test_ds)

# Bare expression so the notebook displays the result.
evaluation_result

# Performing Parameter-Efficient Fine-Tuning

In [None]:
# Declare the task as sequence classification. With no task type PEFT wraps
# the model generically: the randomly initialised classification head is then
# neither trained nor stored with the adapter, so a later reload through
# AutoPeftModelForSequenceClassification would evaluate with a fresh head.
config = LoraConfig(task_type="SEQ_CLS")

In [None]:
# Wrap the model with the LoRA configuration, then move it to the device.
lora_model = get_peft_model(model, config)
lora_model = lora_model.to(device)

# Report how many parameters remain trainable after wrapping.
lora_model.print_trainable_parameters()

In [None]:
# Training setup for the LoRA-wrapped model.
training_args = TrainingArguments(
    # Optimisation
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluation
    evaluation_strategy="epoch",
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
)

trainer = Trainer(
    args=training_args,
    model=lora_model,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=preprocessed_train_ds,
    eval_dataset=preprocessed_test_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the PEFT model under "gpt-lora" so it can be reloaded for inference.
lora_model.save_pretrained("gpt-lora")

# Performing Inference with a PEFT Model

In [None]:
# Reload the PEFT model from "gpt-lora" as a sequence-classification model.
lora_model = AutoPeftModelForSequenceClassification.from_pretrained("gpt-lora")
# Give the reloaded model's config the tokenizer's padding token id.
lora_model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Trainer around the reloaded PEFT model; the next cell uses it for evaluation.
training_args = TrainingArguments(
    # Schedule
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluation and checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
)

trainer = Trainer(
    args=training_args,
    model=lora_model,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=preprocessed_train_ds,
    eval_dataset=preprocessed_test_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate the current trainer's model on the tokenized test split; the
# metrics come from `compute_metrics`.
evaluation_result = trainer.evaluate(preprocessed_test_ds)

# Bare expression so the notebook displays the result.
evaluation_result