# CMU notebook
Before you turn this assignment in, make sure everything runs as expected. Be sure to click **run all** (in the upper right).

Follow the guide and fill in any place that says `YOUR CODE/ANSWER HERE` or `TODO`.

# CMU Machine Learning with Large Datasets

## Homework 4 - Programming 2: Fine-tuning BERT

In [None]:
# Who did you collaborate with on this assignment?
# if no one, collaborators should contain an empty string,
# else list your collaborators below
# collaborators = [""]

In [None]:
# Fail fast with a clear message when the `collaborators` list was never defined.
try:
    collaborators
except NameError:
    # Only an undefined name means "not listed"; anything else (e.g. KeyboardInterrupt)
    # must propagate instead of being reported as a missing collaborators list.
    raise AssertionError("you did not list your collaborators, if any") from None

## **0. Preliminaries**

You do not need to modify this part.

In [None]:
# Run this cell on Colab to install the required packages
!pip install --quiet transformers datasets peft accelerate scikit-learn

In [None]:
# Import necessary libraries

import numpy as np
import math
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam

from transformers import BertTokenizerFast, BertForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

import unittest

In [None]:
# Reproducibility setup: make cuDNN deterministic, seed NumPy and PyTorch, pick the device.

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

seed = 1337
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Preference order: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [None]:
# Load SST-5, tokenize it for BERT, and wrap the train/validation splits in DataLoaders.

dataset = load_dataset('SetFit/sst5')

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def _tokenize_batch(examples):
    """Tokenize the 'text' field of a batch, truncated/padded to 128 tokens."""
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=128,
    )


encoded_dataset = dataset.map(_tokenize_batch, batched=True)
# The training/evaluation helpers read the targets from a 'labels' key.
encoded_dataset = encoded_dataset.rename_column('label', 'labels')
encoded_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

batch_size = 64

train_dataloader = DataLoader(
    encoded_dataset['train'],
    batch_size=batch_size,
    shuffle=True,
)
eval_dataloader = DataLoader(
    encoded_dataset['validation'],
    batch_size=batch_size,
)

In [None]:
# Useful helper functions for you to use

def get_model_params_info(model):
    """Print and return parameter-count statistics for ``model``.

    Args:
        model: object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        Tuple ``(total_params, trainable_params, portion_trainable)`` where
        ``trainable_params`` counts the elements of parameters with
        ``requires_grad`` set and ``portion_trainable`` is their share of the
        total (``0`` when the model has no parameters).
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard against division by zero for parameter-free models.
    if total_params > 0:
        portion_trainable = trainable_params / total_params
    else:
        portion_trainable = 0

    print(f"Total parameters: {total_params}")
    print(f"Trainable parameters: {trainable_params}")
    print(f"Portion of trainable to all parameters: {portion_trainable:.4f}")

    return total_params, trainable_params, portion_trainable

def train_one_step(model, loss_fn, optimizer, dataloader, t="hf"):
    """Run one full training pass over ``dataloader`` and return the mean batch loss.

    Despite the name, this iterates over every batch of ``dataloader`` (one epoch),
    performing a forward pass, backward pass and optimizer step per batch.
    Tensors are moved to the module-level ``device``.

    Args:
        model: model to train; the caller is responsible for ``model.train()``.
        loss_fn: loss callable ``loss_fn(outputs, labels)``; only used when
            ``t != "hf"``.
        optimizer: optimizer whose ``zero_grad()``/``step()`` are called per batch.
        dataloader: iterable of dict batches with keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``. If it exposes ``set_postfix``
            (e.g. a ``tqdm`` wrapper) the current loss is shown on it.
        t: ``"hf"`` calls ``model(input_ids=..., attention_mask=..., labels=...)``
            and reads ``outputs.loss``; any other value calls ``model(input_ids)``
            and applies ``loss_fn``.

    Returns:
        Mean of the per-batch losses over the batches actually iterated
        (``0.0`` when ``dataloader`` yields no batches).
    """
    total_loss = 0.0
    num_batches = 0
    # Plain DataLoaders/iterables have no progress bar to update.
    has_progress_bar = hasattr(dataloader, 'set_postfix')

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        if t == "hf":
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        else:
            outputs = model(input_ids)
            loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        if has_progress_bar:
            dataloader.set_postfix({'loss': batch_loss})

    # Average over the batches of *this* dataloader, not a global one.
    return total_loss / num_batches if num_batches else 0.0


def eval_model(model, dataloader, t='hf'):
    """Evaluate ``model`` on ``dataloader`` and return ``(accuracy, weighted_f1)``.

    Runs under ``torch.no_grad()``; the caller is responsible for ``model.eval()``.
    Inputs are moved to the module-level ``device``.

    Args:
        model: model to evaluate.
        dataloader: iterable of dict batches with keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        t: ``'hf'`` calls ``model(input_ids=..., attention_mask=...)`` and reads
            ``outputs.logits``; any other value calls ``model(input_ids)`` and
            treats the return value as the logits.

    Returns:
        Tuple ``(accuracy, f1)`` computed with ``accuracy_score`` and
        ``f1_score(..., average='weighted')`` over all batches.
    """
    preds = []
    true_labels = []
    with torch.no_grad():
        # Iterate the dataloader that was passed in, not the global eval_dataloader.
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only compared on the CPU, so they never need to visit the device.
            labels = batch['labels']

            if t == 'hf':
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                predictions = predictions.cpu().numpy()
            else:
                logits = model(input_ids)
                logits = logits.detach().cpu().numpy()
                predictions = np.argmax(logits, axis=1).flatten()

            preds.extend(predictions)
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average='weighted')

    return accuracy, f1

## 1. Implementing LoRA

<!-- Your task is to fine-tune the **Llama2** model for sentiment analysis using the **SST-5** dataset. This involves implementing and experimenting with various fine-tuning methods. You will need to track and compare the model's performance and memory usage across different methods.

You only need to modify this notebook. -->

Before proceeding with the implementation, please refer to the following resources for a deeper understanding of LoRA:

- **LoRA Paper**: [https://arxiv.org/abs/2106.09685](https://arxiv.org/abs/2106.09685)
- **Video Explainer**: [https://www.youtube.com/watch?v=DhRoTONcyZE](https://www.youtube.com/watch?v=DhRoTONcyZE)

It is recommended that you review these materials before beginning the implementation.

In this assignment, you should initialize A by sampling
$A \sim \frac{1}{\sqrt{r}} \cdot \mathcal{N}(0, 1)$ (you can use `torch.randn`), and B as a zero matrix.



In [None]:
class MyLoraConfig:
    """Plain container for the LoRA hyperparameters used in this notebook.

    Attributes:
        r: value forwarded as ``r`` to ``MyLoraLayer``.
        target_modules: module names to wrap; ``get_lora_model`` compares them
            with the last dotted component of each module name.
        lora_alpha: value forwarded as ``lora_alpha`` to ``MyLoraLayer``.
        lora_dropout: value forwarded as ``lora_dropout`` to ``MyLoraLayer``.
    """

    def __init__(self, r, target_modules, lora_alpha, lora_dropout):
        # Store the arguments unchanged, in declaration order.
        self.r, self.target_modules = r, target_modules
        self.lora_alpha, self.lora_dropout = lora_alpha, lora_dropout


class MyLoraLayer(nn.Module):
    """LoRA wrapper around an existing layer (skeleton: the ``...`` placeholders are to be filled in).

    The notebook's unit tests construct it as
    ``MyLoraLayer(nn.Linear(4, 5), r=2, lora_alpha=4, lora_dropout=0.1)`` and expect
    ``lora_A`` to have ``in_features=4``/``out_features=2`` and ``lora_B`` to have
    ``in_features=2``/``out_features=5``, with ``lora_B`` initialised to zeros.
    """
    def __init__(self, base_layer, r, lora_alpha, lora_dropout):
        """Store the wrapped layer and create the LoRA sub-modules.

        Args:
            base_layer: the layer being wrapped (the tests pass an ``nn.Linear``).
            r: LoRA rank; the tests read it back as ``self.r``.
            lora_alpha: LoRA scaling hyperparameter; the tests read it back as ``self.lora_alpha``.
            lora_dropout: dropout probability used for ``self.lora_dropout``.
        """
        super(MyLoraLayer, self).__init__()
        # TODO

        self.base_layer = ...
        self.r = ...
        self.lora_alpha = ...

        # Placeholder dimensions: nn.Linear cannot be built until the `...` are replaced.
        self.lora_A = nn.Linear(..., ..., bias=False)
        self.lora_B = nn.Linear(..., ..., bias=False)
        
        self.lora_dropout = nn.Dropout(p=lora_dropout, inplace=False)


    def forward(self, x):
        """Forward pass (not implemented yet: currently returns ``None``)."""
        # TODO: Implement the forward pass logic.
        # Only apply dropout to the input x to LoRA.

        pass


In [None]:
# Test LoRA implementation.

class TestMyLoraLayer(unittest.TestCase):
    """Unit tests for ``MyLoraLayer``: attributes, dimensions, initialisation and forward output.

    The reference tensors are tied to seed 1337 and to the order in which random
    numbers are drawn (``nn.Linear(4, 5)`` is built before ``MyLoraLayer``).
    """

    def test_init(self):
        """Check stored hyperparameters, LoRA matrix shapes and initial weights."""
        # Re-seed so the randomly initialised weights match the reference values below.
        seed = 1337
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        base_layer = nn.Linear(4, 5)
        lora_layer = MyLoraLayer(base_layer, r=2, lora_alpha=4, lora_dropout=0.1)
        
        # base check
        self.assertEqual(lora_layer.r, 2)
        self.assertEqual(lora_layer.lora_alpha, 4.0)
        self.assertEqual(lora_layer.lora_dropout.p, 0.1)
        
        # test dimension
        self.assertEqual(lora_layer.lora_A.in_features, 4)
        self.assertEqual(lora_layer.lora_A.out_features, 2)
        self.assertEqual(lora_layer.lora_B.in_features, 2)
        self.assertEqual(lora_layer.lora_B.out_features, 5)

        # test initialization
        # Expected lora_A weights for seed 1337; lora_B must start as all zeros.
        ref_a = torch.tensor([[ 0.6689, -0.2572, -1.8404,  0.8856], 
                              [-0.1736, -0.2809, -0.6725, -0.4549]])
        self.assertTrue(torch.allclose(lora_layer.lora_A.weight.data, ref_a, atol=1e-4, rtol=1e-3))
        ref_b = torch.tensor([[0., 0.],
                              [0., 0.],
                              [0., 0.],
                              [0., 0.],
                              [0., 0.]])
        self.assertTrue(torch.allclose(lora_layer.lora_B.weight.data, ref_b, atol=1e-4, rtol=1e-3))

    def test_forward(self):
        """Check the forward output against a reference after overriding ``lora_B``."""
        # Same seed as test_init so base_layer and lora_A get the same initial weights.
        seed = 1337
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        
        base_layer = nn.Linear(4, 5)
        lora_layer = MyLoraLayer(base_layer, r=2, lora_alpha=4, lora_dropout=0.1)
        # Replace the zero-initialised lora_B with fixed non-zero weights.
        lora_layer.lora_B.weight.data = torch.tensor([[0.142, 0.353], 
                                                      [0.471, 0.946],
                                                      [0.064, 0.745], 
                                                      [0.534, 0.271],
                                                      [0.912, 0.106]])
        # Switch to eval mode before comparing against the fixed reference output.
        lora_layer.eval()
        x = torch.tensor([[0.8938, -1.5608, -0.1650,  0.6615]])
        output = lora_layer(x)
        ref_o = torch.tensor([[0.5471, 2.3977, 0.8821, 1.4875, 2.6817]])
        self.assertTrue(torch.allclose(output, ref_o, atol=1e-4, rtol=1e-3))

# Explicit argv and exit=False: run the tests in-process without calling sys.exit().
unittest.main(argv=[''], exit=False)

## **2. Running Fine-tuning**

### 2.0 Preliminaries

In [None]:
# Re-seed NumPy and PyTorch and re-select the device before the fine-tuning runs.

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

seed = 1337
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Preference order: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

### 2.1 LoRA Fine Tuning

#### 2.1.1 Load pretrained model and weights

In [None]:
# Load pretrained BERT with a 5-label classification head, then move it to the selected device.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=5,
)
model = model.to(device)

#### 2.1.2 Load LoRA adapter into model

Now that your LoRA implementation is complete, you will need to choose where to insert LoRA into your model. Any linear layer can in theory be augmented with a LoRA layer, but there is an intelligent choice you can make to maximize performance and efficiency.

Hint: check sections 4.2 and 5.1 of the [LoRA paper](https://arxiv.org/pdf/2106.09685).

In [None]:
# Show the model overview. You may find it useful to run this cell again after inserting LoRA
model

In [None]:
def get_lora_model(model, lora_config):
    """Wrap every targeted sub-module of ``model`` in a ``MyLoraLayer`` (in place).

    A sub-module is targeted when the last dotted component of its name (as
    reported by ``model.named_modules()``) is listed in
    ``lora_config.target_modules``.

    Args:
        model: ``nn.Module`` whose sub-modules are replaced in place.
        lora_config: object with ``target_modules``, ``r``, ``lora_alpha`` and
            ``lora_dropout`` attributes (e.g. ``MyLoraConfig``).

    Returns:
        The same ``model`` object, with the targeted sub-modules replaced.
    """
    # Collect first, replace afterwards, so the module tree is not mutated
    # while named_modules() is still iterating over it.
    targets = [
        (name, module)
        for name, module in model.named_modules()
        if name.split('.')[-1] in lora_config.target_modules
    ]

    for name, module in targets:
        parent_name, _, child_name = name.rpartition('.')
        parent_module = model.get_submodule(parent_name) if parent_name else model

        lora_layer = MyLoraLayer(module, lora_config.r, lora_config.lora_alpha, lora_config.lora_dropout)
        # nn.Module.__setattr__ re-registers the child under its existing name, which
        # also works for containers such as nn.Sequential (children are keyed '0', '1', ...).
        # Looking the full dotted name up in the parent's own named_modules() cannot work:
        # those names are relative to the parent and include the parent itself.
        setattr(parent_module, child_name, lora_layer)

    # TODO: set only LoRA parameters and the last classifier layer to be trainable
    ...
    ...

    return model

In [None]:
# LoRA hyperparameters for this fine-tuning run.
lora_config = MyLoraConfig(
    r=16,
    # get_lora_model compares these names with the last dotted component of each module name.
    target_modules=[...], # TODO
    lora_alpha=32,
    lora_dropout=0.4
)

# Replace the targeted modules of the pretrained model with MyLoraLayer wrappers.
model = get_lora_model(model, lora_config)

In [None]:
# Display the model again to inspect it after get_lora_model has run.
model

In [None]:
# Record the LoRA model's parameter counts; the pie chart in section 3 reads them.
total_parameters_lora, trainable_parameters_lora, _ = get_model_params_info(model)

#### 2.1.3 Run LoRA Fine-tuning

In [None]:
# LoRA fine-tuning: train for `num_epochs` epochs, evaluating on the validation split after each.
learning_rate = 1e-4
num_epochs = 20
total_steps = len(train_dataloader) * num_epochs
# Only used by train_one_step when t != 'hf'; with t='hf' the model computes its own loss.
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Reuse the `device` selected in the preliminaries. Re-deriving it here as cuda-or-cpu
# would silently override an MPS selection and move the model to the CPU.
model.to(device)

# Per-epoch history, plotted in section 3.
train_losses_lora = []
dev_accuracies_lora = []
f1_scores_lora = []

for epoch in range(num_epochs):
    # Wrap the loader in tqdm so train_one_step can show the running loss.
    dataloader = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    model.train()
    loss = train_one_step(model, loss_fn, optimizer, dataloader, t='hf')
    print(f"Average training loss: {loss:.4f}")

    train_losses_lora.append(loss)

    model.eval()
    accuracy, f1 = eval_model(model, eval_dataloader, t='hf')
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")

    dev_accuracies_lora.append(accuracy)
    f1_scores_lora.append(f1)

### 2.2 Last Layer Fine Tuning

#### 2.2.1 Load pretrained model and weights

In [None]:
# Reload the pretrained weights for this run; the model is moved to the device in the next code cell.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

#### 2.2.2 Set trainable parameters

In [None]:
# TODO: set only last classifier layer to be trainable
...

# Move the model to the selected device before training.
model = model.to(device)

In [None]:
# Record the last-layer fine-tuning parameter counts; the pie chart in section 3 reads them.
total_parameters_llft, trainable_parameters_llft, _ = get_model_params_info(model)

#### 2.2.3 Run Last Layer Fine Tuning

In [None]:
# Last-layer fine-tuning: train for `num_epochs` epochs, evaluating on the validation split after each.
learning_rate = 1e-4
num_epochs = 20
total_steps = len(train_dataloader) * num_epochs
# Defined here so this cell does not depend on the LoRA training cell having been run.
# Only used by train_one_step when t != 'hf'.
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Per-epoch history, plotted in section 3.
train_losses_llft = []
dev_accuracies_llft = []
f1_scores_llft = []

for epoch in range(num_epochs):
    # Wrap the loader in tqdm so train_one_step can show the running loss.
    dataloader = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    model.train()
    loss = train_one_step(model, loss_fn, optimizer, dataloader, t='hf')
    print(f"Average training loss: {loss:.4f}")

    train_losses_llft.append(loss)

    model.eval()
    accuracy, f1 = eval_model(model, eval_dataloader, t='hf')
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")

    dev_accuracies_llft.append(accuracy)
    f1_scores_llft.append(f1)

### 2.3 Full Fine Tuning

#### 2.3.1 Load pretrained model and weights

In [None]:
# Reload pretrained BERT with a 5-label classification head, then move it to the selected device.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=5,
)
model = model.to(device)

#### 2.3.2 Run Full Fine-tuning

In [None]:
# Reuse the `device` selected in the preliminaries. Re-deriving it here as cuda-or-cpu
# would silently override an MPS selection and move the model to the CPU.
model.to(device)

# Record the full fine-tuning parameter counts; the pie chart in section 3 reads them.
total_parameters_fft, trainable_parameters_fft, _ = get_model_params_info(model)

In [None]:
# Adjust the batch size (the dataloaders are rebuilt here so a changed `batch_size` takes effect)
train_dataloader = DataLoader(encoded_dataset['train'], batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(encoded_dataset['validation'], batch_size=batch_size)

learning_rate = 1e-4
num_epochs = 20
total_steps = len(train_dataloader) * num_epochs
# Defined here so this cell does not depend on the LoRA training cell having been run.
# Only used by train_one_step when t != 'hf'.
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Per-epoch history, plotted in section 3.
train_losses_fft = []
dev_accuracies_fft = []
f1_scores_fft = []

for epoch in range(num_epochs):
    # Wrap the loader in tqdm so train_one_step can show the running loss.
    dataloader = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    model.train()
    loss = train_one_step(model, loss_fn, optimizer, dataloader, t='hf')
    print(f"Average training loss: {loss:.4f}")

    train_losses_fft.append(loss)

    model.eval()
    accuracy, f1 = eval_model(model, eval_dataloader, t='hf')
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")

    dev_accuracies_fft.append(accuracy)
    f1_scores_fft.append(f1)

## **3. Visualize results**

You do not need to modify this part.

In [None]:
# One donut chart per fine-tuning method: trainable vs. non-trainable parameter counts.
fig, axs = plt.subplots(1, 3, figsize=(12, 4))

def create_pie_chart(ax, method, total, trainable):
    """Draw a donut chart of trainable vs. non-trainable parameters on ``ax``.

    Args:
        ax: matplotlib axes to draw on.
        method: title shown above the chart.
        total: total number of parameters.
        trainable: number of trainable parameters.
    """
    frozen = total - trainable

    _, _, pct_texts = ax.pie(
        [trainable, frozen],
        labels=[f'Trainable ({trainable})', f'Non-trainable ({frozen})'],
        colors=['skyblue', 'lightgray'],
        autopct='%1.4f%%',
        startangle=90,
        shadow=True,
    )

    for pct_text in pct_texts:
        pct_text.set_fontsize(12)

    # A white disc over the centre turns the pie into a donut.
    ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))

    # Equal aspect ratio ensures that pie is drawn as a circle
    ax.axis('equal')
    ax.set_title(method, fontsize=14)

create_pie_chart(axs[0], 'Full Fine-tuning', total_parameters_fft, trainable_parameters_fft)
create_pie_chart(axs[1], 'Last Layer Fine-tuning', total_parameters_llft, trainable_parameters_llft)
create_pie_chart(axs[2], 'LoRA Fine-tuning', total_parameters_lora, trainable_parameters_lora)

plt.tight_layout()
plt.show()

In [None]:
# Training-loss curves: one line per fine-tuning method, one point per epoch.
epochs = range(1, num_epochs + 1)

plt.style.use('seaborn-v0_8')
plt.figure(figsize=(10, 4))

plt.plot(
    epochs, train_losses_fft, 'b-',
    label='Full Fine-tuning', linewidth=2, marker='o', markersize=5,
)
plt.plot(
    epochs, train_losses_llft, 'r-',
    label='Last Layer Fine-tuning', linewidth=2, marker='s', markersize=5,
)
plt.plot(
    epochs, train_losses_lora, 'g-',
    label='LoRA Fine-tuning', linewidth=2, marker='D', markersize=5,
)

plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Training Loss over Epochs', fontsize=16)
plt.grid(True)
plt.legend(fontsize=10)
plt.show()

In [None]:
# Validation-accuracy curves: one line per fine-tuning method, one point per epoch.
epochs = range(1, num_epochs + 1)

plt.style.use('seaborn-v0_8')
plt.figure(figsize=(10, 4))

plt.plot(
    epochs, dev_accuracies_fft, 'b-',
    label='Full Fine-tuning', linewidth=2, marker='o', markersize=5,
)
plt.plot(
    epochs, dev_accuracies_llft, 'r-',
    label='Last Layer Fine-tuning', linewidth=2, marker='s', markersize=5,
)
plt.plot(
    epochs, dev_accuracies_lora, 'g-',
    label='LoRA Fine-tuning', linewidth=2, marker='D', markersize=5,
)

plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Validation/Development Accuracy over Epochs', fontsize=16)
plt.grid(True)
plt.legend(fontsize=10)
plt.show()

In [None]:
# Grouped bar chart: peak validation accuracy and peak validation F1 score per method.
fine_tuning_methods = ['Full', 'Last Layer', 'LoRA']
dev_accuracy = [max(dev_accuracies_fft), max(dev_accuracies_llft), max(dev_accuracies_lora)]  # Peak validation accuracy
# NOTE: despite its name (kept so existing references keep working), this holds the
# peak validation F1 score of each method; no test-set metric is computed in this notebook.
test_accuracy = [max(f1_scores_fft), max(f1_scores_llft), max(f1_scores_lora)]  # Peak validation F1 score

bar_width = 0.35
x = np.arange(len(fine_tuning_methods))

plt.figure(figsize=(8, 6))
bars1 = plt.bar(x - bar_width/2, dev_accuracy, bar_width, label='Peak Val Accuracy', color='skyblue')
bars2 = plt.bar(x + bar_width/2, test_accuracy, bar_width, label='Peak Val F1 Score', color='salmon')

# The title describes what is plotted: both series are validation metrics.
plt.title('Peak Validation Accuracy and F1 Score by Fine-tuning Method', fontsize=16)
plt.xlabel('Fine-tuning Methods', fontsize=12)
plt.ylabel('Performance', fontsize=12)
plt.xticks(x, fine_tuning_methods)
plt.ylim(0, 1)
plt.legend()

# Write each bar's value just above it (accuracy bars first, then F1 bars).
for bar in [*bars1, *bars2]:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02, f'{yval:.2f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()