# Finetune with LoRA

In [1]:
# pip install transformers datasets lightning watermark
%pip install watermark
%pip install lightning
%pip install torch
%pip install -q transformers[torch]
%pip install -q evaluate
%pip install -q peft
%pip install -q accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
z

In [2]:
%load_ext watermark
%watermark --conda -p torch,transformers,datasets,lightning

  from .autonotebook import tqdm as notebook_tqdm


torch       : 2.6.0
transformers: 4.49.0
datasets    : 3.2.0
lightning   : 2.5.0.post0

conda environment: n/a



# 1 Loading the dataset into DataFrames

In [3]:
import os
from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import pandas as pd
import torch

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [4]:
if not torch.cuda.is_available():
    print("Please switch to a GPU machine before running this notebook.")


Please switch to a GPU machine before running this notebook.


In [5]:
files = ("test.csv", "train.csv", "val.csv")
download = True

for f in files:
    if not os.path.exists(os.path.join("data", f)):
        download = False

if download is False:
    download_dataset()
    df = load_dataset_into_to_dataframe()
    partition_dataset(df)

In [6]:
df_train = pd.read_csv(os.path.join("data", "train.csv"))
df_val = pd.read_csv(os.path.join("data", "val.csv"))
df_test = pd.read_csv(os.path.join("data", "test.csv"))

# 2 Tokenization and Numericalization

**Load the dataset via `load_dataset`**

In [7]:
imdb_dataset = load_dataset(
    "csv",
    data_files={
        "train": os.path.join("data", "train.csv"),
        "validation": os.path.join("data", "val.csv"),
        "test": os.path.join("data", "test.csv"),
    },
)

print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


**Tokenize the dataset**

In [8]:
from transformers import AutoTokenizer
# distilbert-base-uncased
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") # 512 30000
# tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-6B") # 4096 64000
# tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") # 2048 30000
# tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2") # 2048 50257
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [9]:
def tokenize_text(batch):
    return tokenizer(batch["text"], truncation=True, padding=True)

In [10]:
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

Map: 100%|██████████| 10000/10000 [00:02<00:00, 4300.49 examples/s]


In [None]:
del imdb_dataset

In [11]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://download.pytorch.org/whl/cpu

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [14]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 3 Set Up DataLoaders

In [15]:
from torch.utils.data import DataLoader, Dataset


class IMDBDataset(Dataset):
    def __init__(self, dataset_dict, partition_key="train"):
        self.partition = dataset_dict[partition_key]

    def __getitem__(self, index):
        return self.partition[index]

    def __len__(self):
        return self.partition.num_rows

In [16]:
train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=12,
    shuffle=True, 
    num_workers=0
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=12,
    num_workers=0
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=12,
    num_workers=0
)

# 4 Initializing DistilBERT and Comparison with Traditional Fine-tuning

We will train a small BERT model for text classification since classification accuracy is easier to evaluate compared to text generation.

In [17]:
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print("Total number of trainable parameters:", count_parameters(model))

Total number of trainable parameters: 66955010


## **Freeze all layers**  

Since we only want to train the new LoRA weights, we freeze all model parameters by setting requires_grad=False for all trainable parameters.

In [19]:
for param in model.parameters():
    param.requires_grad = False

## **Add LoRA layers**  
Let's briefly examine the model's structure using print(model).

In [20]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


We can see that the model consists of six transformer blocks, each containing linear layers:

- (0-5): 6 x TransformerBlock

Additionally, the model has two linear output layers:
- pre_classifier: Linear(in_features=768, out_features=768, bias=True)
- classifier: Linear(in_features=768, out_features=2, bias=True)

We can selectively enable LoRA for these linear layers by defining an assignment function and iterating through the model layers.

In [41]:
class LoRALayer(torch.nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.W_a = torch.nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        self.W_b = torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        x = self.alpha * (x @ self.W_a @ self.W_b)
        return x


class LinearWithLoRA(torch.nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        if not isinstance(linear, nn.Linear):
            raise TypeError(f"Expected nn.Linear, but got {type(linear)}")

        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    def forward(self, x):
        return self.linear(x) + self.lora(x)

In [None]:
from functools import partial
import torch.nn as nn

# LoRA 参数
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
lora_query = True
lora_key = False
lora_value = True
lora_projection = False
lora_mlp = False
lora_head = False

assign_lora = partial(LinearWithLoRA, rank=lora_r, alpha=lora_alpha)

for layer in model.distilbert.transformer.layer:
    print(f"Before LoRA - q_lin type: {type(layer.attention.q_lin)}")

    if lora_query and isinstance(layer.attention.q_lin, nn.Linear):  
        layer.attention.q_lin = assign_lora(layer.attention.q_lin) 

    if lora_key and isinstance(layer.attention.k_lin, nn.Linear):
        layer.attention.k_lin = assign_lora(layer.attention.k_lin)

    if lora_value and isinstance(layer.attention.v_lin, nn.Linear):
        layer.attention.v_lin = assign_lora(layer.attention.v_lin)

    if lora_projection and isinstance(layer.attention.out_lin, nn.Linear):
        layer.attention.out_lin = assign_lora(layer.attention.out_lin)

    if lora_mlp and isinstance(layer.ffn.lin1, nn.Linear):
        layer.ffn.lin1 = assign_lora(layer.ffn.lin1)

    if lora_mlp and isinstance(layer.ffn.lin2, nn.Linear):
        layer.ffn.lin2 = assign_lora(layer.ffn.lin2)

    print(f"After LoRA - q_lin type: {type(layer.attention.q_lin)}")

if lora_head:
    if isinstance(model.pre_classifier, nn.Linear):
        model.pre_classifier = assign_lora(model.pre_classifier)
    if isinstance(model.classifier, nn.Linear):
        model.classifier = assign_lora(model.classifier)


Before LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
After LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
Before LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
After LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
Before LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
After LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
Before LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
After LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
Before LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
After LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
Before LoRA - q_lin type: <class '__main__.LinearWithLoRA'>
After LoRA - q_lin type: <class '__main__.LinearWithLoRA'>


After applying LoRA, we can use print(model) again to check the updated structure.

In [49]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_lin): Linear(in_features=768, out_features=768, bia

As observed above, the linear layers have been successfully replaced with LinearWithLoRA layers.

In [50]:
# Check if linear layers are frozen
for name, param in model.named_parameters():
    print(f"{name}: {param.requires_grad}")

distilbert.embeddings.word_embeddings.weight: False
distilbert.embeddings.position_embeddings.weight: False
distilbert.embeddings.LayerNorm.weight: False
distilbert.embeddings.LayerNorm.bias: False
distilbert.transformer.layer.0.attention.q_lin.linear.weight: False
distilbert.transformer.layer.0.attention.q_lin.linear.bias: False
distilbert.transformer.layer.0.attention.q_lin.lora.W_a: True
distilbert.transformer.layer.0.attention.q_lin.lora.W_b: True
distilbert.transformer.layer.0.attention.k_lin.weight: False
distilbert.transformer.layer.0.attention.k_lin.bias: False
distilbert.transformer.layer.0.attention.v_lin.linear.weight: False
distilbert.transformer.layer.0.attention.v_lin.linear.bias: False
distilbert.transformer.layer.0.attention.v_lin.lora.W_a: True
distilbert.transformer.layer.0.attention.v_lin.lora.W_b: True
distilbert.transformer.layer.0.attention.out_lin.weight: False
distilbert.transformer.layer.0.attention.out_lin.bias: False
distilbert.transformer.layer.0.sa_layer_no

With LoRA, we only need to train 147,456 parameters, significantly reducing the number of trainable parameters compared to traditional fine-tuning.

In [51]:
print("Total number of trainable parameters:", count_parameters(model))

Total number of trainable parameters: 147456


# 5 Finetuning

**Wrap in LightningModule for Training**

In [52]:
from local_model_utilities import CustomLightningModule

lightning_model = CustomLightningModule(model)

In [53]:
callbacks = [
    ModelCheckpoint(
        save_top_k=1, mode="max", monitor="val_acc"
    )  # save top 1 model
]
logger = CSVLogger(save_dir="logs/", name="my-model")

In [54]:
trainer = L.Trainer(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="gpu",
    precision="16-mixed",
    devices=1,
    logger=logger,
    log_every_n_steps=10,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [55]:
import time
start = time.time()

trainer.fit(model=lightning_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

end = time.time()
elapsed = end - start
print(f"Time elapsed {elapsed/60:.2f} min")


  | Name     | Type                                | Params | Mode 
-------------------------------------------------------------------------
0 | model    | DistilBertForSequenceClassification | 67.1 M | eval 
1 | val_acc  | MulticlassAccuracy                  | 0      | train
2 | test_acc | MulticlassAccuracy                  | 0      | train
-------------------------------------------------------------------------
147 K     Trainable params
67.0 M    Non-trainable params
67.1 M    Total params
268.410   Total estimated model params size (MB)
26        Modules in train mode
96        Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/opt/homebrew/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


                                                                           

/opt/homebrew/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Epoch 2: 100%|██████████| 2917/2917 [26:50<00:00,  1.81it/s, v_num=1, val_loss=0.251, val_acc=0.900]

RuntimeError: MPS backend out of memory (MPS allocated: 4.02 GB, other allocations: 32.14 GB, max allowed: 36.27 GB). Tried to allocate 144.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
print(f"Train acc: {train_acc[0]['accuracy']*100:2.2f}%")
print(f"Val acc:   {val_acc[0]['accuracy']*100:2.2f}%")
print(f"Test acc:  {test_acc[0]['accuracy']*100:2.2f}%")

In [None]:
import shutil

# Cleanup checkpoint files as we don't need them later
log_dir = f"logs/my-model"
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)