In [2]:
# pip install -U pip setuptools wheel
# pip install -U torch transformers datasets bitsandbytes accelerate peft[torch] evaluate sentencepiece
# pip install huggingface_hub[hf_xet]
# pip install hf_transfer

# BERT QLoRA + LoRA fine-tuning on IMDB sentiment dataset 
(dipanjanS/imdb_sentiment_finetune_dataset20k)

In [2]:

import os
import math
import time
from pprint import pprint


import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, get_peft_model_state_dict
import evaluate

os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

This experiment was performed on the following Runpod instance:
<br>1x A40 (48 GB VRAM)
<br>50 GB RAM • 9 vCPU
<br>Total Disk: 80 GB

In [3]:
print('Torch version:', torch.__version__)
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('CUDA device count:', torch.cuda.device_count())
    print('Current device:', torch.cuda.current_device())
    print('Device name:', torch.cuda.get_device_name(torch.cuda.current_device()))

Torch version: 2.8.0+cu128
CUDA available: True
CUDA device count: 1
Current device: 0
Device name: NVIDIA A40


In [30]:
MODEL_NAME = "bert-base-uncased"
DATASET = "dipanjanS/imdb_sentiment_finetune_dataset20k"
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 2 # First iteration was run with 3 epocs and then repeated with 2 epocs for the current final outputs in this notebook
LEARNING_RATE = 2e-4
OUTPUT_DIR = "./bert_qlora_imdb_output"
SEED = 42

os.makedirs(OUTPUT_DIR, exist_ok=True)
torch.manual_seed(SEED) # Congiguring this for reproducibility
raw_ds = load_dataset(DATASET)
print(raw_ds)

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 10000
    })
})


In [5]:
if 'train' in raw_ds and len(raw_ds) == 1:
    ds = raw_ds['train']
else:
    from datasets import concatenate_datasets
    allsplits = [raw_ds[s] for s in raw_ds]
    ds = concatenate_datasets(allsplits)


print('Total examples:', len(ds))

Total examples: 20000


In [6]:
# Going for the 80/20 train/test split
split = ds.train_test_split(test_size=0.2, seed=SEED)
train_ds = split['train']
test_ds = split['test']
print('Train size:', len(train_ds), 'Test size:', len(test_ds))

Train size: 16000 Test size: 4000


In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def preprocess_function(examples):
    return tokenizer(
    examples['review'],
    truncation=True,
    padding='max_length',
    max_length=MAX_LENGTH,
    )

# Just incase pre-procressing
train_tok = train_ds.map(preprocess_function, batched=True, remove_columns=[c for c in train_ds.column_names if c != 'sentiment' and c != 'review'])
test_tok = test_ds.map(preprocess_function, batched=True, remove_columns=[c for c in test_ds.column_names if c != 'sentiment' and c != 'review'])
train_tok = train_tok.rename_column('sentiment', 'labels')
test_tok = test_tok.rename_column('sentiment', 'labels')

# Set format to PyTorch
train_tok.set_format(type='torch', columns=[c for c in train_tok.column_names if c != 'review'])
test_tok.set_format(type='torch', columns=[c for c in test_tok.column_names if c != 'review'])

print(train_tok[0])

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

{'labels': tensor(1), 'input_ids': tensor([  101,  2157,  1996,  2902,  3496,  2007,  9079,  1998, 10881,  2001,
         2589,  8235,  2135,  2009,  4627,  2125,  2007, 11939,  2006,  1037,
        25061,  2559,  2200,  6649,  5229,  1010,  1996,  9986,  2015,  2679,
         2014,  2046,  1037, 24501,  2271, 26243,  3370,  2282,  1004,  2027,
         2693,  2014,  2013,  1996, 25061,  3031,  1037,  2793,  1998, 10975,
         5657,  4324,  2014,  2192,  2013,  2008,  2391,  2006,  2009,  2003,
         5793,  2008, 11939,  2003,  2383,  1037,  2843,  1997,  4390,  5505,
         1998,  2014,  8948,  2024,  7989,  1010,  2004,  2016,  4332,  2000,
        11693,  1997, 10975,  5657,  2000,  2025,  2681,  2014,  2217,  2016,
        16680,  1000,  2123,  1005,  1056,  2175,  1045,  2293,  2017,  1998,
         2059,  2014,  8187,  9010,  1998,  2016,  3632,  2046, 15050,  6545,
         1004,  1996,  8080,  3065,  1037,  3154,  4257,  2240,  1004,  1996,
        11500,  2175,  2046, 

# Applying LORA and QLORA Configurations

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Since this is an experiment, performing 4 bit quantization for lowest cost
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    )

In [9]:
# Loadint the model in 4-bit mode
print('\nLoading model in 4-bit mode (this may take a while)...')
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    quantization_config=bnb_config,
    device_map="auto",
    )

# resize token embeddings if tokenizer changed
model.resize_token_embeddings(len(tokenizer))


Loading model in 4-bit mode (this may take a while)...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(30522, 768, padding_idx=0)

In [10]:
# Prepare model for k-bit training (patching some layers for stability)
model = prepare_model_for_kbit_training(model)

# LoRA config & applying PEFT
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "key", "value"], # BERT attention modules
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
    )

In [11]:
model = get_peft_model(model, lora_config)

## Comparing the parameters

In [12]:
def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable


total_params, trainable_params = count_parameters(model)
print('Total params:', total_params)
print('Trainable params (after LoRA):', trainable_params)
print('Trainable fraction: {:.6f}'.format(trainable_params/total_params))


# Saving incase instance shut down
with open(os.path.join(OUTPUT_DIR, 'param_counts.txt'), 'w') as f:
    f.write(f"total={total_params}\ntrainable={trainable_params}\n")

Total params: 67607812
Trainable params (after LoRA): 886274
Trainable fraction: 0.013109


## Setting up evaluation and compute metrics

In [14]:
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy.compute(predictions=preds, references=labels)
    p = precision.compute(predictions=preds, references=labels, average='binary')
    r = recall.compute(predictions=preds, references=labels, average='binary')
    f_1 = f1.compute(predictions=preds, references=labels, average='binary')
    return {'accuracy': acc['accuracy'], 'precision': p['precision'], 'recall': r['recall'], 'f1': f_1['f1']}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [31]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_strategy='steps',
    logging_steps=50,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    fp16=True,
    report_to=["none"],
    gradient_accumulation_steps=1,
    seed=SEED,
    metric_for_best_model='accuracy'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    )

  trainer = Trainer(


In [16]:
# This code execution will not involve weights and biases logging - however I understand how API KEY can be obtained and configurations and be added to log 
# import wandb
# wandb.init(project='bert-qlora-imdb', name='bert-qlora-run')

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    before_mem = torch.cuda.memory_allocated()
    print('GPU memory allocated before training (bytes):', before_mem)
else:
    print('No CUDA device; memory measurements will be skipped')

GPU memory allocated before training (bytes): 144471552


In [23]:
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline" 

In [32]:
start = time.time()
trainer.train()
end = time.time()
print('Training time (s):', end-start)

  return fn(*args, **kwargs)


Step,Training Loss
50,0.233
100,0.2155
150,0.2591
200,0.2093
250,0.2136
300,0.1724
350,0.1484
400,0.1727
450,0.1678
500,0.1814


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training time (s): 278.9815013408661


## Delta memory - memory allocated before training - memory allocated after training

In [33]:
if torch.cuda.is_available():
    after_mem = torch.cuda.memory_allocated()
    print('GPU memory allocated after training (bytes):', after_mem)
    print('Delta (bytes):', after_mem - before_mem)

GPU memory allocated after training (bytes): 175854080
Delta (bytes): 31382528


## Saving the model!

In [34]:
trainer.save_model(os.path.join(OUTPUT_DIR, 'qlora_lora_model'))
# Save the PEFT adapter separately
model.save_pretrained(os.path.join(OUTPUT_DIR, 'qlora_lora_adapter'))

## Executing Evaluation

In [35]:
metrics = trainer.evaluate(eval_dataset=test_tok)
print('Eval metrics:', metrics)

Eval metrics: {'eval_loss': 0.3117099702358246, 'eval_accuracy': 0.909, 'eval_precision': 0.8947368421052632, 'eval_recall': 0.9238191975622143, 'eval_f1': 0.9090454772613693, 'eval_runtime': 9.5187, 'eval_samples_per_second': 420.224, 'eval_steps_per_second': 26.264, 'epoch': 2.0}


In [36]:
print('Current total params:', total_params)
print('Current trainable params (LoRA adapters):', trainable_params)

Current total params: 67607812
Current trainable params (LoRA adapters): 886274


### Additonal comments
Logging in weights and biases requires additional API Keys and configuration.
To compare benefits of QLora to full finetuning - I would have to run full fine tuning in RunPod for same numeber of EPOCs and compare memory allocation, trainable parameters and performance evaluation on same test data. I understand the process, however, it is not executed in this notebook to converve costs.

## Overall summary

In [37]:
summary = {
'model_name': MODEL_NAME,
'dataset': DATASET,
'max_length': MAX_LENGTH,
'batch_size': BATCH_SIZE,
'epochs': EPOCHS,
'learning_rate': LEARNING_RATE,
'total_params': total_params,
'trainable_params': trainable_params,
'eval_metrics': metrics,
}


pprint(summary)

# Saving incase instance shut down
with open(os.path.join(OUTPUT_DIR, 'summary.txt'), 'w') as f:
    f.write(str(summary))

{'batch_size': 16,
 'dataset': 'dipanjanS/imdb_sentiment_finetune_dataset20k',
 'epochs': 2,
 'eval_metrics': {'epoch': 2.0,
                  'eval_accuracy': 0.909,
                  'eval_f1': 0.9090454772613693,
                  'eval_loss': 0.3117099702358246,
                  'eval_precision': 0.8947368421052632,
                  'eval_recall': 0.9238191975622143,
                  'eval_runtime': 9.5187,
                  'eval_samples_per_second': 420.224,
                  'eval_steps_per_second': 26.264},
 'learning_rate': 0.0002,
 'max_length': 256,
 'model_name': 'bert-base-uncased',
 'total_params': 67607812,
 'trainable_params': 886274}
