In [None]:
# Environment setup commands, kept inside a bare string literal so this cell does NOT run them.
# Execute them in a terminal (or copy them into a `%pip install ...` cell) before running the notebook.
'''
pip install -U pip setuptools wheel
pip install -U torch transformers datasets bitsandbytes accelerate peft[torch] evaluate sentencepiece wandb
pip install huggingface_hub[hf_xet]
'''

In [None]:
# %%
# BERT QLoRA + LoRA fine-tuning on IMDB sentiment dataset (dipanjanS/imdb_sentiment_finetune_dataset20k)
# Ready-to-run notebook-style script. Run cells sequentially in Jupyter / VSCode interactive.


# %%
# 1) Install required packages (uncomment if running in a fresh environment)
# Note: If you already have these installed, you can skip this cell.


# !pip install -q transformers==4.35.0 datasets bitsandbytes accelerate peft[torch] wandb evaluate sentencepiece


# %%
# 2) Imports and basic checks
import inspect
import math
import os
import time
from pprint import pprint


import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, get_peft_model_state_dict
import evaluate

# Ask huggingface_hub not to emit its symlink warning (flag name taken at face value — confirm
# against the huggingface_hub environment-variable documentation if the warning still appears).
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [None]:
# Report the PyTorch build and, when a GPU is visible, which CUDA device is active
print(f'Torch version: {torch.__version__}')
has_cuda = torch.cuda.is_available()
print(f'CUDA available: {has_cuda}')
if has_cuda:
    device_idx = torch.cuda.current_device()
    print(f'CUDA device count: {torch.cuda.device_count()}')
    print(f'Current device: {device_idx}')
    print(f'Device name: {torch.cuda.get_device_name(device_idx)}')

In [None]:
# %%
# 3) Configs (edit these if you want)
# --- Model and data ---
MODEL_NAME = "bert-base-uncased"
DATASET = "dipanjanS/imdb_sentiment_finetune_dataset20k"
MAX_LENGTH = 256  # token budget per review, used by the tokenizer call

# --- Optimisation hyper-parameters ---
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-4

# --- Run bookkeeping ---
SEED = 42
OUTPUT_DIR = "./bert_qlora_imdb_output"

# Create the output folder up front so later cells can write reports into it
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Reproducibility: seed PyTorch's random number generator
torch.manual_seed(SEED)


# %%
# 4) Load dataset
# Load every available split of DATASET; the printed repr lists the splits and their columns,
# which is what the split-merging cell below branches on.
raw_ds = load_dataset(DATASET)
print(raw_ds)

In [None]:
# Collapse whatever splits the dataset ships with into one pool (`ds`);
# a fresh 80/20 train/test split is cut from that pool in the next cell.
split_names = list(raw_ds)
if split_names == ['train']:
    # Only a train split exists: use it as-is
    ds = raw_ds['train']
else:
    # Several (or differently named) splits: concatenate all of them
    from datasets import concatenate_datasets
    allsplits = [raw_ds[name] for name in split_names]
    ds = concatenate_datasets(allsplits)


print(f'Total examples: {len(ds)}')

In [None]:
# %%
# 5) Train/test split
# Hold out 20% of the pooled examples for evaluation; seeded so the split is repeatable
split = ds.train_test_split(test_size=0.2, seed=SEED)
train_ds, test_ds = split['train'], split['test']
print(f'Train size: {len(train_ds)} Test size: {len(test_ds)}')

In [None]:
# %%
# 6) Tokenizer and preprocessing
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)


# Guarantee a padding token exists; if one has to be added, the vocabulary grows and the
# model's embedding matrix is resized to len(tokenizer) after the model is loaded.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token='[PAD]'))


def preprocess_function(examples):
    """Tokenize the ``review`` field of a batch.

    Every review is truncated and padded to exactly ``MAX_LENGTH`` tokens.

    Args:
        examples: batch mapping that contains a ``'review'`` entry
            (presumably a list of strings, as supplied by ``Dataset.map(batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=MAX_LENGTH)
    return tokenizer(examples['review'], **tokenizer_options)


def _tokenize_split(split_ds):
    """Tokenize one split and expose it to PyTorch.

    Drops every column except ``sentiment`` and ``review``, renames ``sentiment`` to
    ``labels`` and returns all columns but the raw ``review`` text in torch format.
    """
    extra_columns = [c for c in split_ds.column_names if c not in ('sentiment', 'review')]
    tokenized = split_ds.map(preprocess_function, batched=True, remove_columns=extra_columns)

    # Rename label column
    tokenized = tokenized.rename_column('sentiment', 'labels')

    # Set format to PyTorch (the raw review text is left out of the formatted columns)
    tensor_columns = [c for c in tokenized.column_names if c != 'review']
    tokenized.set_format(type='torch', columns=tensor_columns)
    return tokenized


train_tok = _tokenize_split(train_ds)
test_tok = _tokenize_split(test_ds)


print(train_tok[0])

In [None]:
# %%
# 7) Data collator
# Batches are assembled (and padded where needed) by the tokenizer-aware collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# %%
# 8) Prepare quantization config (BitsAndBytes) for 4-bit QLoRA:
#    NF4 quantization type, double quantization enabled, float16 compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# %%
# 9) Load the model in 4-bit mode and prepare for k-bit training
print('\nLoading model in 4-bit mode (this may take a while)...')
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    num_labels=2,
)


# Important: for some older HF versions use load_in_4bit=True directly (kept as quantization_config for clarity)


# Keep the embedding matrix in sync with the tokenizer in case a pad token was added earlier
model.resize_token_embeddings(len(tokenizer))

In [None]:
# %%
# 10) Prepare model for k-bit training (patching some layers for stability)
# Rebinds `model` to the version PEFT has prepared for k-bit training
model = prepare_model_for_kbit_training(model)


# %%
# 11) LoRA config & applying PEFT
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "key", "value"],  # BERT attention modules
)

In [None]:
# Wrap the prepared model with the LoRA adapters described by `lora_config`.
# `model` is rebound here, so every later cell works with the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

In [None]:
# %%
# 12) Utility to count parameters
def count_parameters(model):
    """Count the total and trainable parameters of ``model``.

    bitsandbytes 4-bit weights (class ``Params4bit``) pack two 4-bit values into every
    stored byte, so ``numel()`` reports only the packed storage size. Those tensors are
    scaled back to their logical size here (the same correction PEFT applies when it
    reports trainable parameters); otherwise a QLoRA model looks roughly half as large
    as it is and the trainable fraction is overstated.

    Args:
        model: any object whose ``parameters()`` yields items exposing ``numel()`` and
            ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        tuple[int, int]: ``(total, trainable)`` parameter counts.
    """
    total = 0
    trainable = 0
    # Single pass over the parameters instead of iterating the model twice
    for param in model.parameters():
        count = param.numel()
        if type(param).__name__ == 'Params4bit':
            # Two 4-bit weights per stored byte; element_size() is the byte width of
            # the packed storage dtype (1 for the usual uint8 storage).
            storage_bytes = param.element_size() if hasattr(param, 'element_size') else 1
            count *= 2 * storage_bytes
        total += count
        if param.requires_grad:
            trainable += count
    return total, trainable


total_params, trainable_params = count_parameters(model)
print(f'Total params: {total_params}')
print(f'Trainable params (after LoRA): {trainable_params}')
print(f'Trainable fraction: {trainable_params / total_params:.6f}')


# Persist the counts next to the other run artefacts for later reporting
param_counts_path = os.path.join(OUTPUT_DIR, 'param_counts.txt')
with open(param_counts_path, 'w') as f:
    f.write(f"total={total_params}\ntrainable={trainable_params}\n")

In [None]:
# %%
# 13) Metrics and compute_metrics function
# Load the four evaluation metrics once; compute_metrics() reuses these module-level objects
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name) for metric_name in ('accuracy', 'precision', 'recall', 'f1')
)


def compute_metrics(eval_pred):
    """Turn evaluation predictions into accuracy, precision, recall and F1.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the argmax of the
            logits over their last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision/recall/F1 are computed with ``average='binary'``).
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    scores = {'accuracy': accuracy.compute(predictions=preds, references=labels)['accuracy']}
    binary_metrics = (('precision', precision), ('recall', recall), ('f1', f1))
    for key, metric in binary_metrics:
        result = metric.compute(predictions=preds, references=labels, average='binary')
        scores[key] = result[key]
    return scores

In [None]:
# %%
# 14) Training arguments and Trainer setup
# This run trains without intermediate evaluation. To evaluate every epoch and restore
# the best checkpoint at the end, additionally pass:
#     eval_strategy="epoch"   (named `evaluation_strategy` in older transformers releases),
#     save_strategy="epoch", load_best_model_at_end=True, greater_is_better=True
# and report_to=["wandb"] to log to Weights & Biases (requires a W&B login).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_strategy='steps',
    logging_steps=50,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=1,
    seed=SEED,
    # NOTE(review): appears to have no effect unless load_best_model_at_end is enabled — confirm
    metric_for_best_model='accuracy',
)

# Recent transformers releases take the tokenizer through `processing_class` and deprecate
# the `tokenizer` keyword (announced for removal in 5.0). Pick whichever keyword the
# installed Trainer accepts so this cell works on both old and new versions.
trainer_init_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = 'processing_class' if 'processing_class' in trainer_init_params else 'tokenizer'

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# %%
# 15) (Optional) Initialize Weights & Biases if you want to log there; otherwise HF Trainer will still log locally
# import wandb
# wandb.init(project='bert-qlora-imdb', name='bert-qlora-run')


# %%
# 16) Report baseline GPU memory BEFORE training
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # Restart peak tracking so a later torch.cuda.max_memory_allocated() reading covers
    # training only, not the model-loading phase.
    torch.cuda.reset_peak_memory_stats()
    before_mem = torch.cuda.memory_allocated()
    print('GPU memory allocated before training (bytes):', before_mem)
else:
    print('No CUDA device; memory measurements will be skipped')

In [None]:
# %%
# 17) Train
# Wall-clock timing around the full training run
start = time.time()
trainer.train()
end = time.time()
print(f'Training time (s): {end - start}')

In [None]:
# %%
# 18) GPU memory AFTER training
if torch.cuda.is_available():
    after_mem = torch.cuda.memory_allocated()
    print('GPU memory allocated after training (bytes):', after_mem)
    print('Delta (bytes):', after_mem - before_mem)
    # The before/after delta only shows what is still allocated once training has finished;
    # activations and optimizer buffers used during training show up in the peak instead.
    # The peak is measured since the last torch.cuda.reset_peak_memory_stats() call
    # (or since process start if it was never reset).
    peak_mem = torch.cuda.max_memory_allocated()
    print('Peak GPU memory allocated (bytes):', peak_mem)

In [None]:
# %%
# 19) Save PEFT/LoRA adapters and model
model_dir = os.path.join(OUTPUT_DIR, 'qlora_lora_model')
adapter_dir = os.path.join(OUTPUT_DIR, 'qlora_lora_adapter')

# Save through the Trainer first, then save the PEFT adapter separately
trainer.save_model(model_dir)
model.save_pretrained(adapter_dir)

In [None]:
# %%
# 20) Evaluate on the test set
# Score the held-out split; `metrics` is reused by the summary cell further down
metrics = trainer.evaluate(eval_dataset=test_tok)
print(f'Eval metrics: {metrics}')

In [None]:
# %%
# 21) Demonstrate trainable parameter counts before and after applying LoRA
# NOTE: If you want to see the model parameter counts BEFORE adding LoRA, you would need to load the same model
# without applying get_peft_model. For convenience we show how to do it (commented) and show counts for current model.


# Uncomment to compute counts for a non-PEFT baseline (may be heavy if loaded in full precision):
# baseline_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# baseline_total, baseline_trainable = sum(p.numel() for p in baseline_model.parameters()), sum(p.numel() for p in baseline_model.parameters() if p.requires_grad)
# print('Baseline total params:', baseline_total, 'Baseline trainable:', baseline_trainable)


# Counts computed earlier by count_parameters() on the LoRA-wrapped model
print(f'Current total params: {total_params}')
print(f'Current trainable params (LoRA adapters): {trainable_params}')

In [None]:
# %%
# 22) Memory efficiency comparison between full fine-tuning and QLoRA
# Provide guidance and a small programmatic check when possible. If you can load full model in the environment, uncomment.


# NOTE: Full fine-tuning would require loading the full model in fp16 or fp32 and then fine-tuning which might not be possible on small GPUs.
# The recommended approach: run the baseline on a machine with enough memory or use the HF Hub to retrieve known model sizes.


# Example code to (optionally) measure full model memory usage (commented):
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()
#     t0 = torch.cuda.memory_allocated()
#     full = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to('cuda')
#     t1 = torch.cuda.memory_allocated()
#     print('Full model memory delta (bytes):', t1-t0)

In [None]:
# %%
# 23) Summary report generation (basic)
# Gather the run configuration, parameter counts and evaluation metrics in one record
summary = dict(
    model_name=MODEL_NAME,
    dataset=DATASET,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    total_params=total_params,
    trainable_params=trainable_params,
    eval_metrics=metrics,
)


pprint(summary)


# Keep a plain-text copy of the record alongside the other outputs
summary_path = os.path.join(OUTPUT_DIR, 'summary.txt')
with open(summary_path, 'w') as f:
    f.write(str(summary))

### Next Steps

In [None]:
# %%
# 24) Next steps / further experiments (printed for convenience)
# Suggested follow-up experiments, printed one per line
next_steps = (
    '- Run a full fine-tuning baseline (non-quantized) to compare accuracy and memory (requires GPU memory)',
    '- Try different LoRA ranks (r) and lora_alpha to find a better accuracy/memory sweet spot',
    '- Use smaller MAX_LENGTH or smaller batch size to fit in lower-memory GPUs',
    '- Use HF Trainer callbacks or accelerate for distributed training',
)
print('\nNext steps you can run:')
print('\n'.join(next_steps))


# End of notebook