# LoRA Fine-tuning Benchmark: DistilBERT on SST-2

This notebook benchmarks LoRA (Low-Rank Adaptation) at several ranks for fine-tuning DistilBERT on the SST-2 sentiment classification task.

**Configuration:**
- Base Model: DistilBERT
- Dataset: SST-2 (Stanford Sentiment Treebank)
- Split Ratio: Train:Val:Test = 8:1:1 (contiguous slices of the GLUE SST-2 `train` split)
- LoRA Ranks: [2, 4, 8, 16]
- Training Epochs: 10
- Random Seed: 42

**Metrics Tracked:**
- Final accuracy and F1 (validation and test)
- Running time per epoch
- Total training time
- Time to convergence
- GPU memory used
- Total parameters
- Trainable parameters
- Convergence epoch (the epoch with the best validation F1)

In [1]:
# Import necessary libraries
import torch
import numpy as np
import random
import time
import json
import warnings
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score
import gc

# Ignore UserWarning messages attributed to the module 'torch.nn.parallel._functions';
# all other warnings keep their default behaviour.
warnings.filterwarnings('ignore', category=UserWarning, module='torch.nn.parallel._functions')

# Helper function to check if this is the main process (for multi-GPU training)
def is_main_process():
    """Return True when this process should be the one that prints.

    The decision rests only on the LOCAL_RANK environment variable: if it
    is unset the process counts as the main one; otherwise only the process
    whose LOCAL_RANK parses to the integer 0 qualifies.
    """
    rank_text = os.environ.get("LOCAL_RANK")
    return rank_text is None or int(rank_text) == 0

# Pick the compute device once; the module-level `device` is reused when models are moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if is_main_process():
    print(f"Using device: {device}")
    if device.type == "cuda":
        # GPU index 0 is queried explicitly for both the name and the allocation figure.
        print("GPU: " + torch.cuda.get_device_name(0))
        print("Initial GPU Memory: {:.2f} MB".format(torch.cuda.memory_allocated(0) / 1024**2))

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
GPU: Tesla P100-PCIE-16GB
Initial GPU Memory: 0.00 MB


In [2]:
# Set random seed for reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), then puts cuDNN into deterministic mode with
    benchmark autotuning switched off.

    Args:
        seed: Seed value handed to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all generators once here, ahead of the data-loading and training cells.
set_seed(seed=42)
print("Random seed set to", 42)

Random seed set to 42


In [3]:
# Download SST-2 from GLUE and carve train/val/test out of its "train" split
print("Loading SST-2 dataset...")
dataset = load_dataset("glue", "sst2")

# Only the "train" split of the download is used for all three subsets
train_data = dataset["train"]
total_samples = len(train_data)

# 8:1:1 sizes; the test subset absorbs whatever the two int() truncations leave over
train_size = int(0.8 * total_samples)
val_size = int(0.1 * total_samples)
test_size = total_samples - (train_size + val_size)

# Contiguous, order-preserving index ranges: train first, then validation, then test
train_dataset = train_data.select(range(0, train_size))
val_dataset = train_data.select(range(train_size, train_size + val_size))
test_dataset = train_data.select(range(train_size + val_size, total_samples))

print(
    "Dataset split complete:",
    f"  Train: {len(train_dataset)} samples",
    f"  Validation: {len(val_dataset)} samples",
    f"  Test: {len(test_dataset)} samples",
    sep="\n",
)

Loading SST-2 dataset...
Dataset split complete:
  Train: 53879 samples
  Validation: 6734 samples
  Test: 6736 samples
Dataset split complete:
  Train: 53879 samples
  Validation: 6734 samples
  Test: 6736 samples


In [4]:
# Tokenize the three splits with the DistilBERT tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Sequences are truncated to, and padded up to, a fixed length of 128.
    """
    return tokenizer(
        examples["sentence"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

print("Tokenizing datasets...")
# Map the splits in train -> val -> test order, batched for speed
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

print("Tokenization complete!")

Loading tokenizer...
Tokenizing datasets...
Tokenization complete!
Tokenizing datasets...
Tokenization complete!


In [5]:
# Define compute metrics function
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Convert raw evaluation outputs into accuracy and binary F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is reduced
            to class indices with an argmax over axis 1 before scoring.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(labels, predicted_classes)
    f1 = f1_score(labels, predicted_classes, average="binary")
    return {"accuracy": accuracy, "f1": f1}

In [6]:
# Custom Trainer to track metrics per epoch
from transformers.trainer_callback import TrainerCallback

class MetricsCallback(TrainerCallback):
    """Trainer callback that records per-epoch timings and validation metrics.

    After training, the collected values can be read from the instance:
    ``epoch_times``, ``epoch_accuracies``, ``epoch_f1s``, ``best_f1`` and
    ``best_f1_epoch``.
    """

    def __init__(self):
        # Seconds elapsed between on_epoch_begin and on_epoch_end, one entry per epoch
        self.epoch_times = []
        # eval_accuracy / eval_f1 of the first evaluation seen for each epoch
        self.epoch_accuracies = []
        self.epoch_f1s = []
        # perf_counter() reading taken at the start of the current epoch
        self.epoch_start_time = None
        # Highest eval_f1 seen so far and the (integer) epoch it occurred in
        self.best_f1 = 0.0
        self.best_f1_epoch = None
        # Epochs already recorded, so repeated evaluations are not double counted
        self.logged_epochs = set()

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Start the timer for the epoch that is about to run."""
        # perf_counter is monotonic, so the duration cannot be skewed by
        # system clock adjustments during a long epoch.
        self.epoch_start_time = time.perf_counter()

    def on_epoch_end(self, args, state, control, **kwargs):
        """Store the elapsed time of the epoch that just finished."""
        epoch_time = time.perf_counter() - self.epoch_start_time
        self.epoch_times.append(epoch_time)

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Record accuracy/F1 once per epoch and track the best-F1 epoch.

        Evaluations without an ``eval_accuracy`` entry, evaluations made
        before any training progress, and repeat evaluations for an epoch
        that was already recorded are all ignored.
        """
        if 'eval_accuracy' not in metrics:
            return
        # TrainerState.epoch is optional: it may still be None when
        # evaluate() is called outside of training, and None > 0 would raise.
        if state.epoch is None or not state.epoch > 0:
            return

        accuracy = metrics['eval_accuracy']
        f1 = metrics.get('eval_f1', 0.0)
        current_epoch = int(state.epoch)

        # Only log data once per epoch (no printing to avoid duplicates)
        if current_epoch in self.logged_epochs:
            return
        self.logged_epochs.add(current_epoch)
        self.epoch_accuracies.append(accuracy)
        self.epoch_f1s.append(f1)

        # Track best F1 epoch for convergence (strict ">" keeps the earliest best)
        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_f1_epoch = current_epoch

In [7]:
# Function to train and evaluate LoRA model with a specific rank
def train_lora_model(rank, epochs=10):
    """Fine-tune DistilBERT with LoRA at one rank and collect benchmark metrics.

    A fresh ``distilbert-base-uncased`` classifier is wrapped with a LoRA
    adapter on the ``q_lin`` and ``v_lin`` modules (alpha = 2 * rank,
    dropout 0.1), trained with the Hugging Face ``Trainer`` on
    ``tokenized_train``, evaluated every epoch on ``tokenized_val`` and
    finally on ``tokenized_test``. Checkpoints are saved every epoch and the
    one with the best validation F1 is reloaded at the end of training.

    Args:
        rank: LoRA rank ``r``.
        epochs: Number of training epochs. Defaults to 10.

    Returns:
        dict with the keys ``rank``, ``final_test_accuracy``,
        ``final_test_f1``, ``final_val_accuracy``, ``final_val_f1``,
        ``total_parameters``, ``trainable_parameters``,
        ``trainable_percentage``, ``total_training_time``,
        ``average_epoch_time``, ``epoch_times``, ``epoch_accuracies``,
        ``epoch_f1s``, ``convergence_epoch``, ``time_to_convergence`` and
        ``peak_gpu_memory_mb``. Times are in seconds, memory in MB.
    """
    print(f"\n{'='*80}")
    print(f"Training LoRA model with rank = {rank}")
    print(f"{'='*80}\n")

    # Reset seed for each run
    set_seed(42)

    # Start from a clean allocator state so leftovers of a previous run do
    # not distort this run's memory figures.
    _release_gpu_memory()

    # Load base model
    print("Loading base DistilBERT model...")
    base_model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2
    )

    # Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=rank,
        lora_alpha=rank * 2,  # Common practice: alpha = 2 * r
        lora_dropout=0.1,
        target_modules=["q_lin", "v_lin"],  # DistilBERT attention modules
    )

    # Apply LoRA to the model
    model = get_peft_model(base_model, lora_config)

    # Print model parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nModel Parameters:")
    print(f"  Total parameters: {total_params:,}")
    print(f"  Trainable parameters: {trainable_params:,}")
    print(f"  Trainable %: {100 * trainable_params / total_params:.2f}%\n")

    # Move model to device
    model.to(device)

    # Record GPU memory before training (print once using is_main_process).
    # Peak statistics are reset here so the later peak reading covers only
    # what happens from this point on.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        initial_memory = torch.cuda.memory_allocated() / 1024**2
        if is_main_process():
            print(f"GPU Memory before training: {initial_memory:.2f} MB")

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_rank_{rank}",
        num_train_epochs=epochs,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=100,
        eval_strategy="epoch",  # Changed from evaluation_strategy
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=1,
        seed=42,
        report_to="none",  # Disable wandb/tensorboard
        disable_tqdm=False,  # Keep progress bar
    )

    # Create metrics callback
    metrics_callback = MetricsCallback()

    # Create trainer with callback
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
        callbacks=[metrics_callback],
    )

    # Print starting message
    if is_main_process():
        print("Starting training...")

    # perf_counter is monotonic, so the measured duration is immune to
    # system clock adjustments during the (long) training run.
    start_time = time.perf_counter()
    trainer.train()
    total_training_time = time.perf_counter() - start_time

    # Get peak GPU memory
    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated() / 1024**2
        print(f"\nPeak GPU Memory during training: {peak_memory:.2f} MB")
    else:
        peak_memory = 0

    # Evaluate on test set
    print("\nEvaluating on test set...")
    test_results = trainer.evaluate(tokenized_test)
    test_accuracy = test_results['eval_accuracy']
    test_f1 = test_results.get('eval_f1', 0.0)

    # Calculate time to convergence (to best F1 epoch): the sum of the
    # callback's recorded epoch times up to and including that epoch.
    if metrics_callback.best_f1_epoch is not None:
        convergence_epoch = metrics_callback.best_f1_epoch
        time_to_convergence = sum(metrics_callback.epoch_times[:int(convergence_epoch)])
    else:
        convergence_epoch = epochs
        time_to_convergence = total_training_time

    # Get final validation accuracy and F1 (values of the last recorded epoch)
    if len(metrics_callback.epoch_accuracies) > 0:
        final_val_accuracy = metrics_callback.epoch_accuracies[-1]
        final_val_f1 = metrics_callback.epoch_f1s[-1]
    else:
        # If epoch_accuracies is empty, evaluate manually
        val_results = trainer.evaluate(tokenized_val)
        final_val_accuracy = val_results['eval_accuracy']
        final_val_f1 = val_results.get('eval_f1', 0.0)

    # Compile results
    results = {
        "rank": rank,
        "final_test_accuracy": test_accuracy,
        "final_test_f1": test_f1,
        "final_val_accuracy": final_val_accuracy,
        "final_val_f1": final_val_f1,
        "total_parameters": total_params,
        "trainable_parameters": trainable_params,
        "trainable_percentage": 100 * trainable_params / total_params,
        "total_training_time": total_training_time,
        "average_epoch_time": np.mean(metrics_callback.epoch_times) if len(metrics_callback.epoch_times) > 0 else 0,
        "epoch_times": metrics_callback.epoch_times,
        "epoch_accuracies": metrics_callback.epoch_accuracies,
        "epoch_f1s": metrics_callback.epoch_f1s,
        "convergence_epoch": convergence_epoch,
        "time_to_convergence": time_to_convergence,
        "peak_gpu_memory_mb": peak_memory,
    }

    print(f"\n{'='*80}")
    print(f"Results for rank = {rank}:")
    print(f"  Test Accuracy: {test_accuracy:.4f}")
    print(f"  Test F1: {test_f1:.4f}")
    print(f"  Validation Accuracy: {final_val_accuracy:.4f}")
    print(f"  Validation F1: {final_val_f1:.4f}")
    print(f"  Total Training Time: {total_training_time:.2f}s")
    print(f"  Average Epoch Time: {results['average_epoch_time']:.2f}s")
    print(f"  Convergence Epoch (Best F1): {convergence_epoch}")
    print(f"  Time to Convergence: {time_to_convergence:.2f}s")
    print(f"  Peak GPU Memory: {peak_memory:.2f} MB")
    print(f"{'='*80}\n")

    # Clean up: drop every local that still references the model weights.
    # `base_model` must go too - it is the module wrapped by `model`, and
    # while it is referenced the weights cannot be released.
    del model
    del trainer
    del base_model
    _release_gpu_memory()

    return results


def _release_gpu_memory():
    """Collect Python garbage, then release unused cached CUDA memory.

    The order matters: objects kept alive only by reference cycles are
    destroyed by gc.collect(), and only after that can empty_cache() hand
    the blocks they occupied back to the GPU.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [8]:
# Benchmark driver: train one LoRA model per rank and store each result as JSON
ranks_to_test = [2, 4, 8, 16]
all_results = []

print(
    "Starting LoRA benchmark experiments...",
    f"Testing ranks: {ranks_to_test}",
    "Training epochs: 10",
    "Random seed: 42\n",
    sep="\n",
)

for rank in ranks_to_test:
    results = train_lora_model(rank, epochs=10)
    all_results.append(results)

    # Write this rank's metrics to its own file before the next rank starts
    with open(f"lora_rank_{rank}_results.json", "w") as f:
        json.dump(results, f, indent=2)

print("\n" + "=" * 80, "All experiments completed!", "=" * 80, sep="\n")

Starting LoRA benchmark experiments...
Testing ranks: [2, 4, 8, 16]
Training epochs: 10
Random seed: 42


Training LoRA model with rank = 2

Loading base DistilBERT model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Parameters:
  Total parameters: 67,584,004
  Trainable parameters: 628,994
  Trainable %: 0.93%

GPU Memory before training: 258.90 MB
Starting training...
GPU Memory before training: 258.90 MB
Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3387,0.315919,0.865904,0.876386
2,0.3134,0.301255,0.870805,0.881954
3,0.3131,0.295097,0.87526,0.887248
4,0.2675,0.286099,0.878824,0.889371
5,0.2893,0.282379,0.881645,0.89237
6,0.266,0.279483,0.882388,0.892712
7,0.2592,0.277525,0.883724,0.894744
8,0.2613,0.276058,0.884912,0.895538
9,0.2469,0.274709,0.884467,0.894092
10,0.241,0.274646,0.885209,0.895357



Peak GPU Memory during training: 808.04 MB

Evaluating on test set...



Results for rank = 2:
  Test Accuracy: 0.8897
  Test F1: 0.9015
  Validation Accuracy: 0.8852
  Validation F1: 0.8954
  Total Training Time: 2422.85s
  Average Epoch Time: 228.30s
  Convergence Epoch (Best F1): 8
  Time to Convergence: 1827.12s
  Peak GPU Memory: 808.04 MB


Training LoRA model with rank = 4


Training LoRA model with rank = 4



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading base DistilBERT model...

Model Parameters:
  Total parameters: 67,620,868
  Trainable parameters: 665,858
  Trainable %: 0.98%

GPU Memory before training: 274.79 MB
Starting training...
GPU Memory before training: 274.79 MB
Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3325,0.309967,0.867983,0.878035
2,0.3049,0.294441,0.876596,0.887475
3,0.3052,0.28896,0.879418,0.891124
4,0.2594,0.279304,0.882536,0.893036
5,0.2811,0.275502,0.885061,0.895462
6,0.2578,0.272024,0.886843,0.896804
7,0.25,0.270282,0.888179,0.898749
8,0.2528,0.268471,0.889516,0.899946
9,0.2378,0.266866,0.89011,0.899374
10,0.2302,0.266725,0.888922,0.898919



Peak GPU Memory during training: 806.76 MB

Evaluating on test set...



Results for rank = 4:
  Test Accuracy: 0.8928
  Test F1: 0.9044
  Validation Accuracy: 0.8889
  Validation F1: 0.8989
  Total Training Time: 2422.04s
  Average Epoch Time: 228.25s
  Convergence Epoch (Best F1): 8
  Time to Convergence: 1825.61s
  Peak GPU Memory: 806.76 MB


Training LoRA model with rank = 8


Training LoRA model with rank = 8



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading base DistilBERT model...

Model Parameters:
  Total parameters: 67,694,596
  Trainable parameters: 739,586
  Trainable %: 1.09%

GPU Memory before training: 275.07 MB
Starting training...
GPU Memory before training: 275.07 MB
Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3273,0.303425,0.870805,0.88056
2,0.2947,0.287441,0.880606,0.891086
3,0.2968,0.282235,0.882536,0.894124
4,0.2492,0.271735,0.886249,0.896542
5,0.2704,0.267825,0.888625,0.898895
6,0.2474,0.263334,0.888773,0.898633
7,0.2391,0.261552,0.890704,0.900995
8,0.2414,0.259488,0.891298,0.901639
9,0.2261,0.257773,0.893377,0.902499
10,0.2185,0.257548,0.89308,0.90286



Peak GPU Memory during training: 808.36 MB

Evaluating on test set...



Results for rank = 8:
  Test Accuracy: 0.8989
  Test F1: 0.9096
  Validation Accuracy: 0.8931
  Validation F1: 0.9029
  Total Training Time: 2405.53s
  Average Epoch Time: 226.56s
  Convergence Epoch (Best F1): 10
  Time to Convergence: 2265.59s
  Peak GPU Memory: 808.36 MB


Training LoRA model with rank = 16


Training LoRA model with rank = 16



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading base DistilBERT model...

Model Parameters:
  Total parameters: 67,842,052
  Trainable parameters: 887,042
  Trainable %: 1.31%

GPU Memory before training: 275.63 MB
Starting training...
GPU Memory before training: 275.63 MB
Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3197,0.296221,0.875854,0.885479
2,0.2866,0.279939,0.88417,0.89468
3,0.2859,0.274902,0.888922,0.89992
4,0.2399,0.262944,0.891595,0.901829
5,0.2584,0.259316,0.893228,0.903529
6,0.2347,0.25301,0.897089,0.906085
7,0.2272,0.250688,0.896941,0.906519
8,0.2298,0.248927,0.897386,0.907161
9,0.2133,0.24666,0.900059,0.908795
10,0.205,0.246682,0.898723,0.907937



Peak GPU Memory during training: 812.39 MB

Evaluating on test set...



Results for rank = 16:
  Test Accuracy: 0.9050
  Test F1: 0.9146
  Validation Accuracy: 0.8987
  Validation F1: 0.9079
  Total Training Time: 2412.66s
  Average Epoch Time: 227.24s
  Convergence Epoch (Best F1): 9
  Time to Convergence: 2046.06s
  Peak GPU Memory: 812.39 MB


All experiments completed!

All experiments completed!


In [9]:
# Build a display-ready summary of all runs, print it, and persist it as CSV + JSON
import pandas as pd

# One row per benchmarked rank; numeric values are pre-formatted as strings
summary_data = []
for result in all_results:
    summary_data.append({
        "Rank": result["rank"],
        "Test Accuracy": format(result["final_test_accuracy"], ".4f"),
        "Test F1": format(result["final_test_f1"], ".4f"),
        "Val Accuracy": format(result["final_val_accuracy"], ".4f"),
        "Val F1": format(result["final_val_f1"], ".4f"),
        "Total Params": format(result["total_parameters"], ","),
        "Trainable Params": format(result["trainable_parameters"], ","),
        "Trainable Ratio%": format(result["trainable_percentage"], ".2f") + "%",
        "Total Time (s)": format(result["total_training_time"], ".2f"),
        "Avg Epoch Time (s)": format(result["average_epoch_time"], ".2f"),
        "Convergence Epoch": result["convergence_epoch"],
        "Time to Convergence (s)": format(result["time_to_convergence"], ".2f"),
        "Peak GPU Memory (MB)": format(result["peak_gpu_memory_mb"], ".2f"),
    })

summary_df = pd.DataFrame(summary_data)
print("\n" + "=" * 80, "BENCHMARK SUMMARY", "=" * 80, sep="\n")
print(summary_df.to_string(index=False))

# Formatted table goes to CSV
summary_df.to_csv("lora_benchmark_summary.csv", index=False)
print("\n✓ Summary saved to 'lora_benchmark_summary.csv'")

# Unformatted per-run dictionaries go to JSON
with open("lora_all_results.json", "w") as f:
    json.dump(all_results, f, indent=2)
print("✓ Complete results saved to 'lora_all_results.json'")


BENCHMARK SUMMARY
 Rank Test Accuracy Test F1 Val Accuracy Val F1 Total Params Trainable Params Trainable Ratio% Total Time (s) Avg Epoch Time (s)  Convergence Epoch Time to Convergence (s) Peak GPU Memory (MB)
    2        0.8897  0.9015       0.8852 0.8954   67,584,004          628,994            0.93%        2422.85             228.30                  8                 1827.12               808.04
    4        0.8928  0.9044       0.8889 0.8989   67,620,868          665,858            0.98%        2422.04             228.25                  8                 1825.61               806.76
    8        0.8989  0.9096       0.8931 0.9029   67,694,596          739,586            1.09%        2405.53             226.56                 10                 2265.59               808.36
   16        0.9050  0.9146       0.8987 0.9079   67,842,052          887,042            1.31%        2412.66             227.24                  9                 2046.06               812.39

✓ Summary saved