# LoRA Fine-tuning Benchmark: DistilBERT on IMDB

This notebook benchmarks LoRA adaptation methods for fine-tuning DistilBERT on the IMDB sentiment classification task.

**Configuration:**
- Base Model: DistilBERT
- Dataset: IMDB (Movie Reviews)
- Split Ratio: Train:Val:Test = 8:1:1
- LoRA Ranks: [2, 4, 8, 16]
- Training Epochs: 10
- Random Seed: 42

**Metrics Tracked:**
- Final Accuracy
- Running time per epoch
- Total training time
- Time to convergence
- GPU memory used
- Total parameters
- Trainable parameters
- Convergence epoch

In [None]:
# Import necessary libraries
import torch
import numpy as np
import random
import time
import json
import warnings
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score
import gc

# Filter out specific warnings
warnings.filterwarnings('ignore', category=UserWarning, module='torch.nn.parallel._functions')

# Helper function to check if this is the main process (for multi-GPU training)
def is_main_process():
    # Check LOCAL_RANK environment variable
    # In distributed training: rank 0 is main process
    # In single process or DataParallel: LOCAL_RANK not set, return True
    local_rank = os.environ.get("LOCAL_RANK", None)
    if local_rank is None:
        return True
    return int(local_rank) == 0

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if is_main_process():
    print(f"Using device: {device}")
    if torch.cuda.is_available():
        print(f"Number of GPUs available: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"Initial GPU Memory: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Set random seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)
print("Random seed set to 42")

Random seed set to 42


In [4]:
# Load and prepare IMDB dataset
print("Loading IMDB dataset...")
dataset = load_dataset("imdb")

# Get full training data
train_data = dataset["train"]
total_samples = len(train_data)

# Calculate split sizes (8:1:1)
train_size = int(0.8 * total_samples)
val_size = int(0.1 * total_samples)
test_size = total_samples - train_size - val_size

# Split the dataset
train_dataset = train_data.select(range(train_size))
val_dataset = train_data.select(range(train_size, train_size + val_size))
test_dataset = train_data.select(range(train_size + val_size, total_samples))

print(f"Dataset split complete:")
print(f"  Train: {len(train_dataset)} samples")
print(f"  Validation: {len(val_dataset)} samples")
print(f"  Test: {len(test_dataset)} samples")

Loading IMDB dataset...
Dataset split complete:
  Train: 20000 samples
  Validation: 2500 samples
  Test: 2500 samples
Dataset split complete:
  Train: 20000 samples
  Validation: 2500 samples
  Test: 2500 samples


In [5]:
# Tokenize the dataset
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

print("Tokenizing datasets...")
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

print("Tokenization complete!")

Loading tokenizer...
Tokenizing datasets...
Tokenization complete!
Tokenizing datasets...
Tokenization complete!


In [6]:
# Define compute metrics function
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="binary")
    }

In [7]:
# Custom Trainer to track metrics per epoch with early stopping
from transformers.trainer_callback import TrainerCallback

class MetricsCallback(TrainerCallback):
    def __init__(self, early_stop_patience=3):
        self.epoch_times = []
        self.epoch_accuracies = []
        self.epoch_f1s = []
        self.epoch_start_time = None
        self.best_f1 = 0.0
        self.best_f1_epoch = None
        self.logged_epochs = set()  # Track which epochs have been logged

        # Early stopping: stop when validation F1 fails to improve over best_f1 for `early_stop_patience` consecutive epochs
        self.early_stop_patience = early_stop_patience
        self.epochs_without_improvement = 0
        self.early_stopped = False

    def on_epoch_begin(self, args, state, control, **kwargs):
        self.epoch_start_time = time.time()

    def on_epoch_end(self, args, state, control, **kwargs):
        epoch_time = time.time() - self.epoch_start_time
        self.epoch_times.append(epoch_time)

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        # Called after evaluation
        if 'eval_accuracy' in metrics and state.epoch > 0:
            accuracy = metrics['eval_accuracy']
            f1 = metrics.get('eval_f1', 0.0)
            current_epoch = int(state.epoch)

            # Only log data once per epoch
            if current_epoch not in self.logged_epochs:
                self.logged_epochs.add(current_epoch)
                self.epoch_accuracies.append(accuracy)
                self.epoch_f1s.append(f1)

                # Update best F1 and reset counter if improved
                if f1 > self.best_f1:
                    self.best_f1 = f1
                    self.best_f1_epoch = current_epoch
                    self.epochs_without_improvement = 0
                else:
                    # F1 did not improve over best_f1
                    self.epochs_without_improvement += 1

                # Trigger early stopping if no improvement for patience epochs
                if self.epochs_without_improvement >= self.early_stop_patience and not self.early_stopped:
                    self.early_stopped = True
                    # Request Trainer to stop training after this evaluation
                    control.should_training_stop = True
                    if is_main_process():
                        print(f"\n[Early Stopping] No improvement over best F1 for {self.epochs_without_improvement} consecutive epochs. Stopping training at epoch {current_epoch}.")
                        print(f"[Early Stopping Info] Best F1: {self.best_f1:.4f} at epoch {self.best_f1_epoch}\n")

In [8]:
# Function to train and evaluate LoRA model with a specific rank
def train_lora_model(rank, epochs=30, resume_from_checkpoint=False):
    print(f"\n{'='*80}")
    print(f"Training LoRA model with rank = {rank}")
    print(f"{'='*80}\n")
    
    # Reset seed for each run
    set_seed(42)
    
    # Clear GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
    
    # Check for existing checkpoint
    output_dir = f"./results_imdb_lora_rank_{rank}"
    checkpoint_dir = None
    if resume_from_checkpoint and os.path.exists(output_dir):
        # Find the latest checkpoint
        checkpoints = [d for d in os.listdir(output_dir) if d.startswith("checkpoint-")]
        if checkpoints:
            # Sort by epoch number (checkpoint-XXXX)
            checkpoints.sort(key=lambda x: int(x.split("-")[1]))
            checkpoint_dir = os.path.join(output_dir, checkpoints[-1])
            print(f"Found existing checkpoint: {checkpoint_dir}")
            print(f"Resuming training from this checkpoint...\n")
    
    # Load base model
    print("Loading base DistilBERT model...")
    base_model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2
    )
    
    # Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=rank,
        lora_alpha=rank * 2,  # Common practice: alpha = 2 * r
        lora_dropout=0.1,
        target_modules=["q_lin", "v_lin"],  # DistilBERT attention modules
    )
    
    # Apply LoRA to the model
    model = get_peft_model(base_model, lora_config)
    
    # Print model parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nModel Parameters:")
    print(f"  Total parameters: {total_params:,}")
    print(f"  Trainable parameters: {trainable_params:,}")
    print(f"  Trainable %: {100 * trainable_params / total_params:.2f}%\n")
    
    # Move model to device
    model.to(device)

    # Record GPU memory before training (print once using is_main_process)
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        initial_memory = torch.cuda.memory_allocated() / 1024**2
        if is_main_process():
            print(f"GPU Memory before training: {initial_memory:.2f} MB")

    # Training arguments - configured for multi-GPU
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=5e-4,
        weight_decay=0.01,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=2,  # Keep 2 checkpoints to prevent data loss
        seed=42,
        report_to="none",
        disable_tqdm=False,
        # Multi-GPU settings
        local_rank=-1,  # Let Trainer handle device placement
        dataloader_num_workers=4,  # Parallel data loading
        dataloader_pin_memory=True,
    )

    # Create metrics callback
    metrics_callback = MetricsCallback()

    # Create trainer with callback
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
        callbacks=[metrics_callback],
    )

    # Print starting message
    if is_main_process():
        if checkpoint_dir:
            print(f"Resuming training from checkpoint...")
        else:
            print("Starting training from scratch...")
        if torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUs for training")

    start_time = time.time()
    # Resume from checkpoint if available
    trainer.train(resume_from_checkpoint=checkpoint_dir if checkpoint_dir else None)
    total_training_time = time.time() - start_time

    # Get peak GPU memory
    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated() / 1024**2
        print(f"\nPeak GPU Memory during training: {peak_memory:.2f} MB")
    else:
        peak_memory = 0

    # Evaluate on test set
    print("\nEvaluating on test set...")
    test_results = trainer.evaluate(tokenized_test)
    test_accuracy = test_results['eval_accuracy']
    test_f1 = test_results.get('eval_f1', 0.0)

    # Calculate time to convergence / time to early stop
    # When early stopped, the training has already stopped, so use total_training_time
    time_to_convergence = total_training_time

    # Get final validation accuracy and F1
    if len(metrics_callback.epoch_accuracies) > 0:
        final_val_accuracy = metrics_callback.epoch_accuracies[-1]
        final_val_f1 = metrics_callback.epoch_f1s[-1]
    else:
        # If epoch_accuracies is empty, evaluate manually
        val_results = trainer.evaluate(tokenized_val)
        final_val_accuracy = val_results['eval_accuracy']
        final_val_f1 = val_results.get('eval_f1', 0.0)

    # Compile results
    results = {
        "rank": rank,
        "final_test_accuracy": test_accuracy,
        "final_test_f1": test_f1,
        "final_val_accuracy": final_val_accuracy,
        "final_val_f1": final_val_f1,
        "total_parameters": total_params,
        "trainable_parameters": trainable_params,
        "trainable_percentage": 100 * trainable_params / total_params,
        "total_training_time": total_training_time,
        "average_epoch_time": np.mean(metrics_callback.epoch_times) if len(metrics_callback.epoch_times) > 0 else 0,
        "epoch_times": metrics_callback.epoch_times,
        "epoch_accuracies": metrics_callback.epoch_accuracies,
        "epoch_f1s": metrics_callback.epoch_f1s,
        "peak_gpu_memory_mb": peak_memory,
        "early_stopped": metrics_callback.early_stopped,
        "best_f1_epoch": metrics_callback.best_f1_epoch,
        "best_val_f1": metrics_callback.best_f1,
    }

    print(f"\n{'='*80}")
    print(f"Results for rank = {rank}:")
    print(f"  Test Accuracy: {test_accuracy:.4f}")
    print(f"  Test F1: {test_f1:.4f}")
    print(f"  Validation Accuracy: {final_val_accuracy:.4f}")
    print(f"  Validation F1: {final_val_f1:.4f}")
    print(f"  Total Training Time: {total_training_time:.2f}s")
    print(f"  Average Epoch Time: {results['average_epoch_time']:.2f}s")
    print(f"  Best F1 Epoch: {metrics_callback.best_f1_epoch}")
    print(f"  Best Val F1: {metrics_callback.best_f1:.4f}")
    print(f"  Early Stopped: {metrics_callback.early_stopped}")
    if metrics_callback.early_stopped:
        print(f"  Stop Info: epochs_without_improvement={metrics_callback.epochs_without_improvement}")
    print(f"  Peak GPU Memory: {peak_memory:.2f} MB")
    print(f"{'='*80}\n")

    # Clean up
    del model
    del trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    return results

In [9]:
# Run benchmarks for all ranks
ranks_to_test = [2, 4, 8, 16]
all_results = []

print("Starting LoRA benchmark experiments...")
print(f"Testing ranks: {ranks_to_test}")
print(f"Training epochs: 30")
print(f"Random seed: 42\n")

for rank in ranks_to_test:
    # Set resume_from_checkpoint=True to enable automatic resume if needed
    results = train_lora_model(rank, epochs=30)
    all_results.append(results)

    # Save individual result
    with open(f"imdb_lora_rank_{rank}_results.json", "w") as f:
        json.dump(results, f, indent=2)

print("\n" + "="*80)
print("All experiments completed!")
print("="*80)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting LoRA benchmark experiments...
Testing ranks: [2, 4, 8, 16]
Training epochs: 10
Random seed: 42


Training LoRA model with rank = 2

Loading base DistilBERT model...

Model Parameters:
  Total parameters: 67,584,004
  Trainable parameters: 628,994
  Trainable %: 0.93%

GPU Memory before training: 258.90 MB

Model Parameters:
  Total parameters: 67,584,004
  Trainable parameters: 628,994
  Trainable %: 0.93%

GPU Memory before training: 258.90 MB
Starting training...
Using 2 GPUs for training
Starting training...
Using 2 GPUs for training


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting LoRA benchmark experiments...
Testing ranks: [2, 4, 8, 16]
Training epochs: 10
Random seed: 42


Training LoRA model with rank = 2

Loading base DistilBERT model...

Model Parameters:
  Total parameters: 67,584,004
  Trainable parameters: 628,994
  Trainable %: 0.93%

GPU Memory before training: 258.90 MB

Model Parameters:
  Total parameters: 67,584,004
  Trainable parameters: 628,994
  Trainable %: 0.93%

GPU Memory before training: 258.90 MB
Starting training...
Using 2 GPUs for training
Starting training...
Using 2 GPUs for training


RuntimeError: NCCL Error 5: invalid usage (run with NCCL_DEBUG=WARN for details)

KeyboardInterrupt: 

In [None]:
# Save all results to a summary file
import pandas as pd

# Create summary DataFrame
summary_data = []
for result in all_results:
    # Check if this is old data without best_val_f1
    if 'best_val_f1' not in result:
        print(f"Warning: Result for rank {result['rank']} is missing 'best_val_f1'. Please re-run training.")
        best_val_f1_str = "N/A"
        best_f1_epoch_str = "N/A"
    else:
        best_val_f1_str = f"{result['best_val_f1']:.4f}"
        best_f1_epoch_str = str(result['best_f1_epoch'])
    
    summary_data.append({
        "Rank": result["rank"],
        "Test Acc": f"{result['final_test_accuracy']:.4f}",
        "Test F1": f"{result['final_test_f1']:.4f}",
        "Val Acc": f"{result['final_val_accuracy']:.4f}",
        "Val F1": f"{result['final_val_f1']:.4f}",
        "Best Val F1": best_val_f1_str,
        "Best F1 Epoch": best_f1_epoch_str,
        "Trainable Params": f"{result['trainable_parameters']:,}",
        "Trainable %": f"{result['trainable_percentage']:.2f}%",
        "Total Time (s)": f"{result['total_training_time']:.2f}",
        "Avg Epoch (s)": f"{result['average_epoch_time']:.2f}",
        "Early Stopped": "Yes" if result.get('early_stopped', False) else "No",
        "Peak GPU (MB)": f"{result['peak_gpu_memory_mb']:.2f}"
    })

summary_df = pd.DataFrame(summary_data)
print("\n" + "="*80)
print("BENCHMARK SUMMARY")
print("="*80)
print(summary_df.to_string(index=False))

# Save summary to CSV
summary_df.to_csv("imdb_lora_benchmark_summary.csv", index=False)
print("\n✓ Summary saved to 'imdb_lora_benchmark_summary.csv'")

# Save complete results
with open("imdb_lora_all_results.json", "w") as f:
    json.dump(all_results, f, indent=2)
print("✓ Complete results saved to 'imdb_lora_all_results.json'")


BENCHMARK SUMMARY
 Rank Test Accuracy Test F1 Val Accuracy Val F1 Total Params Trainable Params Trainable Ratio% Total Time (s) Avg Epoch Time (s)  Convergence Epoch Time to Convergence (s) Peak GPU Memory (MB)
    2        0.8897  0.9015       0.8852 0.8954   67,584,004          628,994            0.93%        2422.85             228.30                  8                 1827.12               808.04
    4        0.8928  0.9044       0.8889 0.8989   67,620,868          665,858            0.98%        2422.04             228.25                  8                 1825.61               806.76
    8        0.8989  0.9096       0.8931 0.9029   67,694,596          739,586            1.09%        2405.53             226.56                 10                 2265.59               808.36
   16        0.9050  0.9146       0.8987 0.9079   67,842,052          887,042            1.31%        2412.66             227.24                  9                 2046.06               812.39

✓ Summary saved