# Mini LoRA Benchmark Kit (Colab Version)

This notebook demonstrates a tiny local LoRA fine-tuning workflow using `distilbert-base-uncased` on a 12-sample binary classification task, then benchmarks inference latency, token throughput, and quality (accuracy/F1) vs the base model.

First, we'll set up Google Drive and install dependencies:

In [None]:
# Mount Google Drive (needed to save results)
from google.colab import drive
drive.mount('/content/drive')

# Create project directory
!mkdir -p /content/mini_lora_benchmark
!mkdir -p /content/mini_lora_benchmark/data
!mkdir -p /content/mini_lora_benchmark/results

In [None]:
# Install required packages
!pip install -q transformers>=4.30.0 datasets>=2.8.0 peft>=0.3.0 accelerate>=0.20.0 scikit-learn>=1.1.0 pandas>=1.3.0 torch>=1.12.0

In [None]:
# Inline CSV source for the 12-row sentiment dataset (1 = positive, 0 = negative)
mini_dataset = """
text,label
"I love this product, it works great!",1
"Terrible quality, broke after a day.",0
"Excellent value for money.",1
"I wouldn't buy this again.",0
"Very happy with the purchase.",1
"It was okay, nothing special.",0
"Exceeded my expectations.",1
"Not what I expected, disappointing.",0
"Five stars, highly recommend.",1
"One star. Do not recommend.",0
"Works fine for the price.",1
"Poorly made and slow.",0
"""

# Drop the blank first/last lines that the triple-quoted literal introduces,
# so the header is the first line of the file.
csv_payload = mini_dataset.strip()
with open('/content/mini_lora_benchmark/data/mini_dataset.csv', 'w') as csv_file:
    csv_file.write(csv_payload)

print("Created dataset at /content/mini_lora_benchmark/data/mini_dataset.csv")

In [None]:
import time
import json
import math
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score, classification_report
import torch
print('Imports OK')

In [None]:
# Read the CSV written earlier and force the label column to an integer dtype
df = pd.read_csv('/content/mini_lora_benchmark/data/mini_dataset.csv').astype({'label': int})
df

In [None]:
# Create a small stratified train/test split (10 train / 2 test).
# One example per label is held out, so the 2-row test set always contains both
# classes. A purely random 2-row sample can end up single-class, which makes
# F1 and the classification report degenerate.
test_df = df.groupby('label').sample(n=1, random_state=42)
train_df = df.drop(test_df.index)
train = Dataset.from_pandas(train_df.reset_index(drop=True))
test = Dataset.from_pandas(test_df.reset_index(drop=True))
train, test

In [None]:
# Tokenizer and model init
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The checkpoint has no sequence-classification head, so the head weights are
# freshly (randomly) initialized here. Seed torch first so the base model's
# metrics are reproducible across Restart & Run All.
torch.manual_seed(42)

# Base model (kept untouched for comparing base vs LoRA)
base_model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

def preprocess(batch):
    """Tokenize the `text` column of a batch and attach its labels.

    Padding is left off here; the encoding is returned with the batch's
    `label` values stored under the `labels` key.
    """
    encoded = tokenizer(batch['text'], truncation=True, padding=False)
    encoded['labels'] = batch['label']
    return encoded

# Tokenize both splits (train first, then test) with the same preprocessing function
train, test = (split.map(preprocess, batched=True) for split in (train, test))

# Collator that pads each batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer)

print('Tokenization done')

In [None]:
# Training arguments (tiny)
training_args = TrainingArguments(
    output_dir='/content/mini_lora_benchmark/outputs',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    logging_steps=10,
    save_strategy='no',
    disable_tqdm=False,
    fp16=torch.cuda.is_available(),  # Enable fp16 if CUDA available
    # Disable external experiment loggers (e.g. W&B) so the run never blocks
    # on an interactive login prompt during Restart & Run All.
    report_to='none',
)

def compute_metrics(pred):
    """Compute accuracy and F1 from a prediction object.

    Reads `pred.predictions` (argmax over axis 1 gives the predicted class)
    and `pred.label_ids` (reference labels). Returns a dict with the keys
    'accuracy' and 'f1'; F1 falls back to 0 when it is undefined.
    """
    y_true = pred.label_ids
    y_hat = np.argmax(pred.predictions, axis=1)
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': f1_score(y_true, y_hat, zero_division=0),
    }

print('Training args and metric function prepared')

### 1) Evaluate base model performance and inference latency

In [None]:
# Evaluate the base model as loaded (no fine-tuning); no train_dataset is passed
base_trainer_kwargs = {
    'model': base_model,
    'args': training_args,
    'eval_dataset': test,
    'tokenizer': tokenizer,
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
}
trainer_base = Trainer(**base_trainer_kwargs)

# Metrics dict for the base model on the held-out split
eval_base = trainer_base.evaluate()
eval_base

In [None]:
# Measure inference latency and tokens/sec for the base model
# Use the GPU when one is visible to torch, otherwise fall back to CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
base_model.to(device)

def measure_latency_and_throughput(model, tokenizer, texts, device, repeat=20):
    """Benchmark batched forward-pass latency and input-token throughput.

    Args:
        model: torch module called as `model(**inputs)`.
        tokenizer: callable returning an encoding that supports `.to(device)`
            and contains an `attention_mask` tensor.
        texts: iterable of inputs; each item is converted with `str()`.
        device: torch device (or device string) the inputs are moved to.
        repeat: number of timed forward passes (must be >= 1).

    Returns:
        (avg_latency, tokens_per_sec): mean seconds per forward pass over the
        whole batch, and non-padding input tokens processed per second.

    Raises:
        ValueError: if `texts` is empty or `repeat` is smaller than 1.
    """
    # ensure texts are Python strings
    texts = [str(t) for t in texts]
    if not texts:
        raise ValueError("texts must contain at least one item")
    if repeat < 1:
        raise ValueError("repeat must be >= 1")

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
    # Non-padding tokens actually fed to the model (respects truncation),
    # counted once from the batch instead of re-tokenizing every text.
    batch_token_count = int(inputs['attention_mask'].sum())

    def _sync():
        # CUDA kernels are launched asynchronously; without a sync the timer
        # would only measure launch overhead, not execution time.
        if torch.cuda.is_available() and torch.device(device).type == 'cuda':
            torch.cuda.synchronize()

    was_training = model.training
    model.eval()  # disable dropout so the timing reflects inference behavior
    try:
        with torch.no_grad():  # no autograd graph during inference
            # Warmup
            for _ in range(3):
                model(**inputs)
            _sync()
            # Measure
            t0 = time.perf_counter()
            for _ in range(repeat):
                model(**inputs)
            _sync()
            t_total = time.perf_counter() - t0
    finally:
        # Leave the model in whatever mode the caller had it in
        model.train(was_training)

    avg_latency = t_total / repeat
    tokens_per_sec = batch_token_count / avg_latency
    return avg_latency, tokens_per_sec

# Benchmark on the held-out texts; fall back to two training texts if the test split is empty
if len(test) > 0:
    sample_texts = test['text']
else:
    sample_texts = train['text'][:2]

base_latency, base_tps = measure_latency_and_throughput(
    model=base_model,
    tokenizer=tokenizer,
    texts=sample_texts,
    device=device,
)
{'latency_s': base_latency, 'tokens_per_sec': base_tps}

### 2) Apply LoRA (PEFT) and fine-tune on the mini dataset

In [None]:
# Create a fresh base model copy to apply LoRA to (avoid modifying the earlier 'base_model' used for eval)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# LoRA config (very small for demo)
# DistilBERT attention uses q_lin/k_lin/v_lin naming
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_lin", "k_lin", "v_lin"],  # DistilBERT attention layer names
    lora_dropout=0.1,
    bias="none",
    # DistilBERT's classification head has two linear layers. Listing both
    # keeps the whole (freshly initialized) head trainable instead of leaving
    # `pre_classifier` frozen at its random initialization.
    modules_to_save=["pre_classifier", "classifier"],
    task_type=TaskType.SEQ_CLS
)

peft_model = get_peft_model(model, lora_config)
print('PEFT/LoRA model created')

In [None]:
# Trainer for the LoRA-wrapped model: same arguments, collator and metrics as the base evaluation
lora_trainer_kwargs = {
    'model': peft_model,
    'args': training_args,
    'train_dataset': train,
    'eval_dataset': test,
    'tokenizer': tokenizer,
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**lora_trainer_kwargs)

# Fine-tune on the training split (very small; fast)
trainer.train()

# Metrics dict for the fine-tuned LoRA model on the held-out split
eval_lora = trainer.evaluate()
eval_lora

In [None]:
# Benchmark the LoRA model on the same texts and device as the base model
peft_model.to(device)
lora_latency, lora_tps = measure_latency_and_throughput(
    model=peft_model,
    tokenizer=tokenizer,
    texts=sample_texts,
    device=device,
)
{'latency_s': lora_latency, 'tokens_per_sec': lora_tps}

In [None]:
# Compare outputs on test set (detailed)
def get_preds(model, dataset):
    """Run the model over a dataset one example at a time.

    Args:
        model: torch module whose output exposes `.logits`; it is switched to
            eval mode by this call.
        dataset: indexable collection of dict rows holding 'labels' and any of
            'input_ids' / 'attention_mask'.

    Returns:
        (preds, labels): two lists of ints, the argmax class per example and
        the reference label per example, in dataset order.
    """
    model.eval()
    # Send inputs to wherever the model's weights live, rather than relying on
    # a notebook-global `device` that may be out of sync with the model.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    preds = []
    labels = []
    with torch.no_grad():
        for i in range(len(dataset)):
            example = dataset[i]  # index once; reused for inputs and the label
            batch = {
                key: torch.tensor([example[key]]).to(model_device)
                for key in ('input_ids', 'attention_mask')
                if key in example
            }
            logits = model(**batch).logits.cpu().numpy()[0]
            preds.append(int(np.argmax(logits)))
            labels.append(int(example['labels']))
    return preds, labels

# Collect per-example predictions for both models on the same test split
base_model.to(device)
base_preds, base_labels = get_preds(model=base_model, dataset=test)
lora_preds, lora_labels = get_preds(model=peft_model, dataset=test)

# Per-class precision/recall/F1; undefined scores are reported as 0
base_report = classification_report(base_labels, base_preds, zero_division=0)
print("Base classification report:\n", base_report)
lora_report = classification_report(lora_labels, lora_preds, zero_division=0)
print('\nLoRA classification report:\n', lora_report)

In [None]:
# Save results to JSON and CSV in Google Drive
results = {
    'base_eval': eval_base,
    'lora_eval': eval_lora,
    'base_latency_s': base_latency,
    'base_tokens_per_sec': base_tps,
    'lora_latency_s': lora_latency,
    'lora_tokens_per_sec': lora_tps,
    'test_size': len(test)
}

# Save to local results directory
with open('/content/mini_lora_benchmark/results/results.json', 'w') as f:
    json.dump(results, f, indent=2)

# Also produce a flat CSV summary
pd.DataFrame([{
    'model': 'base',
    'accuracy': eval_base.get('eval_accuracy', None),
    'f1': eval_base.get('eval_f1', None),
    'latency_s': base_latency,
    'tokens_per_sec': base_tps
}, {
    'model': 'lora',
    'accuracy': eval_lora.get('eval_accuracy', None),
    'f1': eval_lora.get('eval_f1', None),
    'latency_s': lora_latency,
    'tokens_per_sec': lora_tps
}]).to_csv('/content/mini_lora_benchmark/results/results.csv', index=False)

# Optional: Copy results to Google Drive
!cp -r /content/mini_lora_benchmark/results/* /content/drive/MyDrive/mini_lora_benchmark/results/

print('Saved results locally and to Google Drive')

## Notes and limitations
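- This demo uses a tiny dataset (12 samples) and only demonstrates the pipeline. Results are not statistically meaningful. Note also that the "base" model is `distilbert-base-uncased` with a newly initialized (untrained) classification head, so its accuracy/F1 is an untrained reference point rather than a tuned baseline.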
- Using DistilBERT for sequence classification shows how LoRA works on an encoder; for per-token generative latency (e.g., tokens/sec generated), use a causal LM (distilgpt2 or a small Mistral) in a follow-up.
- LoRA hyperparameters here are small to keep training quick; adjust `r`, `lora_alpha`, and `target_modules` for your workloads.
- In Colab, you'll likely get much better performance using the GPU runtime (Runtime → Change runtime type → GPU).