# Day 2: QLoRA Fine-Tuning

This notebook implements **QLoRA (Quantized LoRA)** - the most memory-efficient fine-tuning method.

**Method**: QLoRA combines 4-bit quantization with LoRA for maximum efficiency

**Model**: mistralai/Mistral-7B-v0.1 (4-bit quantized)

**Expected Time**: 1-2 hours

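**GPU Required**: T4 (15GB) - the 4-bit model itself uses ~4GB memory! Note that the batch sizes in Section 6 (16 for training, 32 for evaluation) are tuned for an A100 80GB; reduce them when running on a T4.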

**Target Accuracy**: 75-88% (similar to LoRA, 75% less memory)

## 1. Setup Environment

In [None]:
# Check GPU: confirm an NVIDIA GPU is attached to this runtime before going any further
!nvidia-smi

In [None]:
# Install dependencies (including bitsandbytes for quantization)
!pip install -q torch transformers accelerate peft bitsandbytes datasets evaluate scikit-learn pandas numpy wandb trl

In [None]:
# Import libraries
import torch
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report
import wandb

# Report the PyTorch build and, when a CUDA device is present, its name and total memory
cuda_ready = torch.cuda.is_available()
print(f"‚úÖ PyTorch version: {torch.__version__}")
print(f"‚úÖ CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"‚úÖ GPU: {gpu_name}")
    print(f"‚úÖ GPU Memory: {gpu_total_gb:.2f} GB")

## 2. Mount Google Drive and Load Data

In [None]:
# Mount Google Drive at /content/drive; the data and results paths used below live under it.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load data
data_path = '/content/drive/MyDrive/Colab Notebooks/llm-finetuning-showdown/processed'

train_df = pd.read_csv(f'{data_path}/train.csv')
val_df = pd.read_csv(f'{data_path}/val.csv')
test_df = pd.read_csv(f'{data_path}/test.csv')

with open(f'{data_path}/label_mapping.json', 'r') as f:
    label_info = json.load(f)

# Convert text labels to numeric IDs.
# Series.map() yields NaN for any label that is missing from the mapping, which
# would silently turn the column into floats. Fail here, naming the offending
# labels, instead of much later inside the training loop.
label_to_id = label_info['label_to_id']
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    mapped_labels = split_df['label'].map(label_to_id)
    unmapped_mask = mapped_labels.isna()
    if unmapped_mask.any():
        unknown_labels = sorted(split_df.loc[unmapped_mask, 'label'].astype(str).unique())
        raise ValueError(
            f"{split_name}.csv contains labels missing from label_mapping.json: {unknown_labels}"
        )
    split_df['label'] = mapped_labels.astype('int64')

print(f"‚úÖ Train samples: {len(train_df)}")
print(f"‚úÖ Val samples: {len(val_df)}")
print(f"‚úÖ Test samples: {len(test_df)}")
print(f"\n‚úÖ Number of categories: {label_info['num_labels']}")
print(f"\n‚úÖ Labels converted to numeric IDs")
# Upper bound of the hint is derived from the mapping rather than hard-coded
print(f"   First label (should be 0-{label_info['num_labels'] - 1}): {train_df['label'].iloc[0]}")

## 3. Initialize Weights & Biases

In [None]:
# Login to W&B
wandb.login()

# Initialize project. The config below is what W&B records for this run, so it
# has to mirror the values actually used further down in the notebook.
wandb.init(
    project="llm-finetuning-showdown",
    name="qlora-finetuning",
    config={
        "method": "qlora",
        "model": "mistralai/Mistral-7B-v0.1",
        "quantization": "4-bit",
        "task": "resume_classification",
        "num_labels": label_info['num_labels'],
        "learning_rate": 2e-4,
        # Matches per_device_train_batch_size in the training-arguments cell
        # (previously logged as 8 while the run trained with 16).
        "batch_size": 16,
        "epochs": 3,
        "lora_r": 8,
        "lora_alpha": 16
    }
)

## 4. Load Model with 4-bit Quantization + LoRA

In [None]:
model_name = "mistralai/Mistral-7B-v0.1"

print(f"Loading model with 4-bit quantization: {model_name}")

# Tokenizer: the EOS token doubles as the padding token, padding goes on the right
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4-bit quantization settings: NF4 weights, double quantization, bfloat16 compute
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

# Quantized sequence-classification model with one output per label
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    num_labels=label_info['num_labels'],
)
# The model config needs the same pad token id the tokenizer uses
base_model.config.pad_token_id = tokenizer.pad_token_id

# Prepare model for k-bit training (PEFT helper) before adapters are attached
base_model = prepare_model_for_kbit_training(base_model)

print(f"‚úÖ 4-bit quantized model loaded")
print(f"üíæ Memory footprint reduced by ~75%!")

In [None]:
# Configure LoRA (same as before): rank-8 adapters on the attention query/value projections
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"]
)

# Apply LoRA to quantized model
model = get_peft_model(base_model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()

# Take the counts from PEFT's own counter, which is what print_trainable_parameters()
# reports, so the figures below (and the ones saved to the results JSON later) agree
# with the line printed above. A plain sum of p.numel() can under-count 4-bit layers
# whose weights are stored packed - see the PEFT documentation.
trainable_params, total_params = model.get_nb_trainable_parameters()
trainable_percent = 100 * trainable_params / total_params

print(f"\n‚úÖ QLoRA Configuration Applied")
print(f"üìä Trainable parameters: {trainable_params:,} ({trainable_percent:.2f}%)")
print(f"üìä Total parameters: {total_params:,}")
print(f"üéØ Parameter reduction: {100 - trainable_percent:.2f}%")
print(f"üíæ Using 4-bit quantization + LoRA = Maximum efficiency!")

## 5. Prepare Dataset

In [None]:
# Tokenization function - NO PADDING HERE (let Trainer handle it)
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch, truncating to at most 512 tokens.

    No padding is applied here; batches are padded later by the data collator.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

def _to_model_ready(frame):
    """Turn one split's DataFrame into a tokenized, torch-formatted Hugging Face Dataset."""
    # Convert to Hugging Face Dataset and tokenize
    split = Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    # Clean up: the raw text column is no longer needed once it is tokenized
    split = split.remove_columns(['text'])
    # Rename 'label' to 'labels' (required by Trainer)
    split = split.rename_column('label', 'labels')
    # Set format for PyTorch (set_format works in place)
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return split

print("Tokenizing datasets...")
train_dataset = _to_model_ready(train_df)
val_dataset = _to_model_ready(val_df)
test_dataset = _to_model_ready(test_df)

print(f"‚úÖ Datasets tokenized and ready")
print(f"   Final columns: {train_dataset.column_names}")

## 6. Define Metrics and Training Arguments

In [None]:
# Compute metrics function
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into (per-class scores, true label ids). The predicted
    class is the argmax over axis 1. Returns a dict with accuracy and the
    support-weighted precision, recall and F1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        references, predicted_ids, average='weighted'
    )

    return dict(
        accuracy=accuracy_score(references, predicted_ids),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

# Training arguments - batch sizes optimized for A100 80GB (QLoRA uses even less memory)
#
# bf16 mixed precision is only requested when the attached GPU reports bf16 support.
# Hard-coding bf16=True can make TrainingArguments reject the configuration on GPUs
# without it (the notebook header targets a T4) - NOTE(review): confirm on the target runtime.
# fp16 stays off in every case, per the original guidance for 4-bit quantization.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
if not use_bf16:
    print("bf16 is not supported on this device - training without mixed precision")

training_args = TrainingArguments(
    output_dir="./results_qlora",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,  # Higher LR for QLoRA
    per_device_train_batch_size=16,  # Larger batch size for QLoRA on A100
    per_device_eval_batch_size=32,   # Larger batch size for QLoRA on A100
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,  # restore the checkpoint with the best validation accuracy
    metric_for_best_model="accuracy",
    greater_is_better=True,
    warmup_steps=50,
    fp16=False,  # Don't use fp16 with 4-bit quantization
    bf16=use_bf16,   # Use bfloat16 instead, when the GPU supports it
    report_to="wandb",
    run_name="qlora-finetuning"
)

print("‚úÖ Training arguments configured")
print("üöÄ Optimized for A100 80GB: batch_size=16, eval_batch_size=32")

## 7. Train Model with QLoRA

In [None]:
# Create data collator for dynamic padding (each batch is padded at collation time)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize Trainer
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Announce the run before the long-running call
for banner_line in (
    "üöÄ Starting QLoRA training...",
    f"üìä Training samples: {len(train_dataset)}",
    f"üìä Validation samples: {len(val_dataset)}",
    f"üíæ Using 4-bit quantization for maximum memory efficiency",
    f"‚è∞ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
):
    print(banner_line)

# Wall-clock timing around the training call only
start_time = time.time()
train_result = trainer.train()
training_time = time.time() - start_time
training_hours = training_time / 3600

print(f"\n‚úÖ QLoRA training completed!")
print(f"‚è∞ Training time: {training_hours:.2f} hours ({training_time:.2f} seconds)")
print(f"üìà Final training loss: {train_result.training_loss:.4f}")

## 8. Evaluate on Test Set

In [None]:
# Evaluate on test set
print("üß™ Evaluating on test set...")

# A single inference pass provides both the metrics and the raw predictions
# (previously the test set was run through the model twice: evaluate() then predict()).
# metric_key_prefix="eval" keeps the 'eval_*' keys that the cells below read from test_results.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
test_results = predictions.metrics
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

baseline_accuracy_pct = 73.0
test_accuracy_pct = test_results['eval_accuracy'] * 100

print("\n" + "="*50)
print("QLoRA FINE-TUNING RESULTS")
print("="*50)
print(f"Test Accuracy: {test_results['eval_accuracy']:.4f} ({test_accuracy_pct:.2f}%)")
print(f"Test F1-Score: {test_results['eval_f1']:.4f}")
print(f"Test Precision: {test_results['eval_precision']:.4f}")
print(f"Test Recall: {test_results['eval_recall']:.4f}")
print(f"\nTraining Time: {training_hours:.2f} hours")
print(f"Trainable Parameters: {trainable_percent:.2f}% of total")
print(f"Quantization: 4-bit (75% memory reduction)")
print(f"\nBaseline Accuracy: 73.00%")
# '+' in the format spec prints the correct sign either way (no more "+-1.23%")
print(f"Improvement over Baseline: {test_accuracy_pct - baseline_accuracy_pct:+.2f}%")

# Classification report
print("\nDetailed Classification Report:")
id_to_label = {v: k for k, v in label_info['label_to_id'].items()}
all_label_ids = list(range(label_info['num_labels']))
target_names = [id_to_label[i] for i in all_label_ids]
# Passing labels= explicitly keeps target_names aligned even when a class is absent
# from the test split or never predicted; zero_division=0 reports such classes as 0.
print(classification_report(
    true_labels,
    pred_labels,
    labels=all_label_ids,
    target_names=target_names,
    zero_division=0,
))

## 9. Save Results and QLoRA Adapter

In [None]:
# Save results to Google Drive
results_path = '/content/drive/MyDrive/Colab Notebooks/llm-finetuning-showdown'

# Summary of this run: test metrics, timing, parameter counts and the configuration used
qlora_results = {
    "method": "qlora",
    "model": model_name,
    "quantization": "4-bit NF4",
    "date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    "accuracy": float(test_results['eval_accuracy']),
    "f1_score": float(test_results['eval_f1']),
    "precision": float(test_results['eval_precision']),
    "recall": float(test_results['eval_recall']),
    "training_time_hours": float(training_hours),
    "training_time_seconds": float(training_time),
    "baseline_accuracy": 0.73,
    "improvement_over_baseline": float(test_results['eval_accuracy'] - 0.73),
    "total_parameters": total_params,
    "trainable_parameters": trainable_params,
    "trainable_percent": float(trainable_percent),
    "qlora_config": {
        "quantization": "4-bit",
        "quant_type": "nf4",
        "double_quant": True,
        "compute_dtype": "bfloat16",
        "r": 8,
        "lora_alpha": 16,
        "lora_dropout": 0.1,
        "target_modules": ["q_proj", "v_proj"]
    },
    "training_config": {
        "learning_rate": 2e-4,
        # Read from the arguments the run actually used; this was hard-coded to 8
        # although training ran with per_device_train_batch_size=16.
        "batch_size": training_args.per_device_train_batch_size,
        "epochs": 3,
        "max_length": 512
    }
}

with open(f'{results_path}/qlora_results.json', 'w') as f:
    json.dump(qlora_results, f, indent=2)

print(f"‚úÖ Results saved to: {results_path}/qlora_results.json")

# Save QLoRA adapter and the tokenizer side by side in one directory on Drive
adapter_dir = f'{results_path}/qlora_adapter'
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)
print(f"‚úÖ QLoRA adapter saved to: {adapter_dir}")
print(f"üì¶ Adapter size: ~10-50 MB (quantized base model not saved)")

# Log to W&B: final test metrics plus run-level summary figures, then close the run
test_accuracy = test_results['eval_accuracy']
final_summary = {
    "final_test_accuracy": test_accuracy,
    "final_test_f1": test_results['eval_f1'],
    "training_time_hours": training_hours,
    "improvement_over_baseline": test_accuracy - 0.73,
    "trainable_percent": trainable_percent,
    "memory_reduction": "75%",
}
wandb.log(final_summary)

wandb.finish()
print("\n‚úÖ QLoRA Fine-Tuning Complete!")

## 10. Final Comparison

**✅ All Three Methods Complete!**

Load your results from Google Drive and compare:

```python
import json

results_path = '/content/drive/MyDrive/Colab Notebooks/llm-finetuning-showdown'


def load_results(method):
    """Read `<method>_results.json` from the results folder on Drive."""
    with open(f'{results_path}/{method}_results.json', 'r') as f:
        return json.load(f)


baseline = load_results('baseline')
full_ft = load_results('full_ft')
lora = load_results('lora')
qlora = load_results('qlora')

# One summary line per method: accuracy in percent and hours spent training
separator = "=" * 60
print("\n" + separator)
print("FINAL COMPARISON")
print(separator)
print(f"Baseline:     {baseline['accuracy']*100:.2f}% | 0h training")
print(f"Full FT:      {full_ft['accuracy']*100:.2f}% | {full_ft['training_time_hours']:.2f}h training")
print(f"LoRA:         {lora['accuracy']*100:.2f}% | {lora['training_time_hours']:.2f}h training")
print(f"QLoRA:        {qlora['accuracy']*100:.2f}% | {qlora['training_time_hours']:.2f}h training")
```

**Next Steps:**
- Day 3: Create visualizations and write Medium article
- Day 4: Write arXiv paper
- Day 5: Publish everything!