# LLM Finetuning Quick Start - All-in-One Notebook

This notebook contains complete, executable code to get you started with LLM finetuning immediately.

**What's included:**
1. Environment setup
2. Data exploration
3. Baseline evaluation
4. Full finetuning (GPT-2)
5. LoRA finetuning

**Time:** 1-2 hours total

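**Just run all cells sequentially!** The notebook assumes a Google Colab runtime with a GPU enabled: it mounts Google Drive for checkpoints and places the models on CUDA.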

## Part 1: Setup (5 minutes)

In [None]:
# Clone repository
!git clone https://github.com/DS535/llm-finetuning-production.git
%cd llm-finetuning-production

In [None]:
# Check GPU: print the NVIDIA driver's device report for this runtime.
# Later cells place the model and inputs on "cuda", so a GPU should be listed here.
!nvidia-smi

In [None]:
# Install dependencies (5-10 minutes)
!pip install -q -r requirements.txt
print("âœ“ Dependencies installed")

In [None]:
# Mount Google Drive; training checkpoints are written under it by later cells
import os

from google.colab import drive

drive.mount('/content/drive')

# Parent directory of the Trainer output_dir paths used in the training cells
os.makedirs("/content/drive/MyDrive/llm_checkpoints", exist_ok=True)
print("✓ Drive mounted")

In [None]:
# Verify installation: report library versions and whether a GPU is visible
import transformers
import datasets
import peft
import torch

print(f"transformers: {transformers.__version__}")
print(f"datasets: {datasets.__version__}")
print(f"peft: {peft.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    # Later cells call .to("cuda") and train with fp16=True, so surface the problem now
    # instead of failing part-way through the notebook.
    print("WARNING: no CUDA device found - the model and training cells below require a GPU runtime")
print("\n✓ Setup complete!")

## Part 2: Data Exploration (10 minutes)

In [None]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np

# Fetch the train split of the Dolly-15k instruction dataset
print("Loading Dolly-15k...")
dolly = load_dataset("databricks/databricks-dolly-15k", split="train")

# Report the dataset size and show one raw record
n_examples = len(dolly)
first_example = dolly[0]
print(f"Loaded {n_examples:,} examples")
print(f"\nFirst example:\n{first_example}")

In [None]:
# Count examples per instruction category and plot the distribution
from collections import Counter

categories = Counter(dolly['category'])

fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(categories.keys(), categories.values())
# Slant the category names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Instruction Categories')
fig.tight_layout()
plt.show()

# Five most frequent categories, largest first
print("\nTop categories:")
for name, n_rows in categories.most_common(5):
    print(f"  {name}: {n_rows:,}")

In [None]:
# Tokenization analysis: how many GPT-2 tokens does an instruction + response pair use?
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# First 1000 examples (taken in dataset order, not a random sample)
sample = dolly.select(range(1000))
token_counts = [
    len(tokenizer.encode(ex['instruction'] + ' ' + ex['response']))
    for ex in sample
]

# Histogram of pair lengths, with the 512-token truncation length used later marked in red
plt.hist(token_counts, bins=50)
plt.axvline(512, color='red', linestyle='--', label='512 limit')
plt.xlabel('Token Count')
plt.ylabel('Frequency')
plt.title(f'Token Distribution (Mean: {np.mean(token_counts):.0f})')
plt.legend()
plt.show()

# Share of the inspected pairs that would be truncated at 512 tokens
pct_over_512 = sum(1 for x in token_counts if x > 512) / len(token_counts) * 100
print(f"\nExamples > 512 tokens: {pct_over_512:.1f}%")
print("✓ Data exploration complete")

## Part 3: Baseline Evaluation (10 minutes)

In [None]:
from transformers import AutoModelForCausalLM
from tqdm import tqdm

# Fetch pretrained GPT-2 and move it onto the GPU
print("Loading GPT-2...")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model = model.to("cuda")

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

n_params = sum(weights.numel() for weights in model.parameters())
print(f"Parameters: {n_params:,}")

In [None]:
# Zero-shot check: sample a continuation from the untouched model for a few prompts
test_prompts = [
    "The capital of France is",
    "To learn Python, you should",
    "The best way to stay healthy is"
]

# Sampling settings shared by every prompt (max_length counts the prompt tokens too)
generation_kwargs = dict(max_length=50, do_sample=True, temperature=0.7)

print("Zero-shot generation:\n")
for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, **generation_kwargs)
    completion = tokenizer.decode(outputs[0])
    print(f"Prompt: {prompt}")
    print(f"Output: {completion}\n")

In [None]:
# Compute baseline perplexity of the pretrained model on the first 100 Dolly responses
model.eval()
test_texts = [ex['response'] for ex in dolly.select(range(100))]

total_loss = 0.0
total_tokens = 0

with torch.no_grad():
    for text in tqdm(test_texts, desc="Computing perplexity"):
        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to("cuda")
        # With labels == input_ids the model shifts the labels internally, so the
        # returned loss is the mean over (sequence length - 1) predicted tokens.
        n_predicted = enc["input_ids"].size(1) - 1
        if n_predicted < 1:
            # Nothing to predict for empty or one-token texts; the mean over zero
            # targets would be NaN and would poison the running total.
            continue
        outputs = model(**enc, labels=enc["input_ids"])
        # Undo the per-sequence mean so every predicted token carries equal weight
        total_loss += outputs.loss.item() * n_predicted
        total_tokens += n_predicted

# Perplexity = exp(mean negative log-likelihood per predicted token)
baseline_perplexity = np.exp(total_loss / total_tokens)
print(f"\nBaseline Perplexity: {baseline_perplexity:.2f}")

# Release the baseline model before a fresh copy is loaded for finetuning
del model
torch.cuda.empty_cache()
print("✓ Baseline established")

## Part 4: Full Finetuning GPT-2 (30-45 minutes)

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Start again from the pretrained GPT-2 weights
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Keep the run short: first 5000 TinyStories rows, 10% held out with a fixed seed
tiny_stories = load_dataset("roneneldan/TinyStories", split="train[:5000]")
tiny_split = tiny_stories.train_test_split(test_size=0.1, seed=42)

n_train = len(tiny_split['train'])
n_val = len(tiny_split['test'])
print(f"Train: {n_train}, Val: {n_val}")

In [None]:
# Tokenize
def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized = tiny_split.map(tokenize, batched=True, remove_columns=["text"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("âœ“ Data tokenized")

In [None]:
# Configure training
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/llm_checkpoints/gpt2_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_steps=100,
    logging_steps=50,
    eval_steps=200,
    save_steps=200,
    evaluation_strategy="steps",
    fp16=True,
    report_to="none",
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator
)

print("âœ“ Trainer configured")

In [None]:
# Train! Runs the full-parameter finetuning configured in the previous cell.
print("Starting training...")
trainer.train()
print("\n✓ Training complete!")

In [None]:
# Sample short stories from the finetuned model
model.eval()

story_prompts = [
    "Once upon a time",
    "The little girl",
    "In a magical forest"
]

# Sampling settings shared by every prompt (max_length counts the prompt tokens too)
story_generation_kwargs = dict(max_length=100, do_sample=True, temperature=0.7)

print("\nFinetuned generation:\n")
for prompt in story_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, **story_generation_kwargs)
    story = tokenizer.decode(outputs[0])
    print(f"Prompt: {prompt}")
    print(f"Story: {story}\n")

In [None]:
# Save model and tokenizer together so the directory can be reloaded on its own
import gc

save_path = "/content/drive/MyDrive/llm_models/gpt2_tinystories"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"✓ Model saved to {save_path}")

# Free GPU memory before the LoRA section. The Trainer was constructed with this
# model, so dropping only `model` would leave the weights referenced and allocated.
del model, trainer
gc.collect()
torch.cuda.empty_cache()

## Part 5: LoRA Finetuning (20-30 minutes)

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# Load model for LoRA
base_model = AutoModelForCausalLM.from_pretrained("gpt2")

# Configure LoRA
lora_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=32,
    # Names are matched by suffix: "c_attn" is the attention QKV projection, while
    # "c_proj" matches the output projection of both the attention and MLP blocks.
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    # GPT-2 stores these weights as (fan_in, fan_out) Conv1D layers; stating it
    # explicitly avoids relying on PEFT's automatic correction and its warning.
    fan_in_fan_out=True,
    task_type=TaskType.CAUSAL_LM
)

# Apply LoRA and report how many parameters remain trainable
model_lora = get_peft_model(base_model, lora_config)
model_lora.print_trainable_parameters()

print("✓ LoRA applied")

In [None]:
# Train with LoRA: same data and schedule as the full finetuning run, but no
# evaluation set is passed and the learning rate is higher.
training_args_lora = TrainingArguments(
    output_dir="/content/drive/MyDrive/llm_checkpoints/gpt2_lora",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,  # Higher LR for LoRA
    warmup_steps=100,
    logging_steps=50,
    fp16=True,
    report_to="none",
    save_total_limit=2
)

trainer_lora = Trainer(
    model=model_lora,
    args=training_args_lora,
    train_dataset=tokenized["train"],
    data_collator=data_collator
)

print("Starting LoRA training...")
trainer_lora.train()
print("\n✓ LoRA training complete!")

In [None]:
# Test LoRA model on the same story prompts as the fully finetuned model
model_lora.eval()

print("\nLoRA generation:\n")
for prompt in story_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # Use the same decoding settings as the full-finetuning test cell so the two
    # sets of stories are comparable (greedy decoding here, sampling there, was not).
    outputs = model_lora.generate(**inputs, max_length=100, do_sample=True, temperature=0.7)
    print(f"Prompt: {prompt}")
    print(f"Story: {tokenizer.decode(outputs[0])}\n")

In [None]:
# Save LoRA adapters to Drive
lora_path = "/content/drive/MyDrive/llm_models/gpt2_lora_adapters"
model_lora.save_pretrained(lora_path)
print(f"✓ LoRA adapters saved to {lora_path}")
print("  (Only adapters saved - much smaller than full model!)")

## Summary & Next Steps

In [None]:
# Closing summary: what was covered, where the artifacts were saved, and what to read next
print("=" * 60)
print("QUICK START COMPLETE!")
print("=" * 60)

print("\n✓ What you learned:")
print("  1. Environment setup for LLM finetuning")
print("  2. Dataset loading and exploration")
print("  3. Baseline evaluation")
print("  4. Full parameter finetuning")
print("  5. LoRA (parameter-efficient finetuning)")

# These are the same paths the save cells wrote to
print("\n✓ Models saved:")
print("  - /content/drive/MyDrive/llm_models/gpt2_tinystories")
print("  - /content/drive/MyDrive/llm_models/gpt2_lora_adapters")

print("\n📚 Next steps:")
print("  - Notebook 06: LoRA on larger models (Llama, Phi-2)")
print("  - Notebook 07: QLoRA on Mistral-7B (4-bit quantization)")
print("  - Notebook 09: Production instruction tuning")
print("  - Notebook 14: Model comparison and evaluation")

print("\n" + "=" * 60)
print("Great job! You're now ready for advanced finetuning!")
print("=" * 60)