# Colab LoRA Fine-Tuning Tool

This notebook fine-tunes open-source chat models (e.g., `meta-llama/Llama-2-7b-chat-hf`, `mistralai/Mistral-7B-Instruct-v0.2`) using standard LoRA adapters without 4-bit quantization. It targets users who have at least one 16–24 GB GPU available in Google Colab and prefer a simpler, full-precision setup compared to QLoRA.


## Runtime checklist

1. Go to **Runtime ▸ Change runtime type** and choose `T4` (free) or `L4/A100` (Colab Pro/Pro+).
2. Ensure the **Hardware accelerator** is set to GPU.
3. After connecting, run the cell below to confirm that a GPU is attached to the runtime and to see how much memory it has.

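> Full-precision LoRA consumes more VRAM than QLoRA because the base weights are not quantized: a 7B-parameter model needs roughly 14 GB for its 16-bit weights alone. If you only have ~16 GB, prefer 7B models with modest batch sizes.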


In [None]:
# Show the attached GPU model, driver version, and current memory usage.
!nvidia-smi


In [None]:
%%capture
# Install the training stack at pinned versions (wandb is the only unpinned package).
# NOTE: %%capture hides pip's output, including any install errors.
%pip install -U accelerate==0.30.1 datasets==2.19.1 evaluate==0.4.2 huggingface_hub==0.24.5 peft==0.11.1 sentencepiece==0.1.99 transformers==4.44.2 trl==0.9.4 wandb


In [None]:
from getpass import getpass

from huggingface_hub import login

# Read the token with getpass rather than input(): input() echoes what is typed
# into the cell output, which is stored with the notebook when it is saved or shared.
login(
    token=getpass("Paste your Hugging Face access token: ").strip(),
    add_to_git_credential=True,
)


## Data ingestion options

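- **Upload `.txt` files** to Colab and point `cfg.text_folder` to `/content/data`. The local loader reads only `*.txt` files (recursively), so convert `.json` or other formats to plain text first.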
- **Mount Google Drive** for larger corpora:
  ```python
  from google.colab import drive
  drive.mount('/content/drive')
  ```
- **Reference a Hugging Face dataset** (e.g., `tatsu-lab/alpaca`) by setting `cfg.dataset_source = "hf_dataset"` and `cfg.hf_dataset` to the repo id.

The helper below normalizes data into `instruction`, `response`, and optional `system` fields before training.


In [None]:
from dataclasses import asdict, dataclass
from pathlib import Path


@dataclass
class Config:
    """Central settings for data preparation, LoRA training, and output paths."""

    # --- Data and run settings ---
    # Not referenced elsewhere in the visible notebook cells.
    project_name: str = "lora-full-precision"
    # Model id passed to the tokenizer and model `from_pretrained` calls.
    base_model: str = "meta-llama/Llama-2-7b-chat-hf"
    # Selects the loader used by build_dataset.
    dataset_source: str = "text_folder"  # text_folder | hf_dataset
    # Folder searched recursively for *.txt files by the local loader.
    text_folder: str = "/content/data"
    # Dataset repo id passed to load_dataset when dataset_source is "hf_dataset".
    hf_dataset: str | None = None
    # When set, rows are randomly sampled down to at most this many.
    max_samples: int | None = None
    # Chunk window size, counted in whitespace-separated words (not tokenizer tokens).
    chunk_tokens: int = 1024
    # Number of words shared between consecutive chunk windows.
    chunk_overlap: int = 128
    # Default system message for rows and chat calls that do not supply one.
    system_prompt: str = "You are a helpful assistant."
    # Destination for checkpoints, the saved adapter/tokenizer, and the merged model.
    output_dir: str = "/content/lora-output"
    # When set, exported as WANDB_PROJECT and W&B reporting is enabled.
    wandb_project: str | None = None

    # --- Training hyperparameters (forwarded to TrainingArguments / SFTTrainer / LoraConfig) ---
    # Passed as per_device_train_batch_size.
    micro_batch_size: int = 1
    gradient_accumulation_steps: int = 8
    # Passed as num_train_epochs.
    epochs: float = 3.0
    learning_rate: float = 1e-4
    warmup_ratio: float = 0.03
    weight_decay: float = 0.0
    # Passed as max_seq_length to SFTTrainer.
    cutoff_len: int = 2048
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    # Used for row sampling, dataset shuffling, and the train/test split.
    seed: int = 42


# Instantiate the default configuration and print it so the active settings
# are visible in the cell output.
cfg = Config()
print(asdict(cfg))


In [None]:
import random
import re

import pandas as pd
from datasets import Dataset, load_dataset


def _normalize(text: str) -> str:
    text = text.replace("\r", " ").strip()
    return re.sub(r"\s+", " ", text)


def _chunk_words(text: str, chunk_tokens: int, overlap: int) -> list[str]:
    words = text.split()
    if not words:
        return []
    step = max(chunk_tokens - overlap, 1)
    chunks = []
    for start in range(0, len(words), step):
        segment = words[start:start + chunk_tokens]
        if len(segment) < 32:
            continue
        chunks.append(" ".join(segment))
    return chunks or [" ".join(words[:chunk_tokens])]


def _load_local_texts(folder: str, cfg: Config) -> list[dict]:
    """Read every ``*.txt`` file under *folder* and turn it into training rows.

    Each file is whitespace-normalized and split into word chunks; every chunk
    becomes one row whose instruction names the source file (by stem), whose
    response is the chunk, and whose system message is ``cfg.system_prompt``.

    Args:
        folder: Directory searched recursively for ``*.txt`` files.
        cfg: Supplies ``chunk_tokens``, ``chunk_overlap`` and ``system_prompt``.

    Returns:
        A list of ``{"instruction", "response", "system"}`` dicts; empty when
        the folder holds no ``*.txt`` files.

    Raises:
        FileNotFoundError: If *folder* is not an existing directory.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    folder_path = Path(folder)
    if not folder_path.is_dir():
        # A missing folder would otherwise silently yield an empty dataset.
        raise FileNotFoundError(f"Text folder not found: {folder_path}")

    rows = []
    # Sort the paths: rglob order depends on the filesystem, and seeded
    # sampling downstream is only reproducible if the row order is stable.
    for path in sorted(folder_path.rglob("*.txt")):
        text = path.read_text(encoding="utf-8")
        instruction = f"Answer using {path.stem} context."
        rows.extend(
            {
                "instruction": instruction,
                "response": chunk,
                "system": cfg.system_prompt,
            }
            for chunk in _chunk_words(_normalize(text), cfg.chunk_tokens, cfg.chunk_overlap)
        )
    return rows


def _load_hf_dataset(repo_id: str, cfg: Config) -> list[dict]:
    """Load the ``train`` split of a Hub dataset and map it to training rows.

    Datasets with ``instruction`` and ``output`` columns (Alpaca layout) use
    those columns, with a non-empty ``input`` column appended to the
    instruction. Any other dataset falls back to ``instruction`` (default
    "Summarize the passage:") and ``text``/``response``.

    Args:
        repo_id: Dataset id passed to ``load_dataset``.
        cfg: Supplies the default ``system_prompt``.

    Returns:
        A list of ``{"instruction", "response", "system"}`` dicts with
        whitespace-normalized instruction and response text.
    """

    def _as_text(value) -> str:
        # str(None) would yield the literal "None", which is not blank and so
        # would slip through build_dataset's empty-row filter.
        return "" if value is None else str(value)

    ds = load_dataset(repo_id, split="train")
    # The schema belongs to the dataset, not to each row, so decide it once.
    alpaca_style = {"instruction", "output"}.issubset(ds.column_names)

    rows = []
    for row in ds:
        if alpaca_style:
            instruction = _as_text(row["instruction"])
            # Alpaca rows may carry the material to act on in `input`; without
            # it the instruction alone is often unanswerable.
            extra = _as_text(row.get("input"))
            if extra.strip():
                instruction = f"{instruction} {extra}"
            response = row["output"]
        else:
            instruction = row.get("instruction", "Summarize the passage:")
            response = row.get("text", row.get("response", ""))
        rows.append({
            "instruction": _normalize(_as_text(instruction)),
            "response": _normalize(_as_text(response)),
            # Fall back to the default when the column is missing or empty.
            "system": row.get("system") or cfg.system_prompt,
        })
    return rows


def build_dataset(cfg: Config) -> Dataset:
    """Build the instruction dataset selected by ``cfg.dataset_source``.

    Rows with a blank instruction or response are removed first; if
    ``cfg.max_samples`` is set, the remaining rows are then sampled down to at
    most that many using ``cfg.seed``.

    Args:
        cfg: Notebook configuration (source selection, sampling, seed).

    Returns:
        A ``Dataset`` with ``instruction``, ``response`` and ``system`` columns.

    Raises:
        ValueError: If ``dataset_source`` is unknown, if ``"hf_dataset"`` is
            selected without ``cfg.hf_dataset``, or if no usable rows remain.
    """
    if cfg.dataset_source == "hf_dataset":
        if not cfg.hf_dataset:
            # Fail loudly rather than silently training on the local folder.
            raise ValueError('cfg.dataset_source is "hf_dataset" but cfg.hf_dataset is not set')
        rows = _load_hf_dataset(cfg.hf_dataset, cfg)
    elif cfg.dataset_source == "text_folder":
        rows = _load_local_texts(cfg.text_folder, cfg)
    else:
        raise ValueError(
            f"Unknown cfg.dataset_source {cfg.dataset_source!r}; expected 'text_folder' or 'hf_dataset'"
        )

    # Filter before sampling so max_samples counts usable rows only.
    rows = [r for r in rows if r["instruction"].strip() and r["response"].strip()]
    if not rows:
        raise ValueError("No usable rows found; check the data source settings in cfg")

    if cfg.max_samples:
        # A private Random instance gives the same draw as seeding the module
        # RNG, without resetting global random state for the rest of the notebook.
        rows = random.Random(cfg.seed).sample(rows, min(cfg.max_samples, len(rows)))

    dataset = Dataset.from_pandas(pd.DataFrame(rows))
    print(f"Dataset has {len(dataset)} rows after cleaning")
    return dataset


# Build the dataset from the active config and preview the first two rows.
dataset = build_dataset(cfg)
dataset[:2]


## Prompt templating

We build chat prompts using the tokenizer's `chat_template` when available so that LoRA adapters stay aligned with the base instruction format.


In [None]:
from transformers import AutoTokenizer


# Load the base model's tokenizer, requesting the non-fast implementation.
tokenizer = AutoTokenizer.from_pretrained(cfg.base_model, use_fast=False)
# Reuse EOS as the padding token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def build_prompt(example: dict) -> dict:
    """Render one dataset row as a single training string.

    Uses the tokenizer's chat template when one is set; otherwise falls back
    to a plain ``[SYSTEM]`` / ``[USER]`` / ``[ASSISTANT]`` layout. The row's
    own ``system`` value is used when present and non-empty, else
    ``cfg.system_prompt``.

    Returns:
        ``{"text": <rendered prompt>}``.
    """
    # NOTE(review): some chat templates may reject a "system" role; confirm
    # for the chosen base model.
    system_text = example.get("system") or cfg.system_prompt
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": example["instruction"]},
        {"role": "assistant", "content": example["response"]},
    ]

    if not tokenizer.chat_template:
        fallback = (
            f"[SYSTEM]\n{messages[0]['content']}\n\n"
            f"[USER]\n{messages[1]['content']}\n\n"
            f"[ASSISTANT]\n{messages[2]['content']}"
        )
        return {"text": fallback}

    rendered = tokenizer.apply_chat_template(messages, tokenize=False)
    return {"text": rendered}


# Render every row into a single "text" column, dropping the original columns.
processed_dataset = dataset.map(build_prompt, remove_columns=dataset.column_names)
processed_dataset = processed_dataset.shuffle(seed=cfg.seed)
# Hold out 5% of the rows for evaluation; the seed keeps the split repeatable.
splits = processed_dataset.train_test_split(test_size=0.05, seed=cfg.seed)
splits


## LoRA training

Without quantization, we load the base model in bf16 on GPU (fp32 when no GPU is available), attach LoRA adapters to the attention and MLP projection modules, and fine-tune via `trl.SFTTrainer`. Keep `micro_batch_size` low to stay within VRAM limits.


In [None]:
import math
import os

import torch
from peft import LoraConfig
from transformers import AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer

os.makedirs(cfg.output_dir, exist_ok=True)

# Derive W&B reporting from cfg on every run instead of reading it back from
# the environment: re-running this cell after setting cfg.wandb_project must
# not pick up a stale WANDB_DISABLED left behind by an earlier run.
if cfg.wandb_project:
    os.environ["WANDB_PROJECT"] = cfg.wandb_project
    os.environ.pop("WANDB_DISABLED", None)
    report_to = ["wandb"]
else:
    os.environ["WANDB_DISABLED"] = "true"
    report_to = []

# bf16 whenever CUDA is available, fp32 otherwise. float16 is never selected,
# so the fp16 flag passed to TrainingArguments below is always False.
# NOTE(review): GPUs without native bf16 support (e.g. T4) may be slow or may
# fail with this setting; confirm on the target runtime.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(
    cfg.base_model,
    torch_dtype=torch_dtype,
    device_map="auto",
)
# Trade compute for memory; the KV cache is switched off alongside it for training.
model.gradient_checkpointing_enable()
model.config.use_cache = False

# LoRA adapters on the attention and MLP projection layers.
peft_config = LoraConfig(
    r=cfg.lora_r,
    lora_alpha=cfg.lora_alpha,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=cfg.lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

training_args = TrainingArguments(
    output_dir=cfg.output_dir,
    per_device_train_batch_size=cfg.micro_batch_size,
    gradient_accumulation_steps=cfg.gradient_accumulation_steps,
    num_train_epochs=cfg.epochs,
    learning_rate=cfg.learning_rate,
    warmup_ratio=cfg.warmup_ratio,
    weight_decay=cfg.weight_decay,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    # `eval_strategy` replaces the deprecated `evaluation_strategy` name.
    eval_strategy="steps",
    eval_steps=100,
    bf16=torch_dtype == torch.bfloat16,
    fp16=torch_dtype == torch.float16,
    max_grad_norm=0.3,
    report_to=report_to,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    dataset_text_field="text",
    max_seq_length=cfg.cutoff_len,
    packing=True,
    args=training_args,
)

trainer.train()
# Persist the trained model artifacts and the tokenizer side by side in output_dir.
trainer.save_model(cfg.output_dir)
tokenizer.save_pretrained(cfg.output_dir)


In [None]:
from torch.utils.data import DataLoader


def compute_perplexity(eval_dataset, max_batches: int = 32) -> float:
    """Estimate perplexity of the global ``model`` on held-out text.

    Perplexity is ``exp`` of the token-weighted mean negative log-likelihood.
    Averaging per-example ``exp(loss)`` values instead overstates the result
    and lets a single outlier dominate it.

    Args:
        eval_dataset: Dataset exposing a ``"text"`` column of strings.
        max_batches: Maximum number of examples to score (one per forward pass).

    Returns:
        The perplexity over the scored examples.

    Raises:
        ValueError: If no example with at least two tokens was scored.
    """
    model.eval()
    total_nll = 0.0
    total_predictions = 0
    for idx, text in enumerate(eval_dataset["text"]):
        if idx >= max_batches:
            break
        # Truncate to the training sequence length so long rows do not exhaust memory.
        encoded = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=cfg.cutoff_len,
        ).to(model.device)
        # The causal-LM loss is a mean over the n - 1 next-token predictions,
        # so weight it by that count to recover the example's summed NLL.
        predictions = encoded["input_ids"].shape[-1] - 1
        if predictions < 1:
            continue
        with torch.no_grad():
            outputs = model(**encoded, labels=encoded["input_ids"])
        total_nll += outputs.loss.item() * predictions
        total_predictions += predictions

    if total_predictions == 0:
        raise ValueError("compute_perplexity needs at least one example with two or more tokens")
    return math.exp(total_nll / total_predictions)


# Score at most 32 held-out examples and report the result.
perplexity = compute_perplexity(splits["test"], max_batches=32)
print(f"Approximate perplexity: {perplexity:.2f}")


In [None]:
def chat(prompt: str, system: str | None = None, max_new_tokens: int = 512) -> str:
    """Generate a sampled reply to *prompt* from the global ``model``.

    Args:
        prompt: The user message.
        system: Optional system message; defaults to ``cfg.system_prompt``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion only (prompt tokens and special tokens removed).
    """
    messages = [
        {"role": "system", "content": system or cfg.system_prompt},
        {"role": "user", "content": prompt},
    ]
    if tokenizer.chat_template:
        template = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    else:
        # Mirror the plain-text layout build_prompt uses for training when the
        # tokenizer has no chat template, ending where the assistant turn begins.
        template = (
            f"[SYSTEM]\n{messages[0]['content']}\n\n"
            f"[USER]\n{messages[1]['content']}\n\n"
            "[ASSISTANT]\n"
        )

    # Do not rely on an earlier cell having switched modes: eval mode turns off
    # dropout, and together with use_cache=True lets generation use the KV cache.
    model.eval()
    inputs = tokenizer(template, return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Keep only the tokens produced after the prompt.
    output_tokens = generated[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(output_tokens, skip_special_tokens=True)


# Smoke-test generation with a sample prompt.
chat("Summarize the three biggest takeaways from our training data.")


In [None]:
from peft import AutoPeftModelForCausalLM

MERGED_DIR = Path(cfg.output_dir) / "merged"
MERGED_DIR.mkdir(parents=True, exist_ok=True)

# Reload the adapter on top of its base model in the dtype used for training.
# Without torch_dtype the base weights load in the fp32 default, doubling
# memory use and the size of the merged checkpoint on disk.
# NOTE(review): the training `model` is still resident at this point; free it
# first if this reload runs out of GPU memory.
merged_model = AutoPeftModelForCausalLM.from_pretrained(
    cfg.output_dir,
    torch_dtype=torch_dtype,
    device_map="auto",
)
# Fold the LoRA deltas into the base weights and drop the adapter wrappers.
merged_model = merged_model.merge_and_unload()
merged_model.save_pretrained(MERGED_DIR, safe_serialization=True)
tokenizer.save_pretrained(MERGED_DIR)
print("Merged checkpoint saved to", MERGED_DIR)

# Flip to True to upload the merged model and tokenizer to a private Hub repo.
push_to_hub = False
if push_to_hub:
    repo_id = input("Target HF repo (username/project): ").strip()
    merged_model.push_to_hub(repo_id, private=True)
    tokenizer.push_to_hub(repo_id, private=True)


## Next steps

- Adjust the config block (`cfg`) for your dataset, model, and hyperparameters before running.
- Monitor GPU memory via `nvidia-smi` while training; reduce `micro_batch_size` or `cutoff_len` if you hit OOM.
- Evaluate the resulting adapter on task-specific prompts and log findings (W&B, TensorBoard, etc.).
- Deploy adapters in production by loading them with `PeftModel.from_pretrained` inside your inference stack (vLLM, TGI, etc.), or use the merged checkpoint saved above.
