In [1]:
!pip install -U "transformers>=4.42.0" "datasets>=2.18.0" "accelerate>=0.30.0" "bitsandbytes>=0.43.0" "peft>=0.11.1" "trl>=0.9.6" sentencepiece

Collecting transformers>=4.42.0
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting datasets>=2.18.0
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate>=0.30.0
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes>=0.43.0
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting peft>=0.11.1
  Downloading peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting trl>=0.9.6
  Downloading trl-0.22.2-py3-none-any.whl.metadata (11 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers>=4.42.0)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers>=4.42.0)
  Downloading regex-2025.9.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28

In [9]:
!pip install -q protobuf

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [7]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import os
import random
import torch
from dataclasses import dataclass
from typing import Dict, List, Optional

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [26]:
# --------------------------
# Config (edit to taste)
# --------------------------
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
DATASET_NAME = "yahma/alpaca-cleaned"
OUTPUT_DIR = "./mistral7b-instruct-alpaca-lora"
SEED = 42

MAX_SEQ_LEN = 1024                 # Reduce if you hit OOM; raise if you have VRAM headroom
TRAIN_BATCH_SIZE = 1               # per-device train batch size (QLoRA, single GPU)
GRADIENT_ACCUM_STEPS = 16
EVAL_BATCH_SIZE = 1
NUM_TRAIN_EPOCHS = 1.0             # Increase for better quality (2-3+)
LEARNING_RATE = 2e-4
LOGGING_STEPS = 20
SAVE_STEPS = 500
WARMUP_RATIO = 0.03

LORA_R = 1
LORA_ALPHA = 2
LORA_DROPOUT = 0.05
TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]  # typical for Mistral/LLaMA-class models

In [27]:
# Prompts for quick A/B comparison later
TEST_PROMPTS = [
    "Write a short, friendly email to thank a colleague for helping you debug a tricky issue.",
    "Explain the difference between supervised and unsupervised learning to a beginner.",
    "Give me three creative uses of a paperclip."
]


In [28]:
# --------------------------
# Utilities / Repro
# --------------------------
def set_seed(seed: int = 42):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

In [29]:
# --------------------------
# Load tokenizer & base model (4-bit for QLoRA)
# --------------------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Mistral models often have no pad token configured—align to eos to avoid warnings
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Preparing 4-bit config...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
)
print("Loading base model in 4-bit (QLoRA)...")

Loading tokenizer...
Preparing 4-bit config...
Loading base model in 4-bit (QLoRA)...


In [30]:

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
)
# Enable gradient checkpointing for memory savings
base_model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [37]:
# --------------------------
# Dataset: load & format
# --------------------------
print("Loading dataset:", DATASET_NAME)
ds = load_dataset(DATASET_NAME)

# alpaca-cleaned has fields: instruction, input, output
# We'll convert each row into a single string that contains a "User:" prompt and an "Assistant:" target.
# This keeps things simple for SFT; the model learns to continue from Assistant: ...
def format_sample(example: Dict) -> str:
    instr = example.get("instruction", "").strip()
    ipt = example.get("input", "").strip()
    output = example.get("output", "").strip()

    if ipt:
        user = f"{instr}\n\nInput:\n{ipt}"
    else:
        user = instr

    # Simple, consistent pattern:
    # User: <...>\nAssistant: <...>
    # During training, we feed "User: ...\nAssistant: {label tokens}" so the model learns the completion.
    prompt = f"User:\n{user}\n\nAssistant:\n{output}"
    return prompt

print("Formatting dataset...")
# train_dataset = ds["train"].shuffle(seed=SEED).map(
#     lambda x: {"text": format_sample(x)},
#     remove_columns=ds["train"].column_names,
# )

train_dataset = ds["train"].map(
    lambda x: {"text": format_sample(x)},
    remove_columns=ds["train"].column_names,
)

# Subsample dataset for quicker training (~30 min)
MAX_SAMPLES = 2000   # try 1000 if you want even faster
train_dataset = train_dataset.shuffle(seed=SEED).select(range(MAX_SAMPLES))

Loading dataset: yahma/alpaca-cleaned
Formatting dataset...


Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [38]:
# --------------------------
# LoRA config
# --------------------------
print("Setting up LoRA...")
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    task_type="CAUSAL_LM",
    bias="none",
)

Setting up LoRA...


In [39]:
# --------------------------
# Training Args
# --------------------------
print("Preparing training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=LOGGING_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    fp16=not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()),
    optim="paged_adamw_8bit",   # paged optimizer is good with QLoRA
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    report_to="none",
)

Preparing training arguments...


In [40]:
from transformers import DataCollatorForLanguageModeling

In [41]:
# --------------------------
# SFT Trainer
# --------------------------
print("Initializing SFTTrainer...")
def formatting_func(example):
    # We already stored the full prompt-response in "text"
    return example["text"]

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    formatting_func=formatting_func,  
    peft_config=peft_config,
    args=training_args,
    processing_class=tokenizer, 
    data_collator=data_collator
)

Initializing SFTTrainer...




Applying formatting function to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [42]:
# --------------------------
# Train
# --------------------------
print("Starting training...")
trainer.train()

Starting training...


Step,Training Loss
20,1.1335
40,0.9996
60,0.9583
80,0.9986
100,0.931
120,0.9445


TrainOutput(global_step=125, training_loss=0.9924676818847656, metrics={'train_runtime': 959.2874, 'train_samples_per_second': 2.085, 'train_steps_per_second': 0.13, 'total_flos': 1.6037281592205312e+16, 'train_loss': 0.9924676818847656, 'entropy': 1.026659072190523, 'num_tokens': 375593.0, 'mean_token_accuracy': 0.719572489708662, 'epoch': 1.0})

In [43]:
# --------------------------
# Save LoRA adapter
# --------------------------
print("Saving LoRA adapter to:", OUTPUT_DIR)
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Saving LoRA adapter to: ./mistral7b-instruct-alpaca-lora


('./mistral7b-instruct-alpaca-lora/tokenizer_config.json',
 './mistral7b-instruct-alpaca-lora/special_tokens_map.json',
 './mistral7b-instruct-alpaca-lora/chat_template.jinja',
 './mistral7b-instruct-alpaca-lora/tokenizer.model',
 './mistral7b-instruct-alpaca-lora/added_tokens.json',
 './mistral7b-instruct-alpaca-lora/tokenizer.json')

In [44]:
# --------------------------
# Inference comparison: Base vs Fine-tuned (LoRA)
# --------------------------
def build_chat_prompt(user_text: str) -> str:
    # Use the same simple pattern we trained with for consistency:
    return f"User:\n{user_text}\n\nAssistant:\n"

def generate_with_model(model, tok, prompt: str, max_new_tokens=256, temperature=0.7, top_p=0.9):
    input_ids = tok(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tok.eos_token_id,
        )
    text = tok.decode(out[0], skip_special_tokens=True)
    # Return only the assistant continuation after our prompt marker:
    if "Assistant:" in text:
        return text.split("Assistant:", 1)[1].strip()
    return text

def load_base_pipeline():
    # Fresh base model for a clean comparison (no adapters)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
    )
    return model

def load_finetuned_pipeline():
    # Load base, then attach LoRA adapter we just trained
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
    )
    model = PeftModel.from_pretrained(model, OUTPUT_DIR)
    return model

In [45]:
print("\n--- Inference: Comparing Base vs Fine-tuned ---")
base_infer_model = load_base_pipeline()
ft_infer_model = load_finetuned_pipeline()

for i, uprompt in enumerate(TEST_PROMPTS, start=1):
    print(f"\n=== Prompt {i} ===")
    print(">> User:", uprompt)

    prompt = build_chat_prompt(uprompt)

    base_resp = generate_with_model(base_infer_model, tokenizer, prompt)
    print("\n[BASE MODEL RESPONSE]")
    print(base_resp)

    ft_resp = generate_with_model(ft_infer_model, tokenizer, prompt)
    print("\n[FINE-TUNED (LoRA) RESPONSE]")
    print(ft_resp)

print("\nDone.")


--- Inference: Comparing Base vs Fine-tuned ---


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


=== Prompt 1 ===
>> User: Write a short, friendly email to thank a colleague for helping you debug a tricky issue.

[BASE MODEL RESPONSE]
Subject: Heartfelt thanks for your assistance with the issue

Dear [Colleague's Name],

I hope this email finds you well. I wanted to take a moment to express my gratitude for the help you provided me with the debugging issue we encountered recently. Your patience, expertise, and willingness to work together made all the difference.

I must admit, I was quite stumped by the problem, and I couldn't have solved it without your assistance. The issue has been resolved, and I am truly grateful for your support.

In the future, if I can ever be of help to you, please don't hesitate to ask. I am always here to lend a hand.

Once again, thank you for your help. I look forward to working with you on our next project.

Best regards,

[Your Name]

[FINE-TUNED (LoRA) RESPONSE]
Subject: Thank you for your help debugging the issue

Dear [Colleague's name],

I wan