<a href="https://colab.research.google.com/github/DataSavvyYT/AI-engineering-course/blob/main/7_fine_tuning_llm/00_qlora_with_training_and_HF_save.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell first to install required libraries
!pip install -q -U torch bitsandbytes transformers peft accelerate
!pip install -q datasets trl

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login

In [None]:
# Fetch the value stored under the key 'HF_TOKEN' through Colab's `userdata`
# helper, so the token itself never appears in the notebook source.
from google.colab import userdata
HF_TOKEN=userdata.get('HF_TOKEN')

In [None]:
# 1. Login to Hugging Face (Required for Gemma)
# Authenticates with the HF_TOKEN value read from Colab's userdata in the
# previous cell; no token needs to be pasted into this cell.
login(token=HF_TOKEN)


In [None]:
# 2. Configuration for 4-bit Quantization (QLoRA)
# Collects the bitsandbytes options used when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,  # run computations in float16
    bnb_4bit_quant_type="nf4",             # NF4 ("Normalized Float 4") quantization
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    load_in_4bit=True,                     # turn on 4-bit loading
)

In [None]:
# 3. Load the Model & Tokenizer
# One checkpoint id is shared by the tokenizer and the model weights.
model_id = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The weights are loaded with the 4-bit settings held in `bnb_config`;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=bnb_config
)

In [None]:
# 4. Prepare model for QLoRA training
# Passes the quantized model through PEFT's k-bit preparation helper and
# rebinds `model` to the returned object. Per the original author's note, this
# freezes the base weights and prepares layers for low-bit training.
model = prepare_model_for_kbit_training(model)


In [None]:
# 5. Define LoRA Config
# Adapters are attached to the q/k/v/o projection modules named below
# (the modules usually targeted for Gemma).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=8,                # adapter rank (lower = faster, less memory)
    lora_alpha=16,      # alpha scaling
    lora_dropout=0.05,  # dropout applied inside the adapters
    bias="none",        # bias setting passed to LoRA
)

In [None]:
# 6. Apply LoRA Adapters
# Wraps the prepared model with the adapters described by `peft_config` and
# rebinds `model` to the wrapped model, which is what later cells use.
model = get_peft_model(model, peft_config)

In [None]:
# 7. Verify Trainable Parameters
# Prints the wrapped model's trainable-parameter summary so you can check
# what will actually be trained.
model.print_trainable_parameters()

In [None]:
# Test Inference (sanity check)
# Tokenize a short prompt and move the resulting tensors to the device the
# model actually lives on. The model was placed with device_map="auto", so the
# target device is read from the model instead of being hard-coded as "cuda".
input_text = "Explain quantum physics in one sentence."
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

In [None]:
# Generate up to 50 new tokens from the tokenized prompt, then print the
# decoded text of the first returned sequence with special tokens stripped.
outputs = model.generate(**input_ids, max_new_tokens=50)
print("\nOutput:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
from datasets import load_dataset

In [None]:
# 8. Load a Sample Dataset
# We use a small quote dataset for a quick demo.
# In a real scenario, this would be your custom data.
# The result is indexed by split name later on (dataset["train"]).
dataset = load_dataset("Abirate/english_quotes")

In [None]:
# Set Training Arguments
# SFTConfig (which inherits from TrainingArguments) carries both the generic
# training options and the SFT-specific ones, such as the dataset text column.
# No explicit sequence-length cap is passed, so the library default applies.
training_args = SFTConfig(
    # --- data / output ---
    output_dir="./gemma-finetuned",
    dataset_text_field="quote",     # dataset column that holds the training text
    # --- batching ---
    per_device_train_batch_size=2,  # kept low for a Colab T4
    gradient_accumulation_steps=4,  # accumulate gradients to simulate a larger batch
    # --- optimisation ---
    optim="paged_adamw_8bit",       # memory-efficient optimizer
    learning_rate=2e-4,
    max_steps=50,                   # small step count for a quick demo
    fp16=True,                      # mixed-precision training
    # --- logging / checkpoints ---
    logging_steps=5,                # log the loss every 5 steps
    save_strategy="no",             # no checkpoints during this demo
    report_to="none",               # no external experiment tracking
)

In [None]:
# Initialize the Trainer
# `model` was already wrapped with the LoRA adapters by get_peft_model(), so it
# is passed as-is and `peft_config` is deliberately NOT passed a second time.
# Giving the trainer an already-wrapped PEFT model together with a peft_config
# is redundant at best; depending on the installed TRL version the config may
# be ignored, or the existing adapter may be merged/unloaded and a fresh one
# attached, which would leave the notebook-level `model` out of sync with the
# model that is actually trained.
trainer = SFTTrainer(
    model=model,                     # PEFT-wrapped, 4-bit base model
    train_dataset=dataset["train"],  # "train" split of the loaded dataset
    args=training_args,
)

In [None]:
# Start Training
# Runs the training loop configured on `trainer`; with the settings above the
# loss is logged periodically while it runs.
print("Starting training... Watch the loss decrease!")
trainer.train()

In [None]:
# ==========================================
# INFERENCE CHECK (AFTER TRAINING)
# ==========================================
# Use the model held by the trainer so this check always runs against the
# weights that were just trained.
finetuned_model = trainer.model

# Training leaves the module in train mode, which keeps the LoRA dropout
# (lora_dropout=0.05) active. Switch to eval mode so generation is not
# perturbed by dropout.
finetuned_model.eval()

input_text = "Ask not what your country"
# Put the inputs on the model's own device instead of hard-coding "cuda".
input_ids = tokenizer(input_text, return_tensors="pt").to(finetuned_model.device)

outputs = finetuned_model.generate(**input_ids, max_new_tokens=50)
print("\nOutput after training:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# ==========================================
# PUSH TO HUGGING FACE HUB
# ==========================================
# This will create a repository in your Hugging Face account and upload the adapters.
# Since we used QLoRA, this upload will be very small (only the adapter weights, ~10-100MB).
from huggingface_hub import whoami

repo_name = "gemma-2b-qlora-finetuned" # You can change this name

print(f"\nPushing adapters to Hugging Face: {repo_name}...")
try:
    # Push the model held by the trainer so the uploaded adapters are the ones
    # that were actually trained.
    trainer.model.push_to_hub(repo_name)
    tokenizer.push_to_hub(repo_name)
except Exception as e:
    # Best-effort: report the failure without aborting the notebook run.
    print(f"Error pushing to hub: {e}")
else:
    # Look up the logged-in account so the printed link is a real URL rather
    # than a placeholder; fall back to the placeholder if the lookup fails.
    try:
        owner = whoami()["name"]
    except Exception:
        owner = "YOUR_USERNAME"
    print(f"Success! Model pushed to https://huggingface.co/{owner}/{repo_name}")