<a href="https://colab.research.google.com/github/DataSavvyYT/AI-engineering-course/blob/main/07_fine_tuning_llm/00_qlora_with_training_and_HF_save.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Run this cell first to install required libraries
!pip install -q -U torch bitsandbytes transformers peft accelerate
!pip install -q datasets trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m110.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [28]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login

In [3]:
from google.colab import userdata
HF_TOKEN=userdata.get('HF_TOKEN')

In [4]:
# 1. Login to Hugging Face (Required for Gemma)
# Replace 'YOUR_HF_TOKEN' with your actual token
login(token=HF_TOKEN)


In [5]:
# 2. Configuration for 4-bit Quantization (QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,              # Enable 4-bit loading
    bnb_4bit_quant_type="nf4",      # Normalized Float 4 (optimized for LLMs)
    bnb_4bit_compute_dtype=torch.float16, # Compute in float16 for speed
    bnb_4bit_use_double_quant=True, # Quantize the quantization constants
)

In [6]:
# 3. Load the Model & Tokenizer
model_id = "google/gemma-2b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto" # Automatically maps to T4 GPU
)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [7]:
# 4. Prepare model for QLoRA training
# This freezes base weights and prepares layers for low-bit training
model = prepare_model_for_kbit_training(model)


In [8]:
# 5. Define LoRA Config
# Gemma target modules usually include q_proj, k_proj, v_proj, o_proj, etc.
peft_config = LoraConfig(
    r=8,                            # Rank (lower = faster, less memory)
    lora_alpha=16,                  # Alpha scaling
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [9]:
# 6. Apply LoRA Adapters
model = get_peft_model(model, peft_config)

In [10]:
# 7. Verify Trainable Parameters
model.print_trainable_parameters()

trainable params: 1,843,200 || all params: 2,508,015,616 || trainable%: 0.0735


In [11]:
# Test Inference (sanity check)
input_text = "Explain quantum physics in one sentence."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

In [12]:
outputs = model.generate(**input_ids, max_new_tokens=50)
print("\nOutput:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))


Output:
 Explain quantum physics in one sentence.

Quantum physics is the study of the behavior of matter and energy at the atomic and subatomic levels, where particles exhibit both wave-like and particle-like behavior.


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/465.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m460.8/465.5 kB[0m [31m20.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [19]:
from datasets import load_dataset

In [20]:
# 7. Load a Sample Dataset
# We use a small quote dataset for a quick demo.
# In a real scenario, this would be your custom data.
dataset = load_dataset("Abirate/english_quotes")

README.md: 0.00B [00:00, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [30]:
# Set Training Arguments
# We use SFTConfig (which inherits from TrainingArguments) to pass SFT-specific params
training_args = SFTConfig(
    output_dir="./gemma-finetuned",
    dataset_text_field="quote",     # Moved here (Fixes the TypeError)
    #max_seq_length=512,             # Moved here
    per_device_train_batch_size=2,  # Keep low for Colab T4
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch
    optim="paged_adamw_8bit",       # Memory efficient optimizer
    logging_steps=5,                # Log loss often
    learning_rate=2e-4,
    max_steps=50,                   # Small step count for demo (approx 2-5 mins)
    fp16=True,                      # Use mixed precision
    save_strategy="no",             # Don't save checkpoints during this quick demo
    report_to="none"                # Disable wandb logging for simplicity
)

In [31]:
# Initialize the Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    args=training_args,
    peft_config=peft_config,
)



Tokenizing train dataset:   0%|          | 0/2508 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [32]:
# Start Training
print("Starting training... Watch the loss decrease!")
trainer.train()

Starting training... Watch the loss decrease!


  return fn(*args, **kwargs)


Step,Training Loss
5,5.4259
10,4.117
15,4.11
20,3.915
25,2.9832
30,2.922
35,2.8818
40,3.0966
45,3.0497
50,3.0785


TrainOutput(global_step=50, training_loss=3.557955951690674, metrics={'train_runtime': 88.5041, 'train_samples_per_second': 4.52, 'train_steps_per_second': 0.565, 'total_flos': 275492156399616.0, 'train_loss': 3.557955951690674, 'epoch': 0.1594896331738437})

In [33]:
# ==========================================
# 4. INFERENCE CHECK (AFTER TRAINING)
# ==========================================
# Test the model after training
input_text = "Ask not what your country"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=50)
print("\nOutput after training:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Caching is incompatible with gradient checkpointing in GemmaDecoderLayer. Setting `past_key_values=None`.



Output after training:
 Ask not what your country can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can can


In [34]:
# ==========================================
# 5. PUSH TO HUGGING FACE HUB
# ==========================================
# This will create a repository in your Hugging Face account and upload the adapters.
# Since we used QLoRA, this upload will be very small (only the adapter weights, ~10-100MB).

repo_name = "gemma-2b-qlora-finetuned" # You can change this name

print(f"\nPushing adapters to Hugging Face: {repo_name}...")
try:
    model.push_to_hub(repo_name)
    tokenizer.push_to_hub(repo_name)
    print(f"Success! Model pushed to https://huggingface.co/YOUR_USERNAME/{repo_name}")
except Exception as e:
    print(f"Error pushing to hub: {e}")


Pushing adapters to Hugging Face: gemma-2b-qlora-finetuned...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   8%|7         |  565kB / 7.39MB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...pned9j7vx/tokenizer.model: 100%|##########| 4.24MB / 4.24MB            

  ...mpned9j7vx/tokenizer.json:  97%|#########7| 33.4MB / 34.4MB            

Success! Model pushed to https://huggingface.co/YOUR_USERNAME/gemma-2b-qlora-finetuned
