<a href="https://colab.research.google.com/github/DataSavvyYT/AI-engineering-course/blob/main/06_fine_tuning_llm/1_quantize_gemma_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install libraries
!pip install -q transformers torch bitsandbytes accelerate huggingface_hub

In [None]:
import torch
import gc
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

In [None]:
# 2. Login to Hugging Face (Required for Gemma)
# Enter your token when prompted (get it from huggingface.co/settings/tokens).
# login() is called without arguments so the token is typed in at run time
# rather than being written into the notebook source.
login()

In [None]:
# Hugging Face Hub repository id of the checkpoint; the same id is passed to
# from_pretrained() for both the float16 and the 4-bit load.
model_id = "google/gemma-2b"

def clean_memory():
    """Reclaim memory held by objects that are no longer referenced.

    Runs Python's garbage collector and then asks PyTorch to empty its
    CUDA cache. Takes no arguments and returns None.
    """
    # Collect first, then empty the cache (same order as before).
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Section banner: blank line, rule, title, rule (one item per output line).
print("\n" + "=" * 40, "PHASE 1: Standard Model (Float16)", "=" * 40, sep="\n")

In [None]:
# Baseline: load the checkpoint without quantization.
# Weights are requested in float16 rather than float32, since half precision
# is the usual way modern LLMs are loaded; device_map="auto" leaves device
# placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
# Size of the float16 model.
# The footprint is divided by 2**30 (the same value as 1024**3) so that it is
# reported in the "GB" unit shown in the message below.
mem_fp16 = model.get_memory_footprint() / (1 << 30)
print(f"Standard Memory Footprint: {mem_fp16:.2f} GB")

In [None]:
# Drop the reference to the float16 model and reclaim its memory before the
# quantized copy is loaded, so the two models are not resident together.
del model
clean_memory()

# Section banner: blank line, rule, title, rule (one item per output line).
print("\n" + "=" * 40, "PHASE 2: Quantized Model (4-bit)", "=" * 40, sep="\n")

In [None]:
# Quantization settings: load the weights in 4-bit and use float16 as the
# compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Same checkpoint as the baseline, this time loaded through the 4-bit config.
model_q = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Size of the 4-bit model, using the same divisor (2**30 == 1024**3) as the
# float16 measurement so the two numbers are directly comparable.
mem_4bit = model_q.get_memory_footprint() / (1 << 30)
print(f"Quantized Memory Footprint: {mem_4bit:.2f} GB")

# Comparison: how many times smaller the quantized model is than the float16
# baseline (mem_fp16 comes from the earlier measurement cell).
print("-" * 30, f"Memory Savings: {mem_fp16 / mem_4bit:.1f}x smaller", "-" * 30, sep="\n")