<a href="https://colab.research.google.com/github/DataSavvyYT/AI-engineering-course/blob/main/06_fine_tuning_llm/1_quantize_gemma_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install libraries
!pip install -q transformers torch bitsandbytes accelerate huggingface_hub

In [None]:
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

In [None]:
# 2. Login to Hugging Face (required for Gemma)
# login() is called with no arguments; enter your access token when prompted
# (create one at huggingface.co/settings/tokens).
# NOTE(review): Gemma is presumably a gated model, so the account behind the
# token likely also needs to have accepted its license on the Hub - confirm.
login()

In [None]:
model_id = "google/gemma-2-2b-it"


def clean_memory():
    """Free memory: run Python's garbage collector, then empty PyTorch's CUDA cache."""
    # Collect first so objects that are no longer referenced are gone
    # before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()


# Phase banner: a 50-character rule above and below the title
phase_rule = "=" * 50
print(f"\n{phase_rule}")
print("PHASE 1: Standard Model (Float16) - Memory Check")
print(phase_rule)

In [None]:
# Load the standard (float16) model and its tokenizer
print("Loading Standard Model...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Measure size: the reported footprint is divided by 1024**3 and shown as GB
footprint_divisor = 1024**3
mem_fp16 = model.get_memory_footprint() / footprint_divisor
print(f"Standard Memory Footprint: {mem_fp16:.2f} GB")

In [None]:
print("\nTesting Model Intelligence...")
question = "Explain why the sky is blue in one simple sentence."
print(f"Question: {question}\n")

# Format prompt for Gemma.
# return_dict=True yields the input_ids together with their attention_mask, and
# the tensors are moved to the device the model was actually placed on
# (device_map="auto") instead of a hard-coded "cuda".
messages = [{"role": "user", "content": question}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
input_ids = inputs["input_ids"]

# Generate Answer (input_ids and attention_mask are both passed through)
outputs = model.generate(**inputs, max_new_tokens=50)
# Slice off the prompt tokens so only the newly generated text is decoded
response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)

print(f"Answer: {response}")
print("\n(Note: this is non quantized model!)")

In [None]:
# Drop the reference to the standard model, then reclaim the memory it held
del model
clean_memory()
print("Standard model deleted from memory.")

# Phase banner: a 50-character rule above and below the title
phase_rule = "=" * 50
print(f"\n{phase_rule}")
print("PHASE 2: Quantized Model (4-bit) - Memory & Quality Check")
print(phase_rule)

In [None]:
# 4-bit quantization settings: load weights in 4-bit, compute dtype set to float16
quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quant_settings)

# Load the tokenizer and the quantized model using the config above
print("Loading Quantized Model...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_q = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Measure the quantized model's footprint (divided by 1024**3, shown as GB)
mem_4bit = model_q.get_memory_footprint() / 1024**3
print(f"Quantized Memory Footprint: {mem_4bit:.2f} GB")

# How many times larger the float16 footprint is than the 4-bit footprint
savings_ratio = mem_fp16 / mem_4bit
divider = "-" * 30
print(divider)
print(f"MEMORY SAVINGS: {savings_ratio:.1f}x smaller")
print(divider)

In [None]:
# ==========================================
# PHASE 3: Test Quality (Inference)
# ==========================================
print("\nTesting Model Intelligence...")
question = "Explain why the sky is blue in one simple sentence."
print(f"Question: {question}\n")

# Format prompt for Gemma.
# return_dict=True yields the input_ids together with their attention_mask, and
# the tensors are moved to the device the quantized model was actually placed on
# (device_map="auto") instead of a hard-coded "cuda".
messages = [{"role": "user", "content": question}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model_q.device)
input_ids = inputs["input_ids"]

# Generate Answer (input_ids and attention_mask are both passed through)
outputs = model_q.generate(**inputs, max_new_tokens=50)
# Slice off the prompt tokens so only the newly generated text is decoded
response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)

print(f"Answer: {response}")
print("\n(Note: If the answer is coherent, accuracy was preserved!)")