<a href="https://colab.research.google.com/github/Agatsyadav2003/quantized-llm-mobile/blob/main/Quantization_nf4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Llama 3.2 3B Quantization for Android
# This notebook demonstrates how to quantize the Llama 3.2 3B model
# for deployment on Android devices.

import os
import torch
import time
import psutil
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from huggingface_hub import snapshot_download
import numpy as np
from pathlib import Path


# Helper: total on-disk footprint of a saved model directory
def get_model_size(model_path):
    """Return the combined size of all files below `model_path`, in GB.

    The directory tree is walked recursively and every entry for which
    `is_file()` is true contributes its `st_size`. The byte total is divided
    by 1024**3 before being returned.
    """
    bytes_per_gb = 1024**3
    file_sizes = (
        entry.stat().st_size
        for entry in Path(model_path).rglob('*')
        if entry.is_file()
    )
    return sum(file_sizes) / bytes_per_gb

# Make sure the folder that will receive the quantized artifacts is present
Path("quantized_model").mkdir(parents=True, exist_ok=True)

In [None]:
# 1. Download the original model
print("Downloading the original Llama 3.2 3B model...")
model_id = "meta-llama/Llama-3.2-3B"

# The tokenizer is saved into "original_model" because the GGUF conversion
# step later reads that directory. Its size is measured from the files that
# save_pretrained() reports having written, NOT from the whole directory:
# when "original_model" is left over from an earlier run it already contains
# the model weights, and a directory-wide measurement would report those
# weights as the tokenizer size.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer_files = tokenizer.save_pretrained("original_model")  # tuple of saved file paths
orig_tokenizer_size = sum(
    Path(saved).stat().st_size
    for saved in tokenizer_files
    if Path(saved).is_file()
) / (1024**3)  # Convert to GB
print(f"Original tokenizer size: {orig_tokenizer_size:.2f} GB")

# Load the weights in BF16 so they take 2 bytes per parameter in memory
print("\nLoading the model with BF16 precision...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Save the original model to measure its size.
# The directory also holds the tokenizer files written above, so this figure
# is "weights + tokenizer" (the quantized export is measured the same way).
print("\nSaving the original model...")
model.save_pretrained("original_model")
orig_model_size = get_model_size("original_model")
print(f"Original model size: {orig_model_size:.2f} GB")

# Free up memory: drop the reference, collect unreachable objects first so
# their tensors are actually released, and only then empty the CUDA cache.
# Ending the cell on empty_cache() (which returns None) also keeps the
# gc.collect() object count out of the cell output.
del model
gc.collect()
torch.cuda.empty_cache()

Downloading the original Llama 3.2 3B model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Original tokenizer size: 6.00 GB

Loading the model with BF16 precision...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Saving the original model...
Original model size: 6.00 GB


479

In [None]:
!pip install --upgrade bitsandbytes
!pip install --upgrade accelerate transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

Collecting accelerate
  Downloading accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading accelerate-1.5.2-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m119.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, accelerate
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.3
    Uninstalling transformers-4.48.3:
^C


In [None]:
# 2. Quantize the model to 4-bit
print("\nQuantizing the model to 4-bit...")

# bitsandbytes settings: 4-bit loading, NF4 quant type, float16 compute dtype,
# double quantization switched on
nf4_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
}
quantization_config = BitsAndBytesConfig(**nf4_settings)

# Load the checkpoint again, this time with the quantization config applied
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config, device_map="auto"
)


Quantizing the model to 4-bit...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# 3. Export the quantized model
print("\nExporting the quantized model...")

# Quantized weights first, then the tokenizer, both into the same folder
for artifact in (model_4bit, tokenizer):
    artifact.save_pretrained("quantized_model/4bit")

# Measure the exported folder and report the saving relative to the original
quant_model_size = get_model_size("quantized_model/4bit")
print(f"Quantized model size: {quant_model_size:.2f} GB")
print(f"Size reduction: {(1 - quant_model_size / orig_model_size) * 100:.2f}%")


Exporting the quantized model...
Quantized model size: 2.10 GB
Size reduction: 64.92%


In [None]:
# 4. Validate the quantized model with a quick inference test
print("\nValidating the quantized model with a quick inference test...")
input_text = "Hello, my name is"
# Despite its name, `input_ids` holds the whole tokenizer encoding; it is
# unpacked into generate() below. The name is kept in case later cells use it.
input_ids = tokenizer(input_text, return_tensors="pt").to(model_4bit.device)

# Sampling draws random numbers: fix the seed so a re-run yields the same text.
torch.manual_seed(0)

# Measure inference time.
# perf_counter() is a monotonic high-resolution clock. CUDA work is queued
# asynchronously, so synchronize before reading the clock on either side.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.no_grad():
    output = model_4bit.generate(
        **input_ids,
        max_new_tokens=20,
        # temperature/top_p only apply in sampling mode; ask for it explicitly
        # instead of depending on the checkpoint's default generation config.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        # The tokenizer defines no pad token; naming EOS here is what generate()
        # falls back to anyway, and passing it silences the per-call warning.
        pad_token_id=tokenizer.eos_token_id,
    )
if torch.cuda.is_available():
    torch.cuda.synchronize()
end_time = time.perf_counter()

# output[0] is the single generated sequence (prompt tokens included)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Input: {input_text}")
print(f"Output: {decoded_output}")
print(f"Inference time: {end_time - start_time:.2f} seconds")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Validating the quantized model with a quick inference test...
Input: Hello, my name is
Output: Hello, my name is Peter. I'm a 35-year-old computer geek from the United States. I've been programming
Inference time: 2.21 seconds


In [None]:
!ls /content/llama.cpp/build/bin/

libggml-base.so		       llama-llava-clip-quantize-cli  llama-vdot
libggml-cpu.so		       llama-lookahead		      test-arg-parser
libggml.so		       llama-lookup		      test-autorelease
libllama.so		       llama-lookup-create	      test-backend-ops
libllava_shared.so	       llama-lookup-merge	      test-barrier
llama-batched		       llama-lookup-stats	      test-c
llama-batched-bench	       llama-minicpmv-cli	      test-chat
llama-bench		       llama-parallel		      test-chat-template
llama-cli		       llama-passkey		      test-gguf
llama-convert-llama2c-to-ggml  llama-perplexity		      test-grammar-integration
llama-cvector-generator        llama-q8dot		      test-grammar-parser
llama-embedding		       llama-quantize		      test-json-schema-to-grammar
llama-eval-callback	       llama-quantize-stats	      test-llama-grammar
llama-export-lora	       llama-qwen2vl-cli	      test-log
llama-gbnf-validator	       llama-retrieval		      test-model-load-cancel
llama-gemma3-cli	       llama-run	

In [None]:
# 5. Export the model for GGUF format (compatible with llama.cpp)
print("\nExporting the model to GGUF format...")

# Install necessary dependencies
!pip install -q sentencepiece cmake

# Clone the llama.cpp repository
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp
!mkdir build && cd build && cmake .. && make -j
%cd ..

# Convert the model to GGUF format using the updated script
!python llama.cpp/convert_hf_to_gguf.py --outfile "quantized_model/llama_3.2_3b.gguf" "original_model"

# Quantize the GGUF model to 4-bit using the updated quantization tool
!llama.cpp/build/bin/llama-quantize "quantized_model/llama_3.2_3b.gguf" "quantized_model/llama_3.2_3b_q4_k_m.gguf" q4_k_m

# Check the size of the final quantized model
gguf_model_size = os.path.getsize("quantized_model/llama_3.2_3b_q4_k_m.gguf") / (1024**3)
print(f"GGUF 4-bit quantized model size: {gguf_model_size:.2f} GB")
print(f"Size reduction compared to original: {(1 - gguf_model_size / orig_model_size) * 100:.2f}%")


Exporting the model to GGUF format...
[0mfatal: destination path 'llama.cpp' already exists and is not an empty directory.
/content/llama.cpp
mkdir: cannot create directory ‘build’: File exists
/content
INFO:hf-to-gguf:Loading model: original_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> F16, shape = {3072, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> F16, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torc

In [None]:
# 6. Create a ZIP file for easy download
print("\nCreating a ZIP file for easy download...")
!zip -r quantized_model.zip quantized_model/4bit quantized_model/llama_3.2_3b_q4_k_m.gguf

print("\nModel quantization complete!")
print("Download the quantized_model.zip file to use in your Android app.")

# Summary of model sizes
print("\nModel size summary:")
print(f"Original model size: {orig_model_size:.2f} GB")
print(f"4-bit quantized model size (transformers): {quant_model_size:.2f} GB")
print(f"4-bit quantized model size (GGUF): {gguf_model_size:.2f} GB")


Creating a ZIP file for easy download...
  adding: quantized_model/4bit/ (stored 0%)
  adding: quantized_model/4bit/model.safetensors (deflated 10%)
  adding: quantized_model/4bit/tokenizer_config.json (deflated 96%)
  adding: quantized_model/4bit/config.json (deflated 56%)
  adding: quantized_model/4bit/generation_config.json (deflated 32%)
  adding: quantized_model/4bit/special_tokens_map.json (deflated 62%)
  adding: quantized_model/4bit/tokenizer.json (deflated 85%)
  adding: quantized_model/llama_3.2_3b_q4_k_m.gguf (deflated 2%)

Model quantization complete!
Download the quantized_model.zip file to use in your Android app.

Model size summary:
Original model size: 6.00 GB
4-bit quantized model size (transformers): 2.10 GB
4-bit quantized model size (GGUF): 1.88 GB


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Move the ZIP file to your Google Drive
!cp quantized_model.zip /content/drive/MyDrive/


Mounted at /content/drive
