# Quantization Notebook
This notebook demonstrates post-training dynamic quantization of a Transformer model using PyTorch.

In [None]:
import os
import tempfile

import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer

# Load the pretrained DistilBERT checkpoint and switch it to evaluation mode.
# nn.Module.eval() returns the module itself, so it can be chained on the load.
model_name = "distilbert-base-uncased"
model = DistilBertModel.from_pretrained(model_name).eval()

# Function to get model size
def get_model_size_mb(model):
    """Return the serialized size of ``model``'s state dict.

    The state dict is written with ``torch.save`` into a private temporary
    directory, measured, and discarded. Nothing is created in (or removed
    from) the current working directory, and the scratch file is cleaned up
    even if saving or measuring raises.

    Args:
        model: Any object exposing ``state_dict()`` (e.g. an ``nn.Module``).

    Returns:
        float: Size of the saved file in units of 1024 * 1024 bytes.
    """
    # The directory (and the checkpoint inside it) is removed when the
    # ``with`` block exits, whether normally or through an exception.
    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "temp.pth")
        torch.save(model.state_dict(), checkpoint_path)
        return os.path.getsize(checkpoint_path) / (1024 * 1024)

# Baseline: serialized size of the original FP32 weights.
fp32_size_mb = get_model_size_mb(model)
print(f"Model size (FP32): {fp32_size_mb:.2f} MB")

# Post-training dynamic quantization: only nn.Linear modules are targeted,
# and they are converted using the qint8 dtype.
layers_to_quantize = {nn.Linear}
quantized_model = torch.quantization.quantize_dynamic(
    model, layers_to_quantize, dtype=torch.qint8
)

# Same measurement on the quantized copy for a side-by-side comparison.
int8_size_mb = get_model_size_mb(quantized_model)
print(f"Model size (INT8): {int8_size_mb:.2f} MB")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model size (FP32): 253.18 MB
Model size (INT8): 131.71 MB


## Inference Demonstration
Run the same input through the FP32 and the quantized model and check that both produce outputs of the same shape:

In [2]:
# Load the tokenizer from the same checkpoint name as the model (`model_name`)
# so the two cannot drift apart if the checkpoint is ever changed.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
inputs = tokenizer("Optimization is essential!", return_tensors="pt")

# Inference only: disable gradient tracking for both forward passes.
with torch.no_grad():
    output_fp32 = model(**inputs)
    output_int8 = quantized_model(**inputs)

# Print the hidden-state shape produced by each model for comparison.
print("FP32 output shape:", output_fp32.last_hidden_state.shape)
print("INT8 output shape:", output_int8.last_hidden_state.shape)

FP32 output shape: torch.Size([1, 6, 768])
INT8 output shape: torch.Size([1, 6, 768])
