## PTQ (Post Training Quantization)

### Dynamic PTQ

In this type of quantization, we manually perform the weight quantization after the training of the model without any retraining or without any calibration.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
X,y = make_moons(n_samples=5000,noise=0.5,random_state=42)

In [3]:
X

array([[ 0.64527536,  1.38251014],
       [ 0.14514823, -0.32157033],
       [ 0.11945131,  0.41631146],
       ...,
       [ 0.6360473 ,  0.66530771],
       [ 1.61542641, -0.24249711],
       [ 0.10599548,  1.0899585 ]])

In [4]:
X.shape

(5000, 2)

In [5]:
X = StandardScaler().fit_transform(X)

In [6]:
X = torch.tensor(X, dtype=torch.float32)

In [7]:
y = torch.tensor(y.reshape(-1,1), dtype=torch.float32)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

### Creating the MLP Model with 6 hidden layers

In [9]:
class MLP(nn.Module):
  def __init__(self):
    super().__init__()

    self.fc1 = nn.Linear(2,128)
    self.fc2 = nn.Linear(128,64)
    self.fc3 = nn.Linear(64,64)
    self.fc4 = nn.Linear(64,32)
    self.fc5 = nn.Linear(32,16)
    self.fc6 = nn.Linear(16,8)
    self.fc7 = nn.Linear(8,1)

  def forward(self,x):
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    x = F.relu(self.fc4(x))
    x = F.relu(self.fc5(x))
    x = F.relu(self.fc6(x))
    return torch.sigmoid(self.fc7(x))

In [10]:
model_fp32 = MLP()

In [11]:
model_fp32.parameters()

<generator object Module.parameters at 0x7a1a6f7b7220>

In [12]:
optimizer = torch.optim.Adam(model_fp32.parameters(), lr=0.01)

In [13]:
loss_fn = nn.BCELoss()

In [14]:
for epoch in range(2000):
  model_fp32.train()
  optimizer.zero_grad()  # Initializing the gradient.
  out = model_fp32(X_train)
  loss = loss_fn(out, y_train)
  loss.backward()
  optimizer.step()
  if epoch % 500 == 0:
    print(f"Epoch: {epoch} Loss: {loss}")

Epoch: 0 Loss: 0.6934294700622559
Epoch: 500 Loss: 0.37679675221443176
Epoch: 1000 Loss: 0.3685001730918884
Epoch: 1500 Loss: 0.35359877347946167


In [15]:
def accuracy(model, X, y):
  model.eval()
  with torch.no_grad():
    preds = model(X)
    preds = (preds > 0.5).float()
    return (preds == y).float().mean().item()

In [16]:
print("\nFP32 Accracy = ", accuracy(model_fp32, X_test, y_test))


FP32 Accracy =  0.8119999766349792


### Dynamic Quanitzation

In [17]:
from torch.quantization import quantize_dynamic

model_int8 = quantize_dynamic(
    model_fp32,
    {nn.Linear},
    dtype = torch.qint8
)

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_int8 = quantize_dynamic(


In [18]:
print("INT8 Quantized Model Accuracy: ", accuracy(model_int8, X_test, y_test))

INT8 Quantized Model Accuracy:  0.8069999814033508


#### Saving the Models

In [19]:
import os
torch.save(model_fp32.state_dict(), "model_fp32.pt")
torch.save(model_int8.state_dict(), "model_int8.pt")

#### Comparing the sizes of both the models

In [20]:
print("FP32 Model Size: ", os.path.getsize("model_fp32.pt") / 1e6, "MB")
print("INT8 Model Size: ", os.path.getsize("model_int8.pt") / 1e6, "MB")

FP32 Model Size:  0.067317 MB
INT8 Model Size:  0.026101 MB


### Manual Quantization Logic

In [25]:
def quantize_tensor(t, num_bits = 8):
  qmin = -2**(num_bits - 1)
  qmax = 2**(num_bits - 1) - 1
  min_val, max_val = t.min(), t.max()
  scale = (max_val - min_val) / float(qmax - qmin + 1e-8)
  zp = torch.round(-min_val / scale).to(torch.int32)
  q_t = torch.clamp(torch.round(t / scale) + zp, qmin, qmax).to(torch.int32)
  return q_t, scale, zp

### Manual Dequantization Logic

In [26]:
def dequantize_tensor(q_t, scale, zp):
  return (q_t.float() - zp) * scale

In [27]:
new_model = MLP()

In [28]:
for name, param in new_model.named_parameters():
  print(name)
  print(param.shape)

fc1.weight
torch.Size([128, 2])
fc1.bias
torch.Size([128])
fc2.weight
torch.Size([64, 128])
fc2.bias
torch.Size([64])
fc3.weight
torch.Size([64, 64])
fc3.bias
torch.Size([64])
fc4.weight
torch.Size([32, 64])
fc4.bias
torch.Size([32])
fc5.weight
torch.Size([16, 32])
fc5.bias
torch.Size([16])
fc6.weight
torch.Size([8, 16])
fc6.bias
torch.Size([8])
fc7.weight
torch.Size([1, 8])
fc7.bias
torch.Size([1])


In [29]:
# torch.no_grad() -- This means that training is not required and we can simply proceed with quanitzation of the trained weights.

with torch.no_grad():
  for (name_fp, param_fp), (name_q, param_q) in zip(model_fp32.named_parameters(), new_model.named_parameters()):
    q_param, scale, zp = quantize_tensor(param_fp.data)
    dq_param = dequantize_tensor(q_param, scale, zp)
    param_q.data = dq_param

In [31]:
print("\nINT8 Accuracy: ", accuracy(new_model, X_test, y_test))


INT8 Accuracy:  0.47600001096725464
