[Video link](https://www.youtube.com/watch?v=0VdNflU08yA&t=17s)

# Assymmetric VS Symmetric Quantization

## create a simple tensor to work with

In [1]:
import numpy as np

# suppress scientific notation and round to 2 decimal places
np.set_printoptions(suppress=2)

# Generate uniform distribution
params = np.random.uniform(low=-50, high=150, size=20)

# make important value at the beginning (for better debugging)
params[0] = params.max() + 1
params[1] = params.min() - 1
params[2] = 0


# print
print(params)

video_params = np.array(
    [43.31, -44.93, 0, 22.99, -43.93, -11.35, 38.48, -20.49, -38.61, -28.02]
)

[147.59776399 -48.74568247   0.         146.59776399 121.51449439
  96.0871144   74.92129314 -28.3979833  128.46395637 106.17554416
 142.6507787  -47.05849674 141.70347562 -11.01861359 123.20054291
 -40.89760992  84.60083393 139.80503778   2.6165592  -47.74568247]


## Define Quantization Functions

In [2]:
def clamp(arr, lower_bound, upper_bound):
    arr[arr < lower_bound] = lower_bound
    arr[arr > upper_bound] = upper_bound
    return arr

In [3]:
def assymmetric_quantization(params, nbits):
    alpha = np.max(params)
    beta = np.min(params)

    scale = (alpha - beta) / ((2**nbits) - 1)
    zero = -1 * np.round(beta / scale)

    quantized = clamp(np.round(params / scale + zero), 0, (2**nbits) - 1).astype(
        np.int32
    )

    return quantized, scale, zero


def assymmetric_dequantization(qparams, scale, zero):
    return scale * (qparams - zero)

In [4]:
print(f"Original params {params}")
quantized, scale, zero = assymmetric_quantization(params, 8)
print(f"Quantized params {quantized}")
dequantized = assymmetric_dequantization(quantized, scale, zero)
print(f"dequantized params {dequantized}")
print(f"MSE: {np.mean((params - dequantized) ** 2)}")

Original params [147.59776399 -48.74568247   0.         146.59776399 121.51449439
  96.0871144   74.92129314 -28.3979833  128.46395637 106.17554416
 142.6507787  -47.05849674 141.70347562 -11.01861359 123.20054291
 -40.89760992  84.60083393 139.80503778   2.6165592  -47.74568247]
Quantized params [255   0  63 253 221 188 160  26 230 201 248   2 247  49 223  10 173 245
  66   1]
dequantized params [147.83506557 -48.50838089   0.         146.29511697 121.65593937
  96.24678748  74.68750708 -28.48904909 128.58570807 106.25645338
 142.44524547 -46.96843229 141.67527117 -10.7796402  123.19588797
 -40.80863789  84.69717298 140.13532257   2.3099229  -47.73840659]
MSE: 0.03313425676780694


In [5]:
def symmetric_quantization(params, nbits):
    alpha = np.max(np.abs(params))
    scale = alpha / ((2 ** (nbits - 1)) - 1)

    lower_bound = -1 * (2 ** (nbits - 1))
    upper_bound = (2 ** (nbits - 1)) - 1

    quantized = clamp(np.round(params / scale), lower_bound, upper_bound).astype(
        np.int32
    )
    return quantized, scale


def symmetric_dequantization(qparams, scale):
    return scale * qparams

In [6]:
print(f"Original params {params}")
quantized, scale = symmetric_quantization(params, 8)
print(f"Quantized params {quantized}")
dequantized = symmetric_dequantization(quantized, scale)
print(f"dequantized params {dequantized}")
print(f"MSE: {np.mean((params - dequantized) ** 2)}")

Original params [147.59776399 -48.74568247   0.         146.59776399 121.51449439
  96.0871144   74.92129314 -28.3979833  128.46395637 106.17554416
 142.6507787  -47.05849674 141.70347562 -11.01861359 123.20054291
 -40.89760992  84.60083393 139.80503778   2.6165592  -47.74568247]
Quantized params [127 -42   0 126 105  83  64 -24 111  91 123 -40 122  -9 106 -35  73 120
   2 -41]
dequantized params [147.59776399 -48.81185896   0.         146.43557687 122.02964739
  96.4615308   74.37997555 -27.89249083 129.0027701  105.75902774
 142.94901552 -46.48748472 141.7868284  -10.45968406 123.19183451
 -40.67654913  84.83965962 139.46245416   2.32437424 -47.64967184]
MSE: 0.12504463234472618


### Choosing Alpha and Beta

In [7]:
# Generate uniform distribution
params = np.random.uniform(low=-50, high=150, size=10000)

# make important value at the beginning (for better debugging)
params[0] = params.max() + 1
params[1] = params.min() - 1
params[2] = 0

# set an outlier
params[-1] = 1000

In [8]:
def assymmetric_quantization_percentile(params, nbits, percentile=99.99):
    alpha = np.percentile(params, percentile)
    beta = np.percentile(params, 100 - percentile)

    scale = (alpha - beta) / ((2**nbits) - 1)
    zero = -1 * np.round(beta / scale)

    quantized = clamp(np.round(params / scale + zero), 0, (2**nbits) - 1).astype(
        np.int32
    )

    return quantized, scale, zero

In [9]:
quantized_minmax, scale_minmax, zero_minmax = assymmetric_quantization(params, 8)
dequantized_minmax = assymmetric_dequantization(
    quantized_minmax, scale_minmax, zero_minmax
)

quantized_percentile, scale_percentile, zero_percentile = (
    assymmetric_quantization_percentile(params, 8)
)
dequantized_percentile = assymmetric_dequantization(
    quantized_percentile, scale_percentile, zero_percentile
)

# the outlier suffers from big quantization error in percentile
print(f"Original values: {params}")
print(f"dequantized minmax: {dequantized_minmax}")
print(f"dequantized percentile: {dequantized_percentile}")

# So MSE is larger in percentile
print(f"MSE: {np.mean((params - dequantized_minmax) ** 2)}")
print(f"MSE: {np.mean((params - dequantized_percentile) ** 2)}")

# MSE execluding outlier
print(f"MSE (no outlier): {np.mean((params[:-1] - dequantized_minmax[:-1]) ** 2)}")
print(f"MSE (no outlier): {np.mean((params[:-1] - dequantized_percentile[:-1]) ** 2)}")

Original values: [ 150.99113313  -50.97723924    0.         ...   47.71223358  -46.29745736
 1000.        ]
dequantized minmax: [ 152.49473667  -49.45775243    0.         ...   49.45775243  -45.33627307
 1001.5194868 ]
dequantized percentile: [151.38136339 -49.67200986   0.         ...  48.09512066 -46.51823146
 151.38136339]
MSE: 1.423691734308575
MSE: 72.06796435785438
MSE (no outlier): 1.4236032106145997
MSE (no outlier): 0.052610579512310615


# Post Training Quantization

In [10]:
# imports
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn as nn

import matplotlib.pyplot as plt
from tqdm import tqdm
import os

## Prepare Dataset

In [36]:
# make torch deterministic
torch.manual_seed(0)

transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

# load mnist dataset
mnist_train = datasets.MNIST(
    root="./data/", train=True, transform=transform, download=True
)
train_loader = torch.utils.data.DataLoader(
    dataset=mnist_train, batch_size=10, shuffle=True
)
mnist_test = datasets.MNIST(
    root="./data/", train=False, transform=transform, download=True
)
test_loader = torch.utils.data.DataLoader(
    dataset=mnist_test, batch_size=10, shuffle=True
)

device = "cpu"

## Architecture, Train and Test

In [37]:
class VerySimplenet(nn.Module):
    def __init__(self, hidden_size_1=100, hidden_size_2=100):
        super().__init__()
        self.linear1 = nn.Linear(28 * 28, hidden_size_1)
        self.linear2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.linear3 = nn.Linear(hidden_size_2, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        return x

In [38]:
net = VerySimplenet().to(device)

In [39]:
def train(model, train_loader, epochs):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    total_iters = 0

    for epoch in range(epochs):
        for idx, (images, labels) in tqdm(enumerate(train_loader)):
            images, labels = images.to(device), labels.to(device)
            loss_sum = 0

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            total_iters += 1

            if total_iters % 1000 == 0:
                print(f"Epoch: {epoch}, Iteration: {total_iters}, Loss: {loss_sum}")

    return model

In [40]:
net = train(net, train_loader, 1)

1013it [00:04, 169.16it/s]

Epoch: 0, Iteration: 1000, Loss: 0.9822236895561218


2029it [00:12, 152.21it/s]

Epoch: 0, Iteration: 2000, Loss: 0.33656615018844604


3016it [00:21, 132.34it/s]

Epoch: 0, Iteration: 3000, Loss: 0.6857504844665527


4020it [00:28, 157.99it/s]

Epoch: 0, Iteration: 4000, Loss: 0.03442932292819023


5019it [00:35, 190.92it/s]

Epoch: 0, Iteration: 5000, Loss: 0.14028537273406982


6000it [00:40, 147.53it/s]

Epoch: 0, Iteration: 6000, Loss: 0.01198307704180479





In [41]:
def test(model, test_loader):
    correct = 0
    total = 0

    model.eval()
    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Accuracy: {100 * correct / total}")

## Weights and Model size (Before Quantization)

In [42]:
def print_model_size(model):
    torch.save(model.state_dict(), "model.pt")
    print(f"Model size: {os.path.getsize('model.pt') / 1024:.2f} KB")
    os.remove("model.pt")

In [43]:
print("Weights before quantization")
print(net.linear1.weight)
print(net.linear1.weight.dtype)

Weights before quantization
Parameter containing:
tensor([[-0.0127,  0.0067, -0.0419,  ...,  0.0095, -0.0087, -0.0104],
        [-0.0197, -0.0149, -0.0104,  ..., -0.0202, -0.0059, -0.0299],
        [ 0.0199,  0.0550,  0.0068,  ...,  0.0197,  0.0413,  0.0481],
        ...,
        [ 0.0278,  0.0316, -0.0031,  ..., -0.0084,  0.0108, -0.0261],
        [ 0.0056,  0.0137,  0.0458,  ...,  0.0261,  0.0261,  0.0256],
        [ 0.0102,  0.0049, -0.0093,  ...,  0.0271, -0.0221, -0.0020]],
       requires_grad=True)
torch.float32


In [44]:
print("Model size before quantization")
print_model_size(net)

Model size before quantization
Model size: 352.36 KB


In [45]:
print("Accuracy Before Quantization")
test(net, test_loader)

Accuracy Before Quantization


100%|██████████| 1000/1000 [00:03<00:00, 287.90it/s]

Accuracy: 96.06





## Insert MinMax Observers in the model

In [46]:
class QuantizedVerySimplenet(nn.Module):
    def __init__(self, hidden_size_1=100, hidden_size_2=100):
        super().__init__()
        self.quant = (
            torch.quantization.QuantStub()
        )  # Quantization function (will be replaced by quantization)
        self.linear1 = nn.Linear(28 * 28, hidden_size_1)
        self.linear2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.linear3 = nn.Linear(hidden_size_2, 10)
        self.relu = nn.ReLU()
        self.dequant = (
            torch.quantization.DeQuantStub()
        )  # Dequantization function (will be replaced by dequantization)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.quant(x)
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        x = self.dequant(x)
        return x

In [47]:
net_quantized = QuantizedVerySimplenet().to(device)
# Copy Weights from original model to quantized model
net_quantized.load_state_dict(net.state_dict())
net_quantized.eval()

net_quantized.qconfig = torch.quantization.default_qconfig
net_quantized = torch.quantization.prepare(net_quantized)  # Insert observers
net_quantized

QuantizedVerySimplenet(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear1): Linear(
    in_features=784, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear2): Linear(
    in_features=100, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear3): Linear(
    in_features=100, out_features=10, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()
)

## Calibrate the model using Test test

In [48]:
test(net_quantized, test_loader)

100%|██████████| 1000/1000 [00:03<00:00, 277.10it/s]

Accuracy: 96.06





In [49]:
print("Check statistics of the various layers")
net_quantized

Check statistics of the various layers


QuantizedVerySimplenet(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=-0.4242129623889923, max_val=2.821486711502075)
  )
  (linear1): Linear(
    in_features=784, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=-53.42708206176758, max_val=38.368324279785156)
  )
  (linear2): Linear(
    in_features=100, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=-29.53961753845215, max_val=25.241788864135742)
  )
  (linear3): Linear(
    in_features=100, out_features=10, bias=True
    (activation_post_process): MinMaxObserver(min_val=-25.761402130126953, max_val=22.6112003326416)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()
)

## Quantize the model using statistics collected

In [50]:
net_quantized = torch.quantization.convert(net_quantized)  # Convert to quantized model
net_quantized

QuantizedVerySimplenet(
  (quant): Quantize(scale=tensor([0.0256]), zero_point=tensor([17]), dtype=torch.quint8)
  (linear1): QuantizedLinear(in_features=784, out_features=100, scale=0.7227985262870789, zero_point=74, qscheme=torch.per_tensor_affine)
  (linear2): QuantizedLinear(in_features=100, out_features=100, scale=0.43134966492652893, zero_point=68, qscheme=torch.per_tensor_affine)
  (linear3): QuantizedLinear(in_features=100, out_features=10, scale=0.38088664412498474, zero_point=68, qscheme=torch.per_tensor_affine)
  (relu): ReLU()
  (dequant): DeQuantize()
)

## Weights and Model size (After Quantization)

In [51]:
print("Weights after quantization")
print(torch.int_repr(net_quantized.linear1.weight()))

Weights after quantization
tensor([[-3,  1, -9,  ...,  2, -2, -2],
        [-4, -3, -2,  ..., -4, -1, -6],
        [ 4, 11,  1,  ...,  4,  8, 10],
        ...,
        [ 6,  6, -1,  ..., -2,  2, -5],
        [ 1,  3,  9,  ...,  5,  5,  5],
        [ 2,  1, -2,  ...,  6, -5,  0]], dtype=torch.int8)


In [52]:
# compare original weights and dequantized weights
print("Original weights")
print(net.linear1.weight)
print("Dequantized weights")
print(torch.dequantize(net_quantized.linear1.weight()))

Original weights
Parameter containing:
tensor([[-0.0127,  0.0067, -0.0419,  ...,  0.0095, -0.0087, -0.0104],
        [-0.0197, -0.0149, -0.0104,  ..., -0.0202, -0.0059, -0.0299],
        [ 0.0199,  0.0550,  0.0068,  ...,  0.0197,  0.0413,  0.0481],
        ...,
        [ 0.0278,  0.0316, -0.0031,  ..., -0.0084,  0.0108, -0.0261],
        [ 0.0056,  0.0137,  0.0458,  ...,  0.0261,  0.0261,  0.0256],
        [ 0.0102,  0.0049, -0.0093,  ...,  0.0271, -0.0221, -0.0020]],
       requires_grad=True)
Dequantized weights
tensor([[-0.0146,  0.0049, -0.0437,  ...,  0.0097, -0.0097, -0.0097],
        [-0.0194, -0.0146, -0.0097,  ..., -0.0194, -0.0049, -0.0291],
        [ 0.0194,  0.0534,  0.0049,  ...,  0.0194,  0.0388,  0.0486],
        ...,
        [ 0.0291,  0.0291, -0.0049,  ..., -0.0097,  0.0097, -0.0243],
        [ 0.0049,  0.0146,  0.0437,  ...,  0.0243,  0.0243,  0.0243],
        [ 0.0097,  0.0049, -0.0097,  ...,  0.0291, -0.0243,  0.0000]])


In [53]:
print("Model size after quantization")
print_model_size(net_quantized)

Model size after quantization
Model size: 92.95 KB


In [54]:
print("Accuracy Aefore Quantization")
test(net_quantized, test_loader)

Accuracy Aefore Quantization


100%|██████████| 1000/1000 [00:03<00:00, 272.72it/s]

Accuracy: 96.14



