In [1]:
# Depp nns usually have many params (millions , billions ..)
# which can require too many space if we are simply storing each param
# in 32bits , (7b -> 28GB)!! and this makes even the inference of the
# model on simple not so powerfull devices not feasible
# computers are (like humns) are slow in computing floating-point ops
# compared to integer ops , 2*2 vs 2.123*1.258

# Quantization aims to reduce the total amount of bits required to
# represent each param , usually by converting floating-point numbers to
# integeres , but not as simply rounding up or down the numbers ,
# this will result in more compressed representation of the model params
# which helps with using /training the model

# Quantization can also speed up computation because using simpler data
# types is faster (earlier example )

# in short :
"""
- less storage space required
- less computation
- less inference time
- less energy consumption
"""


# cpus and gpus use fixed number of bits to represent a piece of (primitive)
# data


'\n- less storage space required\n- less computation\n- less inference time\n- less energy consumption\n'

In [2]:
2**999

5357543035931336604742125245300009052807024058527668037218751941851755255624680612465991894078479290637973364587765734125935726428461570217992288787349287401967283887412115492710537302531185570938977091076523237491790970633699383779582771973038531457285598238843271083830214915826312193418602834034688

In [3]:
# You suprised that worked ?
# well python can represent big numbers by leveraging BigNum arithmetic
# BigNum is for working with large numbers that dont fit in 32/64 bits
# well python uses array of integers for large enough numbers
# a the number grows it keeps adding chunks
# so for BigNums it doesnt use cpu instruction like add , sub ...
# it implements this manual algorithms
# eg. in addition u add together small chunks nd carry over the overflow
# to the next chunk
# for mult it uses smarter algos :
# Karatsuba algorithm
# FFT-based Multiplication
# The Python interpreter is what calculating those NOT CPU

# remember the IEEE-754 ? lol first year trauma
# anyway it defines floating point numbers representation in 32bits as follows :
# first bit at left is for the sign
# the next 8 bits are the exponent
# the last are the fraction powers (first bit is 2^-1)

# val = (-1)**sign * 2^(E-127) * (1+ SUM[i=1..23, B[23-i]*2^(-i)])
# This is more like the scientific representation we studied in highschool or wtver
# to represent lets say (+/-)xxx.xx u need to write it in the form :
# (-1)^S * x.xxxx * 2^E  ; the bits after the dot are the mantissa bits (23 bs) ,
# S is the sign bit


# The exponent in the IEEE 754 format is stored as an unsigned integer, but we often need
# both positive and negative exponents to represent small and large numbers.
# So, instead of using a signed exponent,
# we use an unsigned exponent and add a bias (a fixed number) to shift the range.


# This means:
# Stored exponent e = actual_exponent + 127
# So, to get the real exponent:actual exponent=𝑒−127

# floating-point numbers scale much more because
# they use the exponent to jump to big or small ranges.


# Simple CPU ADD and MUL Instructions Don't Work on Floating-Point Numbers Directly.
# Instead, they require specialized hardware and instructions.
# Modern CPUs have a Floating Point Unit (FPU) — a dedicated part
# of the processor that knows
# CPUs (like x86) and GPUs use dedicated instructions for floating-point math.


# GPUs also support 16-bit floating point number with less precision

In [4]:
# USUALLY in nn , weights and biases are represented using floating-point numbers
# Quantization tries to use integer numbers to represent these two matrices while
# maintaining the accuracy of the model
# The Goal is to Quantize Input , Weight , Bias into the integer space
# so we perform all operations using integer arithmetic
# then we take the output , Dequantize , and send it to the next layer
# we need the next layer not even realize we quantized the previous
# so we should not CHANGE the model's output using quantization
# we need a mapping between floating points and ints without losing acc , meaning ...

# WELL , as u guessed it , we lose some information ; (info theo bb)
# we are trying to `compress` a huge CONITNUOUS domain , into a
# small DISCRETE domain [-127,127] , (we usually sacrifice the -128 to obtain a
# symetric domain/range)
# what we a re trying to do is keep the same distribution as the floating point
# numbers with respect to their domain (the actuall domain and not the possible domain)
# [MIN(float_weights),MAX(float_weights)] ,
# and we keep an ANCHOR that tells us how we scaled and how to scale back called the
# `zero_point` (idk if this name only holds for asymemtric quantization)
# there are two types of quantization
# asymetric : the zero of one representation not necessarly the zero of the other
# symetric : the zero is same for both when [min == max]

# In asymmetric quantization
# Xq = clamp(floor(xf/s),0,2^n -1) ; s = (alpha-beta)/(2^n -1)

# alpha the biggest float in the tensor , Beta the smallest
# we center using the z parameter
# z = round(-1 * (Beta/s))
# BIGGEST NUMBER MAPPED TO BIGGEST NUMBER
# Smallest number mapped to 0
# 0 mapped to the center point z

# dequantization  : Xf = s(Xq - z)
# we notice some information loss

# In symmetric quantization
# Xq = clamp(round(Xf/s),-(2^(n-1) -1) , (2^(n-1) -1) )
# alpha the biggest value in absolute form
# s = abs(alpha)/(2^(n-1) -1)



In [5]:
import numpy as np

# suppress scientific notation
np.set_printoptions(suppress=True)



# Generate randomly
X = np.random.uniform(low=-50,high=150 , size=20)


# For debugging purposes , lets make sure the important values are at the beggining
X[0] = X.max() + 1
X[1] = X.min() - 1
X[2] = 0


# Only keep two decimal places
X = np.round(X,2)


print(X)



[150.62 -50.23   0.   -19.75 149.62 -49.23  63.98 -22.26 -48.74 -32.55
  43.99  45.24  52.48 143.28 143.56  26.49  46.81 -17.5  -16.16 133.69]


In [6]:
# simple str8 forward
def clamp(params,lower_bound,upper_bound):
    params[params < lower_bound] = lower_bound
    params[params > upper_bound] = upper_bound
    return params

# Xq = clamp(floor(xf/s),0,2^n -1) ; s = (alpha-beta)/(2^n -1)
# z = round(-1 * (Beta/s))

def asymmetric_quantization(X,num_bits):
    s = (X.max() - X.min() )/ (2**num_bits -1)
    z = np.round(-1*(X.min()/s))
    Xq = clamp(np.round(X/s + z),0 , 2**num_bits -1).astype(np.int32)

    return Xq, s, z

# dequantization  : Xf = s(Xq - z)
def asymmetric_dequantization(Xq,scale,z):
    Xf = scale * (Xq - z)

    return Xf

# Xq = clamp(round(Xf/s),-(2^(n-1) -1) , (2^(n-1) -1) )
# alpha the biggest value in absolute form
# s = abs(alpha)/(2^(n-1) -1)

def symmetric_quantization(X,num_bits):
    alpha = np.max(np.abs(X))
    s = np.abs(alpha) / (2**(num_bits-1)-1)
    lower_bound , upper_bound = -(2**(num_bits-1)-1) , (2**(num_bits-1)-1)
    Xq = clamp(np.round(X/s),lower_bound,upper_bound)


    return Xq , s

# Xf = Xq * scale
def symmetric_dequantiztion(Xq,s):
    return Xq * s



def quantization_error(X,Xq):

    return np.mean((X-Xq)**2)




asym_q , asym_s,asym_z = asymmetric_quantization(X,8)
sym_q , sym_s = symmetric_quantization(X,8)



print(f'original : \n{np.round(X,2)}\n\n')
print(f'Asymmetric scale : {asym_s} , zero : {asym_z} \n{asym_q}\n\n')
print(f'Symmetric scale : {sym_s}\n{sym_q}')

# Notice in Sym , alot of range is unused








original : 
[150.62 -50.23   0.   -19.75 149.62 -49.23  63.98 -22.26 -48.74 -32.55
  43.99  45.24  52.48 143.28 143.56  26.49  46.81 -17.5  -16.16 133.69]


Asymmetric scale : 0.7876470588235294 , zero : 64.0 
[255   0  64  39 254   1 145  36   2  23 120 121 131 246 246  98 123  42
  43 234]


Symmetric scale : 1.185984251968504
[127. -42.   0. -17. 126. -42.  54. -19. -41. -27.  37.  38.  44. 121.
 121.  22.  39. -15. -14. 113.]


In [7]:
asym_deq_q = asymmetric_dequantization(asym_q,asym_s,asym_z)
sym_deq_q = symmetric_dequantiztion(sym_q,sym_s)


print(f'original : \n{np.round(X,2)}\n\n')
print(f'Asymmetric :{asym_deq_q}\n\n')
print(f'Symmetric  : {sym_deq_q}')


original : 
[150.62 -50.23   0.   -19.75 149.62 -49.23  63.98 -22.26 -48.74 -32.55
  43.99  45.24  52.48 143.28 143.56  26.49  46.81 -17.5  -16.16 133.69]


Asymmetric :[150.44058824 -50.40941176   0.         -19.69117647 149.65294118
 -49.62176471  63.79941176 -22.05411765 -48.83411765 -32.29352941
  44.10823529  44.89588235  52.77235294 143.35176471 143.35176471
  26.78        46.47117647 -17.32823529 -16.54058824 133.9       ]


Symmetric  : [150.62       -49.81133858   0.         -20.16173228 149.43401575
 -49.81133858  64.04314961 -22.53370079 -48.62535433 -32.0215748
  43.88141732  45.06740157  52.18330709 143.50409449 143.50409449
  26.09165354  46.25338583 -17.78976378 -16.60377953 134.01622047]


In [8]:
quantization_error(X,asym_deq_q)


np.float64(0.05278839100346078)

In [9]:
quantization_error(X,sym_deq_q)

np.float64(0.1063697786595571)

In [10]:
# In a trained nn , W , B are quantized , the input is quantized on the fly
# but how do we dequantize the Y ?
# well we run inference on the model on few inputs and observe typical outputs
# to calculate scale and zero , this is called `callibration`
# then we can dequantize the output of operations on the other quantized values
# using the params we just learned

# GPUs actually speed up linear layer using Multiply-Accumulate (MAC)
# this op is performed in parallel for every row and column using many MAC blocks
# GEMM library

In [11]:
# Strategies to choosing [Alpha , Beta] (the params of the scale)
# - MinMax as we did up . This is sensitive to outliers, the error big for all but outlier
# - Percentile , we dont rely on the outlier , the error big for the outlier
# - MSE : GRID search to find those two that minimize MSE
# - Cross-Entropy : create a proba distribution , choose alpha and beta such that
# softmax(X) and softmax(Xq) is the same/very close

# in CNN ,for each kernel is better to have alpha and beta , to not throw range



In [12]:

params = np.random.uniform(low=-50,high=150,size=10000)

# outlier    VVVV
params[-1] = 1000

params = np.round(params,2)

def asymmetric_quantization_percentile(X,num_bits,percentile=99.99):
    alpha = np.percentile(X,percentile)
    beta = np.percentile(X,100-percentile)
    s = (alpha-beta) / (2**num_bits -10)
    z = -1*np.round(beta/s)
    lower_bound , upper_bound = 0 , 2**num_bits -1
    quantized = clamp(np.round(X/s +z),lower_bound,upper_bound).astype(np.int32)
    return quantized , s , z

asym_q , asym_s,asym_z = asymmetric_quantization(params,8)
asymp_q , asymp_s,asymp_z = asymmetric_quantization_percentile(params,8)

print(f'original : \n{np.round(params,2)}\n\n')
print(f'Asymmetric scale : {asym_s} , zero : {asym_z} \n{asym_q}\n\n')
print(f'Asymmetric scale : {asymp_s} , zero : {asymp_z} \n{asymp_q}\n\n')


asym_deq_q = asymmetric_dequantization(asym_q,asym_s,asym_z)
asymp_deq_q = asymmetric_dequantization(asymp_q,asymp_s,asymp_z)



print(f'original : \n{np.round(X,2)}\n\n')
print(f'Asymmetric :{asym_deq_q}\n\n')
print(f'Asymmetric P  : {asymp_deq_q}')


original : 
[ -40.45  130.33   69.88 ...  114.5   137.85 1000.  ]


Asymmetric scale : 4.117529411764706 , zero : 12.0 
[  2  44  29 ...  40  45 255]


Asymmetric scale : 0.8130691219487759 , zero : 61.0 
[ 11 221 147 ... 202 231 255]


original : 
[150.62 -50.23   0.   -19.75 149.62 -49.23  63.98 -22.26 -48.74 -32.55
  43.99  45.24  52.48 143.28 143.56  26.49  46.81 -17.5  -16.16 133.69]


Asymmetric :[ -41.17529412  131.76094118   69.998      ...  115.29082353  135.87847059
 1000.55964706]


Asymmetric P  : [-40.6534561  130.09105951  69.92394449 ... 114.64274619 138.22175073
 157.73540966]


In [13]:
quantization_error(params[:-1],asym_deq_q[:-1])


np.float64(1.406541621949046)

In [14]:
quantization_error(params[:],asymp_deq_q[:])


np.float64(70.99588247300174)

In [15]:
quantization_error(params[:-1],asymp_deq_q[:-1])


np.float64(0.05492395100969596)

In [16]:
# Quantization after training : Post Training Quantization

"""
we need the pretrained model , and some `unlabeled` data .
we take the model attach observers , this observers during inference on that data
they will calculate statistics on the data such as s and z params  calibrate
then we quantize the model

"""




'\nwe need the pretrained model , and some `unlabeled` data .\nwe take the model attach observers , this observers during inference on that data\nthey will calculate statistics on the data such as s and z params  calibrate\nthen we quantize the model\n\n'

In [41]:
import torch
import torch.nn as nn
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from tqdm import tqdm
from pathlib import Path
import os

torch.manual_seed(0)

<torch._C.Generator at 0x7b2320491d30>

In [45]:
transform = transforms.Compose([
        transforms.ToTensor(),transforms.Normalize((0.1307,),(0.3081,))
])

mnist_training = datasets.MNIST(root='./data',train=True,download=True,transform=transform)

train_loader = DataLoader(mnist_training,batch_size=8,shuffle=True)


mnist_test = datasets.MNIST(root='./data',train=False,download=True,transform=transform)

test_loader = DataLoader(mnist_test,batch_size=8,shuffle=True)

device= 'cuda' if torch.cuda.is_available() else 'cpu'


In [25]:
class Net(nn.Module):
    def __init__(self,hidden_size1=100,hidden_size2=100):
        super().__init__()

        self.linear1 = nn.Linear(28*28,hidden_size1)
        self.linear2 = nn.Linear(hidden_size1,hidden_size2)
        self.linear3 = nn.Linear(hidden_size2,10)
        self.relu    = nn.ReLU()


    def forward(self,x):
        x = x.view(-1,28*28)
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        return x


In [26]:
net = Net().to(device)

In [31]:
def train(model , train_loader,epochs=5):
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

    for epoch in range(epochs):
        model.train()
        loss_sum = 0
        data_iterator = tqdm(train_loader,desc=f'Epocj : {epoch+1}/{epochs}')
        batch_number = 0
        for data in data_iterator :
            batch_number +=1
            x,y = data
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            output = model(x.view(-1,28*28))
            batch_loss = loss_fn(output,y)
            loss_sum += batch_loss
            avg_loss = loss_sum/batch_number
            data_iterator.set_postfix(loss=avg_loss)

            batch_loss.backward()
            optimizer.step()



def _print_size_of_model(model):
    torch.save(model.state_dict(),"temp_model.p")
    print(f'Size (KB) :{os.path.getsize("temp_model.p")/1_000}')
    os.remove('temp_model.p')






In [32]:
train(net,train_loader)

Epocj : 1/5: 100%|██████████| 7500/7500 [01:16<00:00, 97.77it/s, loss=tensor(0.2242, grad_fn=<DivBackward0>)]
Epocj : 2/5: 100%|██████████| 7500/7500 [01:15<00:00, 99.80it/s, loss=tensor(0.1166, grad_fn=<DivBackward0>)]
Epocj : 3/5: 100%|██████████| 7500/7500 [01:13<00:00, 101.43it/s, loss=tensor(0.0904, grad_fn=<DivBackward0>)]
Epocj : 4/5: 100%|██████████| 7500/7500 [01:00<00:00, 123.43it/s, loss=tensor(0.0771, grad_fn=<DivBackward0>)]
Epocj : 5/5: 100%|██████████| 7500/7500 [00:59<00:00, 126.60it/s, loss=tensor(0.0678, grad_fn=<DivBackward0>)]


In [46]:
def test(model,test_loader):
    correct = 0
    total = 0

    model.eval()

    with torch.no_grad():
        for data in tqdm(test_loader,desc='Testing..'):
            x,y = data
            x = x.to(device)
            y = y.to(device)
            output = model(x.view(-1,28*28))

            for idx , item in enumerate(output):
                # since the index of the label is the value of the label
                if torch.argmax(item) == y[idx]:
                    correct += 1
                total+=1

    print(f'Accuracy : {round(correct/total,3)}')

In [35]:
print('Weights Before Quantization')
print(net.linear1.weight)
print(net.linear1.weight.dtype)

Weights Before Quantization
Parameter containing:
tensor([[ 0.0078,  0.0272, -0.0214,  ...,  0.0300,  0.0118,  0.0101],
        [ 0.0582,  0.0630,  0.0676,  ...,  0.0577,  0.0720,  0.0481],
        [ 0.0008,  0.0359, -0.0123,  ...,  0.0006,  0.0222,  0.0290],
        ...,
        [ 0.0500,  0.0538,  0.0191,  ...,  0.0138,  0.0330, -0.0039],
        [ 0.0084,  0.0165,  0.0485,  ...,  0.0288,  0.0289,  0.0284],
        [ 0.0063,  0.0010, -0.0132,  ...,  0.0232, -0.0261, -0.0059]],
       requires_grad=True)
torch.float32


In [36]:
print('Size of the Model before Quant')
_print_size_of_model(net)

Size of the Model before Quant
Size (KB) :360.998


In [47]:
print(f'Before Quantization')
test(net,test_loader)

Before Quantization


Testing..: 100%|██████████| 1250/1250 [00:03<00:00, 359.96it/s]

Accuracy : 0.968





In [50]:
# we insert min-max observers in the model

# QuantStub(): Marks the point where float inputs should be quantized into int8.
# DeQuantStub(): Marks the point where int8 outputs should be dequantized back into float32.
# They're placeholders for inserting quantization logic during calibration or conversion.


class QuantizedNet(nn.Module):
    def __init__(self,hidden_size1=100,hidden_size2=100):
        super().__init__()
        # This is just to define in forward where quant starts
        self.quant = torch.quantization.QuantStub()
        self.linear1 = nn.Linear(28*28,hidden_size1)
        self.linear2 = nn.Linear(hidden_size1,hidden_size2)
        self.linear3 = nn.Linear(hidden_size2,10)

        self.relu    = nn.ReLU()
        # This is just to define in forward where quant ends
        self.dequant = torch.quantization.DeQuantStub()


    def forward(self,x):
        x = x.view(-1,28*28)
        x = self.quant(x)  # float → int8
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        x = self.dequant(x) # int8 → float

        return x



In [53]:
quantized_net = QuantizedNet().to(device)
# we copy the floating point params to this template
quantized_net.load_state_dict(net.state_dict())
# we stop some training techniques because we want the observers to measure
# actual inference behaviour
quantized_net.eval()

# configuration like which observer to use , symmetric or asymetric
# default min-max , weights symmetric , activations asymmetric , fbgemm for cpu backend
quantized_net.qconfig = torch.ao.quantization.default_qconfig

# this insert the observers module into the model
# sensors to watch activations  , weights , biases ... during calibration
quantized_net = torch.ao.quantization.prepare(quantized_net)

# at this point the model still runs in float32
# But every QuantStub, Linear, and ReLU ... is
# now wrapped with extra logic to track tensor ranges.
quantized_net




QuantizedNet(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear1): Linear(
    in_features=784, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear2): Linear(
    in_features=100, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear3): Linear(
    in_features=100, out_features=10, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()
)

In [None]:
# QuantStub and DeQuantStub are entry/exit markers for quantization in your model’s
#  flow.

# Observer modules are behind-the-scenes tools inserted by PyTorch to track tensor ranges
# and compute the quantization parameters (scale + zero point).

# You write the stubs; PyTorch inserts the observers for you during prepare().


# QuantStub	Like a camera lens — it lets you switch from one format (float) to
# another (int), but doesn’t decide how.
# Observer	Like a light meter — it looks at the scene (tensor) and tells
#  the camera what exposure settings (scale/zero_point) to use.

In [54]:
# we calibrate the model now
# we can just run inference with test
test(quantized_net,test_loader)

Testing..: 100%|██████████| 1250/1250 [00:04<00:00, 255.80it/s]

Accuracy : 0.968





In [55]:
# the observers have collected some statistics
quantized_net

QuantizedNet(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=-0.4242129623889923, max_val=2.821486711502075)
  )
  (linear1): Linear(
    in_features=784, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=-91.94515991210938, max_val=54.72724151611328)
  )
  (linear2): Linear(
    in_features=100, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=-79.45330810546875, max_val=50.38602828979492)
  )
  (linear3): Linear(
    in_features=100, out_features=10, bias=True
    (activation_post_process): MinMaxObserver(min_val=-86.16403198242188, max_val=30.331398010253906)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()
)

In [56]:
# until now we were using the float32 model and some observers to
# collect the necessary statistics to actually get  a quantized model

quantized_net = torch.ao.quantization.convert(quantized_net)

In [57]:
quantized_net

QuantizedNet(
  (quant): Quantize(scale=tensor([0.0256]), zero_point=tensor([17]), dtype=torch.quint8)
  (linear1): QuantizedLinear(in_features=784, out_features=100, scale=1.1549007892608643, zero_point=80, qscheme=torch.per_tensor_affine)
  (linear2): QuantizedLinear(in_features=100, out_features=100, scale=1.0223569869995117, zero_point=78, qscheme=torch.per_tensor_affine)
  (linear3): QuantizedLinear(in_features=100, out_features=10, scale=0.9172868728637695, zero_point=94, qscheme=torch.per_tensor_affine)
  (relu): ReLU()
  (dequant): DeQuantize()
)

In [64]:
print('Weights Before Quantization')
print(torch.int_repr(quantized_net.linear1.weight()))
print()
print()
print(net.linear1.weight)
print()
print()
print(torch.dequantize(quantized_net.linear1.weight()))

Weights Before Quantization
tensor([[ 1,  2, -2,  ...,  3,  1,  1],
        [ 5,  6,  6,  ...,  5,  6,  4],
        [ 0,  3, -1,  ...,  0,  2,  3],
        ...,
        [ 4,  5,  2,  ...,  1,  3,  0],
        [ 1,  1,  4,  ...,  3,  3,  3],
        [ 1,  0, -1,  ...,  2, -2, -1]], dtype=torch.int8)


Parameter containing:
tensor([[ 0.0078,  0.0272, -0.0214,  ...,  0.0300,  0.0118,  0.0101],
        [ 0.0582,  0.0630,  0.0676,  ...,  0.0577,  0.0720,  0.0481],
        [ 0.0008,  0.0359, -0.0123,  ...,  0.0006,  0.0222,  0.0290],
        ...,
        [ 0.0500,  0.0538,  0.0191,  ...,  0.0138,  0.0330, -0.0039],
        [ 0.0084,  0.0165,  0.0485,  ...,  0.0288,  0.0289,  0.0284],
        [ 0.0063,  0.0010, -0.0132,  ...,  0.0232, -0.0261, -0.0059]],
       requires_grad=True)


tensor([[ 0.0111,  0.0223, -0.0223,  ...,  0.0334,  0.0111,  0.0111],
        [ 0.0556,  0.0668,  0.0668,  ...,  0.0556,  0.0668,  0.0445],
        [ 0.0000,  0.0334, -0.0111,  ...,  0.0000,  0.0223,  0.0334],
   

In [65]:
_print_size_of_model(quantized_net)
# original size / 4 and some overhead

Size (KB) :95.394


In [68]:
test(quantized_net,test_loader)
# Dont judge on this small simple model

Testing..: 100%|██████████| 1250/1250 [00:04<00:00, 304.15it/s]

Accuracy : 0.968





In [None]:
# UNTIL NOW WE WERE USING PTQ,
# now we try quantization aware training

In [69]:
# we inert fake modules in the computtionl graph to simulate the effect of
# the quantization
# we insert -between each layer some special fake ops quantize and
#  dequantize operations- we are not relly quantizing , but we do it on the fly
# this will introduce quantization error , and hopefully the backward pass of the
# loss will be aware of quantization
#

In [70]:
T_A_quantized_net = QuantizedNet().to(device)

In [73]:
T_A_quantized_net.qconfig = torch.ao.quantization.default_qconfig

T_A_quantized_net.train()

# the previous prepare works on the premise the net is stable
# this works on the premise that the net is undergoing training

T_A_quantized_net = torch.ao.quantization.prepare_qat(T_A_quantized_net)
T_A_quantized_net
# model not trained , we already inserted observers which are not calibrated

QuantizedNet(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear1): Linear(
    in_features=784, out_features=100, bias=True
    (weight_fake_quant): MinMaxObserver(min_val=inf, max_val=-inf)
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear2): Linear(
    in_features=100, out_features=100, bias=True
    (weight_fake_quant): MinMaxObserver(min_val=inf, max_val=-inf)
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear3): Linear(
    in_features=100, out_features=10, bias=True
    (weight_fake_quant): MinMaxObserver(min_val=inf, max_val=-inf)
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()
)

In [74]:
train(T_A_quantized_net,train_loader)

Epocj : 1/5: 100%|██████████| 7500/7500 [01:12<00:00, 102.94it/s, loss=tensor(0.2193, grad_fn=<DivBackward0>)]
Epocj : 2/5: 100%|██████████| 7500/7500 [01:20<00:00, 93.32it/s, loss=tensor(0.1153, grad_fn=<DivBackward0>)] 
Epocj : 3/5: 100%|██████████| 7500/7500 [01:12<00:00, 103.97it/s, loss=tensor(0.0890, grad_fn=<DivBackward0>)]
Epocj : 4/5: 100%|██████████| 7500/7500 [01:12<00:00, 103.07it/s, loss=tensor(0.0789, grad_fn=<DivBackward0>)]
Epocj : 5/5: 100%|██████████| 7500/7500 [01:16<00:00, 97.42it/s, loss=tensor(0.0660, grad_fn=<DivBackward0>)]


In [75]:
T_A_quantized_net

QuantizedNet(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=-0.4242129623889923, max_val=2.821486711502075)
  )
  (linear1): Linear(
    in_features=784, out_features=100, bias=True
    (weight_fake_quant): MinMaxObserver(min_val=-1.502736210823059, max_val=0.7772015929222107)
    (activation_post_process): MinMaxObserver(min_val=-86.461669921875, max_val=60.17536926269531)
  )
  (linear2): Linear(
    in_features=100, out_features=100, bias=True
    (weight_fake_quant): MinMaxObserver(min_val=-0.8394451141357422, max_val=0.5752117037773132)
    (activation_post_process): MinMaxObserver(min_val=-76.77043914794922, max_val=52.59672927856445)
  )
  (linear3): Linear(
    in_features=100, out_features=10, bias=True
    (weight_fake_quant): MinMaxObserver(min_val=-0.8849999308586121, max_val=0.27469488978385925)
    (activation_post_process): MinMaxObserver(min_val=-105.8464584350586, max_val=38.001827239990234)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()


In [76]:
# in QAT we have weight fake quant , observers that collected during training
T_A_quantized_net.eval()

# now we can quantize the model with stats collected during training
T_A_quantized_net = torch.ao.quantization.convert(T_A_quantized_net)

In [77]:
T_A_quantized_net

QuantizedNet(
  (quant): Quantize(scale=tensor([0.0256]), zero_point=tensor([17]), dtype=torch.quint8)
  (linear1): QuantizedLinear(in_features=784, out_features=100, scale=1.1546223163604736, zero_point=75, qscheme=torch.per_tensor_affine)
  (linear2): QuantizedLinear(in_features=100, out_features=100, scale=1.0186392068862915, zero_point=75, qscheme=torch.per_tensor_affine)
  (linear3): QuantizedLinear(in_features=100, out_features=10, scale=1.132663607597351, zero_point=93, qscheme=torch.per_tensor_affine)
  (relu): ReLU()
  (dequant): DeQuantize()
)

In [80]:
print('Weights Before Quantization')
print(torch.int_repr(T_A_quantized_net.linear1.weight()))
print()
print()
print(T_A_quantized_net.linear1.weight())
print()
print()
print(torch.dequantize(T_A_quantized_net.linear1.weight()))

Weights Before Quantization
tensor([[ 2,  4,  2,  ...,  6,  6,  7],
        [-2,  3,  0,  ..., -2,  2, -1],
        [ 4, -1,  2,  ..., -1,  4,  0],
        ...,
        [ 0,  1,  4,  ...,  1,  3,  5],
        [-1,  0,  0,  ...,  3,  3,  3],
        [ 6,  2,  2,  ...,  3,  4,  5]], dtype=torch.int8)


tensor([[ 0.0236,  0.0471,  0.0236,  ...,  0.0707,  0.0707,  0.0825],
        [-0.0236,  0.0354,  0.0000,  ..., -0.0236,  0.0236, -0.0118],
        [ 0.0471, -0.0118,  0.0236,  ..., -0.0118,  0.0471,  0.0000],
        ...,
        [ 0.0000,  0.0118,  0.0471,  ...,  0.0118,  0.0354,  0.0589],
        [-0.0118,  0.0000,  0.0000,  ...,  0.0354,  0.0354,  0.0354],
        [ 0.0707,  0.0236,  0.0236,  ...,  0.0354,  0.0471,  0.0589]],
       size=(100, 784), dtype=torch.qint8,
       quantization_scheme=torch.per_tensor_affine, scale=0.011786166578531265,
       zero_point=0)


tensor([[ 0.0236,  0.0471,  0.0236,  ...,  0.0707,  0.0707,  0.0825],
        [-0.0236,  0.0354,  0.0000,  ..., -0.023

In [79]:
test(T_A_quantized_net,test_loader)

Testing..: 100%|██████████| 1250/1250 [00:05<00:00, 241.95it/s]

Accuracy : 0.971





In [None]:
# Typically QAT performs better than post training quantization
# Operation of quantization is not differentiable so how do we go back ?
# we usually approximate the gradient
# for all the values between alpha and beta get gradient of 1
# else gradient 0


# the effect of QAT on loss function optimization
# when we train model with not quantization , we try to find the minimize the loss
# in quantization aware training we also try the minimize the loss , but we want
# the local minimum to be more wide
#  in post training quantization , the effects of quant dequant can be too big
# maybe the curvature (second derivative is high in that spot)
# in QAT we want the change of quant and dequant to be small ,
# so we find a minimum that has small curvature (small second deriv (my thought))

# basically we try to account for the quant dequant error
