In [None]:
# Depp nns usually have many params (millions , billions ..)
# which can require too many space if we are simply storing each param
# in 32bits , (7b -> 28GB)!! and this makes even the inference of the
# model on simple not so powerfull devices not feasible
# computers are (like humns) are slow in computing floating-point ops
# compared to integer ops , 2*2 vs 2.123*1.258

# Quantization aims to reduce the total amount of bits required to
# represent each param , usually by converting floating-point numbers to
# integeres , but not as simply rounding up or down the numbers ,
# this will result in more compressed representation of the model params
# which helps with using /training the model

# Quantization can also speed up computation because using simpler data
# types is faster (earlier example )

# in short :
"""
- less storage space required
- less computation
- less inference time
- less energy consumption
"""


# cpus and gpus use fixed number of bits to represent a piece of (primitive)
# data


In [None]:
2**999

5357543035931336604742125245300009052807024058527668037218751941851755255624680612465991894078479290637973364587765734125935726428461570217992288787349287401967283887412115492710537302531185570938977091076523237491790970633699383779582771973038531457285598238843271083830214915826312193418602834034688

In [None]:
# You suprised that worked ?
# well python can represent big numbers by leveraging BigNum arithmetic
# BigNum is for working with large numbers that dont fit in 32/64 bits
# well python uses array of integers for large enough numbers
# a the number grows it keeps adding chunks
# so for BigNums it doesnt use cpu instruction like add , sub ...
# it implements this manual algorithms
# eg. in addition u add together small chunks nd carry over the overflow
# to the next chunk
# for mult it uses smarter algos :
# Karatsuba algorithm
# FFT-based Multiplication
# The Python interpreter is what calculating those NOT CPU

# remember the IEEE-754 ? lol first year trauma
# anyway it defines floating point numbers representation in 32bits as follows :
# first bit at left is for the sign
# the next 8 bits are the exponent
# the last are the fraction powers (first bit is 2^-1)

# val = (-1)**sign * 2^(E-127) * (1+ SUM[i=1..23, B[23-i]*2^(-i)])
# This is more like the scientific representation we studied in highschool or wtver
# to represent lets say (+/-)xxx.xx u need to write it in the form :
# (-1)^S * x.xxxx * 2^E  ; the bits after the dot are the mantissa bits (23 bs) ,
# S is the sign bit


# The exponent in the IEEE 754 format is stored as an unsigned integer, but we often need
# both positive and negative exponents to represent small and large numbers.
# So, instead of using a signed exponent,
# we use an unsigned exponent and add a bias (a fixed number) to shift the range.


# This means:
# Stored exponent e = actual_exponent + 127
# So, to get the real exponent:actual exponent=𝑒−127

# floating-point numbers scale much more because
# they use the exponent to jump to big or small ranges.


# Simple CPU ADD and MUL Instructions Don't Work on Floating-Point Numbers Directly.
# Instead, they require specialized hardware and instructions.
# Modern CPUs have a Floating Point Unit (FPU) — a dedicated part
# of the processor that knows
# CPUs (like x86) and GPUs use dedicated instructions for floating-point math.


# GPUs also support 16-bit floating point number with less precision

In [None]:
# USUALLY in nn , weights and biases are represented using floating-point numbers
# Quantization tries to use integer numbers to represent these two matrices while
# maintaining the accuracy of the model
# The Goal is to Quantize Input , Weight , Bias into the integer space
# so we perform all operations using integer arithmetic
# then we take the output , Dequantize , and send it to the next layer
# we need the next layer not even realize we quantized the previous
# so we should not CHANGE the model's output using quantization
# we need a mapping between floating points and ints without losing acc , meaning ...

# WELL , as u guessed it , we lose some information ; (info theo bb)
# we are trying to `compress` a huge CONITNUOUS domain , into a
# small DISCRETE domain [-127,127] , (we usually sacrifice the -128 to obtain a
# symetric domain/range)
# what we a re trying to do is keep the same distribution as the floating point
# numbers with respect to their domain (the actuall domain and not the possible domain)
# [MIN(float_weights),MAX(float_weights)] ,
# and we keep an ANCHOR that tells us how we scaled and how to scale back called the
# `zero_point` (idk if this name only holds for asymemtric quantization)
# there are two types of quantization
# asymetric : the zero of one representation not necessarly the zero of the other
# symetric : the zero is same for both when [min == max]

# In asymmetric quantization
# Xq = clamp(floor(xf/s),0,2^n -1) ; s = (alpha-beta)/(2^n -1)

# alpha the biggest float in the tensor , Beta the smallest
# we center using the z parameter
# z = round(-1 * (Beta/s))
# BIGGEST NUMBER MAPPED TO BIGGEST NUMBER
# Smallest number mapped to 0
# 0 mapped to the center point z

# dequantization  : Xf = s(Xq - z)
# we notice some information loss

# In symmetric quantization
# Xq = clamp(round(Xf/s),-(2^(n-1) -1) , (2^(n-1) -1) )
# alpha the biggest value in absolute form
# s = abs(alpha)/(2^(n-1) -1)



In [2]:
import numpy as np

# suppress scientific notation
np.set_printoptions(suppress=True)



# Generate randomly
X = np.random.uniform(low=-50,high=150 , size=20)


# For debugging purposes , lets make sure the important values are at the beggining
X[0] = X.max() + 1
X[1] = X.min() - 1
X[2] = 0


# Only keep two decimal places
X = np.round(X,2)


print(X)



[140.92 -49.87   0.    55.88 -48.87 132.91 -27.78 -22.58 105.08  17.84
  60.82 115.05 139.92  78.98  38.04  46.68  69.47  54.5   -5.47 -22.71]


In [6]:
# simple str8 forward
def clamp(params,lower_bound,upper_bound):
    params[params < lower_bound] = lower_bound
    params[params > upper_bound] = upper_bound
    return params

# Xq = clamp(floor(xf/s),0,2^n -1) ; s = (alpha-beta)/(2^n -1)
# z = round(-1 * (Beta/s))

def asymmetric_quantization(X,num_bits):
    s = (X.max() - X.min() )/ (2**num_bits -1)
    z = np.round(-1*(X.min()/s))
    Xq = clamp(np.round(X/s + z),0 , 2**num_bits -1).astype(np.int32)

    return Xq, s, z

# dequantization  : Xf = s(Xq - z)
def asymmetric_dequantization(Xq,scale,z):
    Xf = scale * (Xq - z)

    return Xf

# Xq = clamp(round(Xf/s),-(2^(n-1) -1) , (2^(n-1) -1) )
# alpha the biggest value in absolute form
# s = abs(alpha)/(2^(n-1) -1)

def symmetric_quantization(X,num_bits):
    alpha = np.max(np.abs(X))
    s = np.abs(alpha) / (2**(num_bits-1)-1)
    lower_bound , upper_bound = -(2**(num_bits-1)-1) , (2**(num_bits-1)-1)
    Xq = clamp(np.round(X/s),lower_bound,upper_bound)


    return Xq , s

# Xf = Xq * scale
def symmetric_dequantiztion(Xq,s):
    return Xq * s



def quantization_error(X,Xq):

    return np.mean((X-Xq)**2)




asym_q , asym_s,asym_z = asymmetric_quantization(X,8)
sym_q , sym_s = symmetric_quantization(X,8)



print(f'original : \n{np.round(X,2)}\n\n')
print(f'Asymmetric scale : {asym_s} , zero : {asym_z} \n{asym_q}\n\n')
print(f'Symmetric scale : {sym_s}\n{sym_q}')

# Notice in Sym , alot of range is unused








original : 
[140.92 -49.87   0.    55.88 -48.87 132.91 -27.78 -22.58 105.08  17.84
  60.82 115.05 139.92  78.98  38.04  46.68  69.47  54.5   -5.47 -22.71]


Asymmetric scale : 0.7481960784313725 , zero : 67.0 
[255   0  67 142   2 245  30  37 207  91 148 221 254 173 118 129 160 140
  60  37]


Symmetric scale : 1.1096062992125983
[127. -45.   0.  50. -44. 120. -25. -20.  95.  16.  55. 104. 126.  71.
  34.  42.  63.  49.  -5. -20.]


In [7]:
asym_deq_q = asymmetric_dequantization(asym_q,asym_s,asym_z)
sym_deq_q = symmetric_dequantiztion(sym_q,sym_s)


print(f'original : \n{np.round(X,2)}\n\n')
print(f'Asymmetric :{asym_deq_q}\n\n')
print(f'Symmetric  : {sym_deq_q}')


original : 
[140.92 -49.87   0.    55.88 -48.87 132.91 -27.78 -22.58 105.08  17.84
  60.82 115.05 139.92  78.98  38.04  46.68  69.47  54.5   -5.47 -22.71]


Asymmetric :[140.66086275 -50.12913725   0.          56.11470588 -48.6327451
 133.17890196 -27.6832549  -22.44588235 104.74745098  17.95670588
  60.60388235 115.22219608 139.91266667  79.30878431  38.158
  46.38815686  69.58223529  54.61831373  -5.23737255 -22.44588235]


Symmetric  : [140.92       -49.93228346   0.          55.48031496 -48.82267717
 133.15275591 -27.74015748 -22.19212598 105.41259843  17.75370079
  61.02834646 115.39905512 139.8103937   78.78204724  37.72661417
  46.60346457  69.90519685  54.37070866  -5.5480315  -22.19212598]


In [8]:
quantization_error(X,asym_deq_q)


np.float64(0.04518108581314796)

In [9]:
quantization_error(X,sym_deq_q)

np.float64(0.06479050251100521)

In [10]:
# In a trained nn , W , B are quantized , the input is quantized on the fly
# but how do we dequantize the Y ?
# well we run inference on the model on few inputs and observe typical outputs
# to calculate scale and zero , this is called `callibration`
# then we can dequantize the output of operations on the other quantized values
# using the params we just learned

# GPUs actually speed up linear layer using Multiply-Accumulate (MAC)
# this op is performed in parallel for every row and column using many MAC blocks
# GEMM library

In [None]:
# Strategies to choosing [Alpha , Beta] (the params of the scale)
# - MinMax as we did up . This is sensitive to outliers, the error big for all but outlier
# - Percentile , we dont rely on the outlier , the error big for the outlier
# - MSE : GRID search to find those two that minimize MSE
# - Cross-Entropy : create a proba distribution , choose alpha and beta such that
# softmax(X) and softmax(Xq) is the same/very close

# in CNN ,for each kernel is better to have alpha and beta , to not throw range



In [15]:

params = np.random.uniform(low=-50,high=150,size=10000)

# outlier    VVVV
params[-1] = 1000

params = np.round(params,2)

def asymmetric_quantization_percentile(X,num_bits,percentile=99.99):
    alpha = np.percentile(X,percentile)
    beta = np.percentile(X,100-percentile)
    s = (alpha-beta) / (2**num_bits -10)
    z = -1*np.round(beta/s)
    lower_bound , upper_bound = 0 , 2**num_bits -1
    quantized = clamp(np.round(X/s +z),lower_bound,upper_bound).astype(np.int32)
    return quantized , s , z

asym_q , asym_s,asym_z = asymmetric_quantization(params,8)
asymp_q , asymp_s,asymp_z = asymmetric_quantization_percentile(params,8)

print(f'original : \n{np.round(params,2)}\n\n')
print(f'Asymmetric scale : {asym_s} , zero : {asym_z} \n{asym_q}\n\n')
print(f'Asymmetric scale : {asymp_s} , zero : {asymp_z} \n{asymp_q}\n\n')


asym_deq_q = asymmetric_dequantization(asym_q,asym_s,asym_z)
asymp_deq_q = asymmetric_dequantization(asymp_q,asymp_s,asymp_z)



print(f'original : \n{np.round(X,2)}\n\n')
print(f'Asymmetric :{asym_deq_q}\n\n')
print(f'Asymmetric P  : {asymp_deq_q}')


original : 
[ 131.24   61.94   -0.56 ...    3.4    -9.99 1000.  ]


Asymmetric scale : 4.117529411764706 , zero : 12.0 
[ 44  27  12 ...  13  10 255]


Asymmetric scale : 0.8131097682902392 , zero : 61.0 
[222 137  60 ...  65  49 255]


original : 
[140.92 -49.87   0.    55.88 -48.87 132.91 -27.78 -22.58 105.08  17.84
  60.82 115.05 139.92  78.98  38.04  46.68  69.47  54.5   -5.47 -22.71]


Asymmetric :[ 131.76094118   61.76294118    0.         ...    4.11752941   -8.23505882
 1000.55964706]


Asymmetric P  : [130.91067269  61.79634239  -0.81310977 ...   3.25243907  -9.75731722
 157.74329505]


In [18]:
quantization_error(params[:-1],asym_deq_q[:-1])


np.float64(1.4216433527615735)

In [20]:
quantization_error(params[:],asymp_deq_q[:])


np.float64(70.99464757401762)

In [19]:
quantization_error(params[:-1],asymp_deq_q[:-1])


np.float64(0.055017372146425755)