In [1]:
#量化是AI软件技术栈的关键部分，用于压缩LLM或其他模型


# L2-B - Linear Quantization I: Get the Scale and Zero Point

In this lesson, you will continue learning the fundamentals of linear quantization and implement your own linear quantizer.

In [23]:
import torch

In [None]:
#同上一节的一样，这个包导入也是不成功的
from helper import linear_q_with_scale_and_zero_point, linear_dequantization, plot_quantization_errors


In [24]:
# A small dummy tensor (mixed positive, negative and zero values) used to
# test the quantization implementation in the cells below.
test_tensor=torch.tensor(
    [[191.6, -13.5, 728.6],
     [92.14, 295.5,  -184],
     [0,     684.6, 245.5]]
)

In [25]:
# Smallest and largest values representable by the target quantized dtype (int8).
q_min = torch.iinfo(torch.int8).min
q_max = torch.iinfo(torch.int8).max



In [26]:
q_min

-128

In [27]:
q_max

127

In [28]:
# min() returns a 0-dim tensor; .item() extracts it as a plain Python number
# so the scale / zero-point arithmetic below works on Python scalars.
r_min = test_tensor.min().item()
r_min

-184.0

In [29]:
r_max = test_tensor.max().item()
r_max 

728.5999755859375

In [30]:
# Scale: width of the real-value range divided by the width of the quantized range.
s = (r_max - r_min)/(q_max - q_min)
s

3.578823433670343

In [31]:
# Zero point: z = q_min - r_min / s, rounded to an integer.
z = int(q_min - round(r_min/s))
z

-77

In [32]:
#但是这种直接按公式求zero point的方法，会有zero point out of range的问题，所以要处理一下，函数如下

![zeropointoutofrange](./img/02-zeroPointOutOfRange.png)

In [49]:
def get_q_scale_and_zero_point(r_tensor, dtype = torch.int8):
    """Compute the linear-quantization scale and zero point for a tensor.

    The parameters satisfy ``r = scale * (q - zero_point)``, mapping the
    tensor's real range ``[r_min, r_max]`` onto the integer range
    ``[q_min, q_max]`` of ``dtype``.

    Args:
        r_tensor: Non-empty tensor of real values to be quantized.
        dtype: Target integer dtype (anything accepted by ``torch.iinfo``).

    Returns:
        A ``(scale, zero_point)`` tuple: ``scale`` is a Python float and
        ``zero_point`` is a Python int inside ``[q_min, q_max]``.
    """
    # q_min / q_max come from the dtype, r_min / r_max from the data.
    q_min, q_max = torch.iinfo(dtype).min, torch.iinfo(dtype).max
    r_min, r_max = r_tensor.min().item(), r_tensor.max().item()

    if r_max == r_min:
        # Constant tensor: the real range has zero width, so the plain formula
        # would produce scale == 0 and then divide by zero. Widen the range so
        # that it also contains 0.0, which keeps the constant representable.
        r_min, r_max = min(r_min, 0.0), max(r_max, 0.0)
        if r_max == r_min:
            # All-zero tensor: any positive scale works; 0.0 maps to q == 0
            # (clamped into the dtype range for safety).
            return 1.0, min(max(0, q_min), q_max)

    # Scale (s) and zero point (z).
    scale = (r_max - r_min)/(q_max - q_min)
    zero_point = q_min - (r_min/scale)

    # Clip the zero point so it falls in [q_min, q_max].
    if zero_point < q_min:
        zero_point = q_min
    elif zero_point > q_max:
        zero_point = q_max
    else:
        # In range: round to the nearest integer and cast to int.
        zero_point = int(round(zero_point))
    return scale, zero_point

In [50]:
new_scale, new_zero_point = get_q_scale_and_zero_point(test_tensor)

In [51]:
new_scale

3.578823433670343

In [52]:
new_zero_point

-77

In [53]:
def linear_q_with_scale_and_zero_point(
    tensor, scale, zero_point, dtype=torch.int8):
    """Quantize ``tensor`` linearly with the given scale and zero point.

    Computes ``round(tensor / scale + zero_point)``, clamps the result to the
    representable range of ``dtype`` and casts it to ``dtype``.

    Args:
        tensor: Tensor of real values to quantize.
        scale: Quantization scale.
        zero_point: Quantization zero point.
        dtype: Target integer dtype (anything accepted by ``torch.iinfo``).

    Returns:
        The quantized tensor with dtype ``dtype``.
    """
    # torch.iinfo exposes the minimum and maximum values of the integer dtype.
    limits = torch.iinfo(dtype)

    # Scale and shift, then round to the nearest integer value.
    q_values = torch.round(tensor / scale + zero_point)

    # Keep the rounded values inside [min, max] before casting with to(),
    # so the result is a valid tensor of the requested quantized dtype.
    return q_values.clamp(limits.min, limits.max).to(dtype)

In [54]:
quantized_tensor = linear_q_with_scale_and_zero_point(test_tensor, new_scale, new_zero_point)

In [61]:
quantized_tensor

tensor([[ -23,  -81,  127],
        [ -51,    6, -128],
        [ -77,  114,   -8]], dtype=torch.int8)

In [62]:
def linear_dequantization(quantized_tensor, scale, zero_point):
    """Map a quantized tensor back to real values.

    Computes ``scale * (q - zero_point)`` after converting ``quantized_tensor``
    to float32.

    Args:
        quantized_tensor: Tensor of quantized values.
        scale: Scale used during quantization.
        zero_point: Zero point used during quantization.

    Returns:
        The dequantized floating-point tensor.
    """
    # Convert to float first so the subtraction is not done in the integer dtype.
    shifted = quantized_tensor.float() - zero_point
    return scale * shifted

In [66]:
dequantized_tensor = linear_dequantization(quantized_tensor, new_scale, new_zero_point)

In [67]:
dequantized_tensor

tensor([[ 193.2565,  -14.3153,  730.0800],
        [  93.0494,  297.0423, -182.5200],
        [   0.0000,  683.5552,  246.9388]])

In [68]:
#同样，这个函数没有实现，展示视频中执行的结果
plot_quantization_errors(test_tensor, quantized_tensor, dequantized_tensor)

NameError: name 'plot_quantization_errors' is not defined

![dequantized.png](./img/02-dequantizedTensor.png)

In [69]:
(dequantized_tensor-test_tensor).square().mean()

tensor(1.5730)

In [70]:
def linear_quantization(tensor, dtype = torch.int8):
    """Quantize ``tensor`` end to end.

    Derives the scale and zero point from the tensor with
    ``get_q_scale_and_zero_point`` and then applies
    ``linear_q_with_scale_and_zero_point`` using those parameters.

    Args:
        tensor: Tensor of real values to quantize.
        dtype: Target integer dtype.

    Returns:
        A ``(quantized_tensor, scale, zero_point)`` tuple.
    """
    q_params = get_q_scale_and_zero_point(tensor, dtype=dtype)
    # q_params is (scale, zero_point), matching the positional order expected here.
    q_result = linear_q_with_scale_and_zero_point(tensor, *q_params, dtype=dtype)
    return (q_result, *q_params)

In [71]:
r_tensor = torch.randn((4, 4))

In [72]:
r_tensor

tensor([[ 2.2076, -1.7764, -0.9115, -0.0222],
        [ 0.2167, -1.7484, -1.4990,  0.7100],
        [ 0.8893,  0.9245, -0.8773,  0.2604],
        [-1.0853,  0.8478, -0.3211, -0.5872]])

In [73]:
quantized_tensor, scale, zero_point = linear_quantization(r_tensor)

In [74]:
quantized_tensor

tensor([[ 127, -128,  -72,  -15],
        [   0, -126, -110,   31],
        [  43,   45,  -70,    3],
        [ -83,   40,  -35,  -52]], dtype=torch.int8)

In [75]:
scale

0.01562342737235275

In [76]:
zero_point

-14

In [80]:
dequantized_tensor = linear_dequantization(quantized_tensor, scale, zero_point)

In [None]:
#每次随机生成，图示不同，这个图只是视频中的测试值演示
plot_quantization_errors(r_tensor, quantized_tensor, dequantized_tensor)

![02-dequantizedTensor2.png](./img/02-dequantizedTensor2.png)

In [78]:
dequantized_tensor

tensor([[ 2.2029, -1.7811, -0.9062, -0.0156],
        [ 0.2187, -1.7498, -1.4998,  0.7031],
        [ 0.8905,  0.9218, -0.8749,  0.2656],
        [-1.0780,  0.8437, -0.3281, -0.5937]])

In [79]:
(dequantized_tensor-r_tensor).square().mean()

tensor(2.3408e-05)