# 自动量化

以 PyTorch 前端为例阐述 TVM 自动量化机制。

In [1]:
import numpy as np
import tvm
from tvm.runtime.vm import VirtualMachine
from tvm import relay
from torch import nn
import torch

创建仅含单层卷积（后接 ReLU）的模型：

In [2]:
class Model(nn.Module):
    """Minimal network: a single 3x3 convolution followed by a ReLU."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # 16 -> 16 channels, 3x3 kernel, stride 1, padding 1 (spatial size is preserved).
        self.conv = nn.Conv2d(
            in_channels=16,
            out_channels=16,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the convolution and then the ReLU activation to ``x``."""
        return self.relu(self.conv(x))

TVM 接受 {func}`torch.jit.trace` 后的模型：

In [3]:
# Seed torch so the randomly initialised conv weights and the tracing input are
# identical on every run; this keeps the metrics reported later reproducible.
torch.manual_seed(0)
pt_model = Model().eval().float()
ishape = (1, 16, 4, 4)
# (input name, shape) pairs handed to the Relay frontend.
input_shapes = [("data", ishape)]
idata = torch.rand(ishape)
# Trace the model with an example input; the traced module is what the frontend consumes.
traced_model = torch.jit.trace(pt_model, idata)
# Translate the traced module into a Relay module (`mod`) and its parameters (`params`).
mod, params = relay.frontend.from_pytorch(traced_model, input_shapes, use_parser_friendly_name=True)
print(mod["main"])

fn (%data: Tensor[(1, 16, 4, 4), float32] /* span=aten___convolution_0_data:0:0 */, %aten___convolution_0_weight: Tensor[(16, 16, 3, 3), float32] /* span=aten___convolution_0_weight:0:0 */, %aten___convolution_0_bias: Tensor[(16), float32] /* span=aten___convolution_0_bias:0:0 */) {
  %0 = nn.conv2d(%data, %aten___convolution_0_weight, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* span=aten___convolution_0:0:0 */;
  %1 = nn.bias_add(%0, %aten___convolution_0_bias) /* span=aten___convolution_0:0:0 */;
  nn.relu(%1) /* span=aten__relu_0:0:0 */
}


量化 TVM 模型：

In [4]:
# Quantize the float Relay module inside a quantization-config scope.
# NOTE(review): an empty `skip_conv_layers` presumably means no convolution is
# excluded from quantization (the printed IR shows the conv running in int8) — confirm.
with relay.quantize.qconfig(skip_conv_layers=[]):
    qmod = relay.quantize.quantize(mod, params)
# Show the quantized main function.
print(qmod["main"])

fn (%data: Tensor[(1, 16, 4, 4), float32] /* ty=Tensor[(1, 16, 4, 4), float32] span=aten___convolution_0_data:0:0 */) -> Tensor[(1, 16, 4, 4), float32] {
  %0 = multiply(%data, 16f /* ty=float32 */) /* ty=Tensor[(1, 16, 4, 4), float32] */;
  %1 = round(%0) /* ty=Tensor[(1, 16, 4, 4), float32] */;
  %2 = clip(%1, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 4, 4), float32] */;
  %3 = cast(%2, dtype="int8") /* ty=Tensor[(1, 16, 4, 4), int8] */;
  %4 = nn.conv2d(%3, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), int8] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3], out_dtype="int32") /* ty=Tensor[(1, 16, 4, 4), int32] */;
  %5 = add(%4, 512 /* ty=int32 */) /* ty=Tensor[(1, 16, 4, 4), int32] */;
  %6 = right_shift(%5, 10 /* ty=int32 */) /* ty=Tensor[(1, 16, 4, 4), int32] */;
  %7 = clip(%6, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 4, 4), int32] */;
  %8 = cast(%7, dtype="int8") /* ty=Tensor[(1, 16, 4, 4), int8] */;
  %9 = annotation.stop_fusion(%8) /* ty=Tensor[(1, 1

## 验证结果

In [5]:
# Fixed seed so the float-vs-quantized comparison below is reproducible across runs.
rng = np.random.default_rng(0)
dev = tvm.cpu()
# Reuse `ishape` (the shape the model was traced with) instead of repeating the literal.
data_np = rng.uniform(low=-1, high=1, size=ishape).astype("float32")
# Keyed by the input name declared in `input_shapes`.
input_dict = {"data": data_np}

量化前结果：

In [6]:
# Compile the float Relay module with the VM compiler for the "llvm" target.
with tvm.transform.PassContext(opt_level=3):
    vm_exec = relay.vm.compile(mod, target="llvm", params=params)
# Load the executable on `dev`, bind the inputs of "main", and run it.
vm = VirtualMachine(vm_exec, dev)
vm.set_input("main", **input_dict)
tvm_res = vm.run()

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


量化后结果：

In [7]:
# Compile the quantized Relay module with the same target and optimization level
# as the float one, so the two results are directly comparable.
# NOTE(review): the printed `qmod["main"]` takes only `%data`, so `params` is
# presumably not needed here — confirm before removing.
with tvm.transform.PassContext(opt_level=3):
    qvm_exec = relay.vm.compile(qmod, target="llvm", params=params)
# Load the executable on `dev`, bind the same inputs, and run it.
qvm = VirtualMachine(qvm_exec, dev)
qvm.set_input("main", **input_dict)
tvm_qres = qvm.run()

对比 Torch 结果与 TVM 浮点结果：

In [8]:
# Reference result: run the traced PyTorch model on the same input, without autograd.
torch_input = torch.from_numpy(data_np)
with torch.no_grad():
    torch_res = traced_model(torch_input)

# The float TVM output must match PyTorch within a tight tolerance.
np.testing.assert_allclose(
    actual=tvm_res.numpy(),
    desired=torch_res.numpy(),
    rtol=1e-5,
    atol=1e-5,
)

查看量化前后的余弦相似度与 $L2$ 损失：

In [9]:
from tvm_book.testing.metric import cosine_similarity, l2_loss

In [10]:
# Compare the float (`tvm_res`) and quantized (`tvm_qres`) outputs; the cell
# displays the tuple (cosine similarity, L2 loss).
(
    cosine_similarity(tvm_res.numpy(), tvm_qres.numpy()), 
    l2_loss(tvm_res.numpy(), tvm_qres.numpy())
)

(0.9970607161521912, 0.00048657518345862627)

## 源码解析

可以打印完整的量化流程：

In [11]:
@tvm.instrument.pass_instrument
class PrintIR:
    """Pass instrument that prints the ``main`` function before each pass."""

    def run_before_pass(self, mod, info):
        """Print the pass info, then the ``main`` function of ``mod``."""
        print(f"运行 pass: {info}")
        print(mod["main"])


# Re-run the same quantization with the PrintIR instrument installed in the
# PassContext, so the IR is printed before each pass that runs inside it.
with tvm.transform.PassContext(opt_level=3, instruments=[PrintIR()]):
    with relay.quantize.qconfig(skip_conv_layers=[]):
        qmod = relay.quantize.quantize(mod, params)

运行 pass: The meta data of the pass - pass name: sequential, opt_level: 0, required passes: []

fn (%data: Tensor[(1, 16, 4, 4), float32] /* span=aten___convolution_0_data:0:0 */) {
  %0 = nn.conv2d(%data, meta[relay.Constant][0], padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* span=aten___convolution_0:0:0 */;
  %1 = nn.bias_add(%0, meta[relay.Constant][1]) /* span=aten___convolution_0:0:0 */;
  nn.relu(%1) /* span=aten__relu_0:0:0 */
} /* ty=fn (Tensor[(1, 16, 4, 4), float32]) -> meta[IncompleteType][0] */

运行 pass: The meta data of the pass - pass name: InferType, opt_level: 0, required passes: []

fn (%data: Tensor[(1, 16, 4, 4), float32] /* span=aten___convolution_0_data:0:0 */) {
  %0 = nn.conv2d(%data, meta[relay.Constant][0], padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* span=aten___convolution_0:0:0 */;
  %1 = nn.bias_add(%0, meta[relay.Constant][1]) /* span=aten___convolution_0:0:0 */;
  nn.relu(%1) /* span=aten__relu_0:0:0 */
} /* ty=fn (Tensor[(1, 16, 4

### `tvm.relay.qnn.op.qnn.simulated_quantize`

接口见：`tvm/relay/qnn/op/_qnn.py`：

```python
# Compute registered for the "qnn.simulated_quantize" op.
@register_compute("qnn.simulated_quantize")
def simulated_quantize_compute(attrs, inputs, output_type):
    # Exactly four inputs are expected; they are forwarded positionally to the
    # TOPI implementation, with the axis read from the op attributes.
    assert len(inputs) == 4
    return [
        topi.nn.simulated_quantize(
            inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis")
        )
    ]


# Register the injective schedule and the element-wise pattern for the op.
register_injective_schedule("qnn.simulated_quantize")
register_pattern("qnn.simulated_quantize", OpPattern.ELEMWISE)
```

可以看出真正的实现见：{func}`tvm.topi.nn.simulated_quantize`

In [13]:
tvm.topi.nn.simulated_quantize??

[0;31mSignature:[0m
[0mtvm[0m[0;34m.[0m[0mtopi[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0msimulated_quantize[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout_dtype[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_scale[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_zero_point[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;34m@[0m[0mtvm[0m[0;34m.[0m[0mte[0m[0;34m.[0m[0mtag_scope[0m[0;34m([0m[0mtag[0m[0;34m=[0m[0mtopi[0m[0;34m.[0m[0mtag[0m[0;34m.[0m[0mELEMWISE[0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;32mdef[0m [0msimulated_quantize[0m[0;34m([0m[0mdata[0m[0;34m,[0m [0mout_dtype[0m[0;34m,[0m [0moutput_scale[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0moutput_zero_point[0m[0;34m=[0m[0;

{func}`tvm.te.tag_scope` 是 TVM 提供的标签作用域，既可以作为上下文管理器使用，也可以作为装饰器使用：在其作用域内创建的张量运算都会被打上指定的标签（tag）。后续流程（例如调度）可以根据标签识别算子，这些标签也有助于理解和调试代码。

示例：

In [14]:
from tvm import te
# Symbolic dimensions: A has shape (n, l) and B has shape (m, l).
n = te.var('n')
m = te.var('m')
l = te.var('l')
A = te.placeholder((n, l), name='A')
B = te.placeholder((m, l), name='B')
# Reduction axis running over the shared dimension l.
k = te.reduce_axis((0, l), name='k')

# C[i, j] = sum_k A[i, k] * B[j, k]; the compute is created inside the 'matmul' tag scope.
with tvm.te.tag_scope(tag='matmul'):
    C = te.compute((n, m), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k))

# tag_scope can also be applied directly as a decorator.
@tvm.te.tag_scope(tag="relu")
def compute_relu(data):
    """Element-wise ReLU over ``data``, created under the ``"relu"`` tag.

    Parameters
    ----------
    data : te.Tensor
        Input tensor; the result has the same shape.

    Returns
    -------
    te.Tensor
        Tensor in which every negative element of ``data`` is replaced by zero.
    """
    # Build the zero in the input's dtype so both Select branches have the same
    # type; a bare 0.0 literal only matches float32 inputs.
    zero = tvm.tir.const(0, data.dtype)
    return te.compute(data.shape, lambda *i: tvm.tir.Select(data(*i) < 0, zero, data(*i)))

模拟 QNN 量化算子可以模仿 QNN 的输出而不改变数据类型。与真正的 QNN 量化相比，该算子的优势在于允许动态选择数据类型，并且同时支持逐通道（per-channel）和标量两种形式的 scale 与零点；而真正的 QNN 量化要求数据类型以及 scale/零点的形式在编译时就固定下来。

模拟任意整数数据类型的量化。所有数据类型的计算如下：

$$
Q_{output} = \operatorname{clip}((\operatorname{round}(input_{tensor}/output_{scale}) + output_{zero\_point}),
                out\_dtype_{\min},
                out\_dtype_{\max})
$$

逆过程为 {func}`tvm.relay.qnn.op.qnn.simulated_dequantize`，计算公式为：

$$
DQ_{output} = (input - zero\_point) * scale
$$

In [15]:
tvm.relay.qnn.op.qnn.simulated_dequantize

<function tvm.relay.qnn.op.qnn.simulated_dequantize(data, input_scale, input_zero_point, axis=-1, in_dtype='int8')>

In [16]:
tvm.topi.nn.simulated_dequantize

<function tvm.topi.nn.qnn.simulated_dequantize(data, in_dtype, input_scale=None, input_zero_point=None, axis=-1)>

In [17]:
tvm.relay.qnn.op.qnn.requantize??

[0;31mSignature:[0m
[0mtvm[0m[0;34m.[0m[0mrelay[0m[0;34m.[0m[0mqnn[0m[0;34m.[0m[0mop[0m[0;34m.[0m[0mqnn[0m[0;34m.[0m[0mrequantize[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minput_scale[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minput_zero_point[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_scale[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_zero_point[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrounding[0m[0;34m=[0m[0;34m'None'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcompute_dtype[0m[0;34m=[0m[0;34m'None'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout_dtype[0m[0;34m=[0m[0;34m'int8'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mrequantize[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m,[0m[0;34m[0m


重量化算子将量化张量表示转换为另一个量化张量表示。对于输出张量，提供了输出 scale 和零点。计算如下：

$$
Q_{output} = zp_{output} +  (scale_{input})/(scale_{output}) * (Q_{input} - zp_{input})
$$