In [1]:
import ppq.lib as PFL
from ppq.api import ENABLE_CUDA_KERNEL, load_onnx_graph
from ppq.core import TargetPlatform
from ppq.executor import TorchExecutor
from ppq.quantization.optim import (LayerwiseEqualizationPass,
                                    LearnedStepSizePass, ParameterQuantizePass,
                                    RuntimeCalibrationPass)

import torch

from Quantizers import MyInt8Quantizer

import tensorrt as trt
from typing import Optional, List, Tuple
from cuda import cuda, cudart

import numpy as np


      ____  ____  __   ____                    __              __
     / __ \/ __ \/ /  / __ \__  ______ _____  / /_____  ____  / /
    / /_/ / /_/ / /  / / / / / / / __ `/ __ \/ __/ __ \/ __ \/ /
   / ____/ ____/ /__/ /_/ / /_/ / /_/ / / / / /_/ /_/ / /_/ / /
  /_/   /_/   /_____\___\_\__,_/\__,_/_/ /_/\__/\____/\____/_/




In [3]:
from model import Net
from data import MyData

# Calibration batch size and the location of the MNIST data.
BATCH_SIZE_DATA = 128
DATA_PATH = "../data/MNIST/"

# Input model and the artifacts written by the export / engine-build cells.
onnxFile = "./model.onnx"
# Parameter quantization only: the parameters have already been statically
# quantized, the current config does not take effect, and the data can be
# used directly.
ppq_onnxFile = "./model_int8(PPQ)_onnx.onnx"
int8_scale_file = "./model_int8(PPQ).json"
engine_file = './model_int8(PPQ).engine'

trainDataset = MyData(datapath=DATA_PATH, isTrain=True)

# The collate function keeps only the first field of every sample and stacks
# those tensors along a new leading batch dimension.
calibLoader = torch.utils.data.DataLoader(
    dataset=trainDataset,
    batch_size=BATCH_SIZE_DATA,
    shuffle=True,
    collate_fn=lambda samples: torch.stack([sample[0] for sample in samples], dim=0),
)

# Shape of one collated calibration batch; reused later to build a dummy input.
input_shape = next(iter(calibLoader)).shape

In [4]:
# Load the ONNX model at `onnxFile` into `graph`; the following cells quantize
# and export this graph.
graph = load_onnx_graph(onnx_import_file=onnxFile)

In [5]:
# Operator types that get handed to the quantizer; every other operator in the
# graph is left untouched by this cell.
QUANTIZABLE_OP_TYPES = {
    'Conv', 'ConvTranspose', 'MatMul', 'Gemm',
    'PPQBiasFusedMatMul', 'LayerNormalization',
}

quantizer = PFL.Quantizer(platform=TargetPlatform.TRT_INT8, graph=graph)
# Alternative: the project-local quantizer.
# quantizer = MyInt8Quantizer(graph=graph)

for op_name, operation in graph.operations.items():
    if operation.type not in QUANTIZABLE_OP_TYPES:
        continue
    quantizer.quantize_operation(op_name, platform=TargetPlatform.TRT_INT8)

In [6]:
# Passes are listed in the order they are handed to the pipeline. Only
# parameter quantization and runtime calibration are enabled.
optimization_passes = [
    ParameterQuantizePass(),
    RuntimeCalibrationPass(),
]

# Optional passes, currently disabled:
#   LayerwiseEqualizationPass(iteration=10)       (would go first)
#   LearnedStepSizePass(steps=1000, is_scale_trainable=False,
#                       lr=1e-4, block_size=4, collecting_device='cpu')
#   ParameterBakingPass()
pipeline = PFL.Pipeline(optimization_passes)

In [7]:
# Build the executor that the pipeline call in the next cell runs the graph on.
# An all-zero CUDA tensor shaped like one calibration batch is used to trace
# operation metadata before the graph is loaded into the executor.
dummy_input = torch.zeros(input_shape, device='cuda')

executor = TorchExecutor(graph=graph)
executor.tracing_operation_meta(dummy_input)
executor.load_graph(graph=graph)

In [8]:
# Run the configured passes over the graph. Each batch from `calibLoader` is
# moved to the GPU by `collate_fn` before it is consumed.
pipeline.optimize(
    graph=graph,
    executor=executor,
    dataloader=calibLoader,
    collate_fn=lambda batch: batch.to('cuda'),
    calib_steps=8,
    verbose=True,
)

[09:50:27] PPQ Parameter Quantization Pass Running ... Finished.
[09:50:27] PPQ Runtime Calibration Pass Running ...    

Calibration Progress(Phase 1): 100%|████████████████████████████| 8/8 [00:00<00:00, 16.26it/s]

Finished.





In [9]:
# Export the quantized graph for TensorRT: the ONNX model goes to
# `ppq_onnxFile` and the accompanying config to `int8_scale_file`.
exporter = PFL.Exporter(platform=TargetPlatform.TRT_INT8)
exporter.export(
    graph=graph,
    file_path=ppq_onnxFile,
    config_path=int8_scale_file,
)

[33m[Info] You are exporting PPQ Graph to TensorRT(Onnx + Json). 
Please Compile the TensorRT INT8 engine manually: 

from ppq.utils.TensorRTUtil import build_engine 
build_engine(onnx_file='Quantized.onnx', int8_scale_file='Quantized.json', engine_file='Quantized.engine', int8=True)
[0m
[33m[Info] Parameters have been saved to file: ./quantized.wts[0m


In [10]:
from ppq.utils.TensorRTUtil import build_engine

# Compile the exported ONNX model and its scale file into the serialized
# engine at `engine_file`, with INT8 enabled.
build_engine(
    onnx_file=ppq_onnxFile,
    int8_scale_file=int8_scale_file,
    engine_file=engine_file,
    int8=True,
)

[05/31/2023-09:50:35] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[05/31/2023-09:50:35] [TRT] [W] Tensor DataType is determined at build time for tensors not marked as input or output.
[05/31/2023-09:50:35] [TRT] [W] Calibrator is not being used. Users must provide dynamic range for all tensors that are not Int32 or Bool.
[05/31/2023-09:50:35] [TRT] [W] Missing scale and zero-point for tensor /Relu_output_0, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[05/31/2023-09:50:35] [TRT] [W] Missing scale and zero-point for tensor /Relu_1_output_0, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[05/31/2023-09:50:35] [TRT] [W] Missing scale and zero-point for tensor /MaxPool_1_output_0, expect fall back to non-int8 implementation for any layer consuming or producing given ten

## 验证

In [11]:
import common

# Evaluation batch size and data location for the test split.
BATCH_SIZE = 128
DATA_PATH = "../data/MNIST/"

testDataset = MyData(datapath=DATA_PATH, isTrain=False)

# Unshuffled loader over the test split.
testLoader = torch.utils.data.DataLoader(
    dataset=testDataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [12]:
# Evaluate the serialized TensorRT engine on the test split and print its
# accuracy.
logger = trt.Logger(trt.Logger.ERROR)

# Shape given to every engine input. It is tied to BATCH_SIZE so that it stays
# in sync with the batches produced by `testLoader`.
ENGINE_INPUT_SHAPE = (BATCH_SIZE, 1, 28, 28)

with open(engine_file, "rb") as f, trt.Runtime(logger) as runtime, runtime.deserialize_cuda_engine(
    f.read()
) as engine, engine.create_execution_context() as context:
    correct = 0
    input_names = []

    # Enumerate the I/O tensors by name. `num_io_tensors` is the index range
    # that `get_tensor_name` is defined over.
    for idx in range(engine.num_io_tensors):
        name = engine.get_tensor_name(idx)
        mode = engine.get_tensor_mode(name)
        if mode == trt.TensorIOMode.INPUT:
            input_names.append(name)
        op_type = engine.get_tensor_dtype(name)
        shape = engine.get_tensor_shape(name)
        print('input id:',idx,'   is input: ', mode,'  binding name:', name, '  shape:', shape, 'type: ', op_type)

    # Set input shapes by tensor name. The index-based `set_binding_shape` is
    # deprecated and also assumed that inputs occupy the first binding slots.
    for name in input_names:
        context.set_input_shape(name, ENGINE_INPUT_SHAPE)
    assert context.all_binding_shapes_specified

    inputs, outputs, bindings, stream = common.allocate_buffers(engine, context, 0)

    for image, label in testLoader:
        inputs[0].host = image.numpy()
        # NOTE(review): the `[0][1]` indexing depends on the return layout of
        # the project-local `common.do_inference_v2` helper -- confirm there.
        preds = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)[0][1]
        # Labels are reduced with argmax over the last axis (presumably
        # one-hot encoded -- verify against MyData).
        label = np.argmax(label.numpy(), axis=-1)
        # Only the first len(label) predictions are compared, so a final batch
        # that is smaller than the engine batch is still scored correctly.
        correct += sum(1 for i in range(len(label)) if label[i] == preds[i])

    print('\nTest set: Accuracy: {:.3f}%\n'.format(100. * correct / len(testLoader.dataset)))

input id: 0    is input:  TensorIOMode.INPUT   binding name: x   shape: (128, 1, 28, 28) type:  DataType.FLOAT
input id: 1    is input:  TensorIOMode.OUTPUT   binding name: y   shape: (128, 10) type:  DataType.FLOAT
input id: 2    is input:  TensorIOMode.OUTPUT   binding name: z   shape: (128,) type:  DataType.INT32

Test set: Accuracy: 95.700%



  context.set_binding_shape(i, (128, 1, 28, 28))
