In [22]:
import ppq.lib as PFL
from ppq.api import ENABLE_CUDA_KERNEL, load_onnx_graph
from ppq.core import TargetPlatform
from ppq.executor import TorchExecutor
from ppq.quantization.optim import (LayerwiseEqualizationPass,
                                    LearnedStepSizePass, ParameterQuantizePass,
                                    RuntimeCalibrationPass)

import torch

from Quantizers import MyInt8Quantizer

In [15]:
from model import Net
from data import MyData

# --- Configuration ----------------------------------------------------------
BATCH_SIZE_DATA = 128
# Seed for the calibration shuffle, so that "Restart & Run All" feeds the same
# batches to calibration every time.
CALIB_SEED = 0
DATA_PATH = "../data/MNIST/"
onnxFile = "./model.onnx"                     # ONNX model loaded for quantization
ppq_onnxFile = "./model_int8(PPQ).onnx"       # ONNX file written by the exporter
int8_scale_file = "./model_int8(PPQ).json"    # config file written by the exporter
engine_file = './model_int8(PPQ).engine'      # engine file written by build_engine


def _stack_inputs(samples):
    """Collate a list of dataset samples into one input batch.

    Only element 0 of each sample is kept (presumably the image, with the
    label dropped -- confirm against ``MyData``). The kept tensors are stacked
    along a new leading batch dimension.
    """
    return torch.stack([sample[0] for sample in samples], dim=0)


# --- Calibration data -------------------------------------------------------
trainDataset = MyData(datapath=DATA_PATH, isTrain=True)

calibLoader = torch.utils.data.DataLoader(
    dataset=trainDataset,
    batch_size=BATCH_SIZE_DATA,
    shuffle=True,
    collate_fn=_stack_inputs,
    # A dedicated seeded generator makes the shuffle order reproducible
    # without touching the global torch RNG state.
    generator=torch.Generator().manual_seed(CALIB_SEED),
)

# Shape of one collated batch; used later to build the dummy tracing input.
input_shape = next(iter(calibLoader)).shape

In [16]:
# Load the ONNX file at `onnxFile` through PPQ; the quantizer, executor,
# pipeline and exporter cells below all operate on this `graph` object.
graph = load_onnx_graph(
    onnx_import_file=onnxFile,
)

In [28]:
# Alternative kept for reference: the project-local quantizer imported from
# `Quantizers` could be used in place of the stock one.
# quantizer = MyInt8Quantizer(graph=graph)
quantizer = PFL.Quantizer(platform=TargetPlatform.TRT_INT8, graph=graph)

# Operator types handed to the quantizer below; operators of any other type
# are skipped by this cell.
QUANTIZED_OP_TYPES = {
    'Conv', 'ConvTranspose', 'MatMul', 'Gemm',
    'PPQBiasFusedMatMul', 'LayerNormalization',
}

for name, op in graph.operations.items():
    if op.type not in QUANTIZED_OP_TYPES:
        continue
    quantizer.quantize_operation(name, platform=TargetPlatform.TRT_INT8)

In [29]:
# Passes handed to the pipeline, in list order: parameter quantization
# followed by runtime calibration. The commented-out entries are optional
# passes that are currently disabled (ParameterBakingPass is not among the
# imports shown at the top of this notebook and would need importing first).
optimization_passes = [
    # LayerwiseEqualizationPass(iteration=10),
    ParameterQuantizePass(),
    RuntimeCalibrationPass(),
    # LearnedStepSizePass(
    #     steps=1000, is_scale_trainable=False,
    #     lr=1e-4, block_size=4, collecting_device='cpu'),
    # ParameterBakingPass()
]

pipeline = PFL.Pipeline(optimization_passes)

In [30]:
# Set up the executor that the pipeline will run on.
executor = TorchExecutor(graph=graph)

# An all-zero dummy batch with the same shape as one calibration batch,
# moved to the GPU, is what gets passed to tracing_operation_meta.
dummy_input = torch.zeros(input_shape).cuda()
executor.tracing_operation_meta(dummy_input)

executor.load_graph(graph=graph)

In [31]:
# Run the pipeline's passes over the graph. `collate_fn` moves each batch
# coming out of `calibLoader` to the GPU, and `calib_steps` is set to 100.
pipeline.optimize(
    graph=graph,
    executor=executor,
    dataloader=calibLoader,
    collate_fn=lambda batch: batch.to('cuda'),
    calib_steps=100,
    verbose=True,
)

[02:34:10] PPQ Parameter Quantization Pass Running ... Finished.
[02:34:11] PPQ Runtime Calibration Pass Running ...    

Calibration Progress(Phase 1): 100%|████████████████████████| 100/100 [00:06<00:00, 15.74it/s]

Finished.





In [32]:
# Export the processed graph for the TRT_INT8 platform: the graph goes to
# `ppq_onnxFile` and the accompanying config goes to `int8_scale_file`.
exporter = PFL.Exporter(platform=TargetPlatform.TRT_INT8)
exporter.export(
    graph=graph,
    file_path=ppq_onnxFile,
    config_path=int8_scale_file,
)

[33m[Info] You are exporting PPQ Graph to TensorRT(Onnx + Json). 
Please Compile the TensorRT INT8 engine manually: 

from ppq.utils.TensorRTUtil import build_engine 
build_engine(onnx_file='Quantized.onnx', int8_scale_file='Quantized.json', engine_file='Quantized.engine', int8=True)
[0m
[33m[Info] Parameters have been saved to file: ./quantized.wts[0m


In [33]:
# Build the TensorRT INT8 engine from the exported ONNX file and its JSON
# scale file, as the exporter's info message instructs.
from ppq.utils.TensorRTUtil import build_engine

build_engine(
    onnx_file=ppq_onnxFile,
    int8_scale_file=int8_scale_file,
    engine_file=engine_file,
    int8=True,
)

[05/31/2023-02:34:52] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[05/31/2023-02:34:52] [TRT] [W] Tensor DataType is determined at build time for tensors not marked as input or output.
[05/31/2023-02:34:52] [TRT] [W] Calibrator is not being used. Users must provide dynamic range for all tensors that are not Int32 or Bool.
[05/31/2023-02:34:52] [TRT] [W] Missing scale and zero-point for tensor /Relu_output_0, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[05/31/2023-02:34:52] [TRT] [W] Missing scale and zero-point for tensor /Relu_1_output_0, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[05/31/2023-02:34:52] [TRT] [W] Missing scale and zero-point for tensor /MaxPool_1_output_0, expect fall back to non-int8 implementation for any layer consuming or producing given ten

## 验证