## Onnx2Trt
###### https://zhuanlan.zhihu.com/p/548006090

In [None]:
import os
import sys
sys.path.append("..")

In [None]:
import tensorrt as trt
from common import EXPLICIT_BATCH
from common import allocate_buffers, do_inference_v2

In [None]:
# Shared TensorRT logger; passed to trt.Builder, trt.OnnxParser and trt.Runtime below.
TRT_LOGGER = trt.Logger()

In [None]:
def build_engine(
    onnx_path,
    out_trt_path,
    max_batch_size=1,
    mode='fp32',
    calib=None):
    """Convert an ONNX model to a TensorRT engine and save it to disk.

    The first network input is made dynamic (``[-1, 3, -1, -1]``) and a single
    optimization profile is registered with min/opt/max shapes of
    (1, 3, 512, 512) / (1, 3, 1024, 1024) / (1, 3, 2048, 2048); inference
    inputs have to fall inside that range.

    Args:
        onnx_path: path of the ONNX file to parse.
        out_trt_path: path the serialized engine is written to.
        max_batch_size: value assigned to ``builder.max_batch_size``.
        mode: one of 'fp32', 'fp16', 'int8'. Any other value leaves the
            builder config at its default precision.
        calib: calibrator object, only used when ``mode == 'int8'``.

    Returns:
        The built TensorRT engine.

    Raises:
        AssertionError: the platform lacks fast int8/fp16 support for the
            requested mode.
        RuntimeError: the ONNX file could not be parsed, or the engine
            build failed.
    """
    builder = trt.Builder(TRT_LOGGER)                 # builds the engine
    network = builder.create_network(EXPLICIT_BATCH)
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network, TRT_LOGGER)      # parses the ONNX file

    config.max_workspace_size = 1 << 30  # workspace limit (1 GiB)
    # Batch size used at inference must not exceed max_batch_size.
    # NOTE: the optimization profile below pins the batch dimension to 1.
    builder.max_batch_size = max_batch_size

    # Precision has to be requested on the builder *config*: the engine is
    # built with build_engine(network, config), so flags set on the builder
    # object itself would not reach the build.
    if mode == 'int8':
        assert builder.platform_has_fast_int8, "not support int8"
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = calib
    elif mode == 'fp16':
        assert builder.platform_has_fast_fp16, "not support fp16"
        config.set_flag(trt.BuilderFlag.FP16)

    # parse model file
    print('Loading ONNX file from path {}...'.format(onnx_path))
    with open(onnx_path, 'rb') as onnx_model:
        print('Beginning ONNX file parsing')
        parsed_ok = parser.parse(onnx_model.read())
    if not parsed_ok:
        # Surface the parser diagnostics instead of building a broken network.
        for err_idx in range(parser.num_errors):
            print(parser.get_error(err_idx))
        raise RuntimeError('Failed to parse ONNX file {}'.format(onnx_path))
    print('Completed parsing of ONNX file')

    # Dynamic input setting
    network.get_input(0).shape = [-1, 3, -1, -1]
    # Every dynamic input needs an optimization profile giving the minimum,
    # the typical and the maximum shape; inference inputs must stay in range.
    profile = builder.create_optimization_profile()
    profile.set_shape(
        network.get_input(0).name,
        (1, 3, 512, 512),
        (1, 3, 1024, 1024),
        (1, 3, 2048, 2048))
    config.add_optimization_profile(profile)

    # build engine
    print('Building an engine from file {}; this may take a while...'.format(onnx_path))
    engine = builder.build_engine(network, config)
    if engine is None:
        # Fail before reporting success or touching the output file.
        raise RuntimeError('Failed to build TensorRT engine from {}'.format(onnx_path))
    print("Created engine success! ")

    # save trt model
    print('Saving TRT engine file to path {}...'.format(out_trt_path))
    with open(out_trt_path, "wb") as f:
        f.write(engine.serialize())   # serialize (encode) -> file
    print('Engine file has already saved to {}!'.format(out_trt_path))
    return engine

In [None]:
def get_engine(trt_path):
    """Read a serialized engine file and return the deserialized engine.

    Args:
        trt_path: path of the engine file to read.

    Returns:
        Whatever ``trt.Runtime.deserialize_cuda_engine`` returns for the
        file's bytes.
    """
    print(f'Reading engine from file {trt_path}')
    trt_runtime = trt.Runtime(TRT_LOGGER)
    with open(trt_path, 'rb') as engine_file:
        serialized_engine = engine_file.read()
    # deserialize (decode) -> model
    return trt_runtime.deserialize_cuda_engine(serialized_engine)

In [None]:
# Conversion settings
onnx_path = ".../models/onnx/db_r18_op10_sim.onnx"
out_trt_path = ".../models/trt/db_r18.trt"
max_batch_size = 1
mode = 'fp32'    # one of: 'fp32', 'fp16', 'int8'

# int8 mode needs a calibration dataset plus a Calibrator class that overrides
# 'get_batch_size', 'get_batch', 'read_calibration_cache' and
# 'write_calibration_cache' (CenterNetEntropyCalibrator is a usable reference).
# TODO: provide the custom CustomEntropyCalibrator before switching to int8.
calib = CustomEntropyCalibrator() if mode == 'int8' else None

In [None]:
# convert to trt engine
# Pass the calibrator selected in the config cell; otherwise int8 mode would
# always run with build_engine's default calib=None.
build_engine(onnx_path, out_trt_path, max_batch_size=max_batch_size, mode=mode, calib=calib)

## 验证模型

In [None]:
# load packages
import cv2
import numpy as np
import torch
from pathlib import Path
import time

from pytocr.modeling.architectures import build_model
from pytocr.utils.save_load import load_pretrained_params
from utils import load_config, draw_det_res

In [None]:
# choose test image
img_path = r".../test_img.png"
img = cv2.imread(img_path, cv2.IMREAD_COLOR)
if img is None:
    # cv2.imread signals an unreadable/missing file by returning None rather
    # than raising; stop here instead of failing later inside cvtColor.
    raise FileNotFoundError(f"Could not read image: {img_path}")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
resized_img = cv2.resize(img, (960, 736))
# Scale to [0, 1] as float32 and reorder the axes to channel-first.
image_data = np.array(resized_img, dtype='float32')
image_data /= 255.
image_data = np.transpose(image_data, (2, 0, 1))  # C H W

In [None]:
# Torch model config
# config_path is read by load_config(); model_path is handed to load_pretrained_params().
config_path = ".../PyTorchOCR/configs/det/det_r18_db.yml"
model_path = ".../PyTorchOCR/models/torch/db_r18.pth"

In [None]:
# Load the config file and switch the "distributed" flag off.
config = load_config(config_path)
config["Global"]["distributed"] = False

# Instantiate the network described by the "Architecture" section.
model = build_model(config["Architecture"])

# Use the GPU only when the config requests it and CUDA is actually available.
use_gpu = config["Global"]["use_gpu"] and torch.cuda.is_available()
if use_gpu:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Move to the target device, switch to eval mode, then load the weights.
model = model.to(device)
model.eval()
model = load_pretrained_params(model, model_path)

In [None]:
# PyTorch reference inference
torch_input = torch.unsqueeze(torch.from_numpy(image_data), 0)  # add batch axis: N C H W

st_time = time.time()
with torch.no_grad():
    torch_input = torch_input.to(device)
    print(torch_input.shape, torch_input.dtype)
    torch_output = model(torch_input)
    # Only the "maps" entry of the model output is compared later.
    torch_preds = torch_output["maps"].cpu().numpy()
torch_elapsed = time.time() - st_time
print("torch infer cost time", torch_elapsed)
print(torch_preds.shape)

In [None]:
# load trt engine
# Reuse the engine file path that was passed to build_engine() as its output path.
trt_path = out_trt_path
engine = get_engine(trt_path)

In [None]:
# TensorRT inference
trt_input = np.ascontiguousarray(image_data[np.newaxis])  # add batch axis, contiguous
height, width = trt_input.shape[-2], trt_input.shape[-1]

context = engine.create_execution_context()
# allocate_buffers was modified to take the input size so dynamic inputs work.
inputs, outputs, bindings, stream = allocate_buffers(engine, (height, width))
# The execution context has to be told which optimization profile to use;
# profiles are numbered in increasing order, so 0 selects the first one.
context.active_optimization_profile = 0
origin_inputshape = context.get_binding_shape(0)
if origin_inputshape[-1] == -1:
    # Dynamic spatial dims: fill in the actual input size before running.
    origin_inputshape[-2] = height
    origin_inputshape[-1] = width
    context.set_binding_shape(0, origin_inputshape)

print(f'Running inference on image {img_path}...')
st_time = time.time()
inputs[0].host = trt_input
trt_outputs = do_inference_v2(
    context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
# Reshape the first output to (max_batch_size, 1, height, width) and keep item 0.
trt_preds = np.reshape(trt_outputs[0], (max_batch_size, 1, height, width))[0]
trt_elapsed = time.time() - st_time
print("trt infer cost time", trt_elapsed)
print(trt_preds.shape)

In [None]:
# Compare the TensorRT output with the PyTorch output.
diff = trt_preds - torch_preds
# Report the largest absolute deviation: a plain max() over the signed
# difference would hide large negative errors.
print("difference between trt and torch: ", np.abs(diff).max())