In [2]:
import torch
from torch.nn import functional as F
import numpy as np
import onnxruntime as ort
import time
import onnx
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

### 01. 简化ONNX model
使用onnxsim库，进行常量折叠。

In [11]:
!onnxsim ../bert-base-uncased/model.onnx ../bert-base-uncased/model-sim.onnx --overwrite-input-shape input_ids:1,16 token_type_ids:1,16 attention_mask:1,16 

Simplifying[33m...[0m
Finish! Here is the difference:
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1m          [0m[1m [0m┃[1m [0m[1mOriginal Model[0m[1m [0m┃[1m [0m[1mSimplified Model[0m[1m [0m┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ Add        │ 177            │ 177              │
│ Cast       │ 1              │ 1                │
│ Concat     │ 48             │ [1;32m0               [0m │
│ Constant   │ 596            │ [1;32m213             [0m │
│ Div        │ 51             │ 51               │
│ Erf        │ 13             │ 13               │
│ Gather     │ 100            │ [1;32m2               [0m │
│ MatMul     │ 98             │ 98               │
│ Mul        │ 53             │ 53               │
│ Pow        │ 26             │ 26               │
│ ReduceMean │ 52             │ 52               │
│ Reshape    │ 48             │ 48               │
│ Shape      │ 97             │ [1;32m0               [0m │
│ Slice     

### 02. 验证简化后的ONNX模型

In [24]:
# 读取、设置输入数据
BERT_PATH = '../bert-base-uncased'
npz_file = BERT_PATH + '/case_data.npz'
data = np.load(npz_file)
input_ids = data["input_ids"].astype(np.int64)
attention_mask = np.ones((1, 16), dtype = np.int64)
token_type_ids = np.zeros((1, 16), dtype = np.int64)

In [25]:
# 加载模型
session = ort.InferenceSession("../bert-base-uncased/model-sim.onnx")
# 执行推理
# warmup
for i in range(5):
    outputs1 = session.run(['logits'], {'input_ids': input_ids,
                                    'attention_mask': attention_mask,
                                   'token_type_ids': token_type_ids})[0]
start_time = time.perf_counter()
for i in range(10):
    outputs1 = session.run(['logits'], {'input_ids': input_ids,
                                    'attention_mask': attention_mask,
                                   'token_type_ids': token_type_ids})[0]
end_time = time.perf_counter()

# 检查转换后的模型的精度损失情况
required_precission = 1e-4
precesion_loss = np.abs(outputs1 - data['logits'])
boolean_mask = precesion_loss > required_precission
if(len(np.where(boolean_mask)[0]) > 0):
    print("Simplify ERROR!")
else:
    print("Simplify SUCCESS!!!!!!")
print('*' * 40)
print("onnxruntime with simplified onnx model running time:", (end_time-start_time)*100, "ms")

Simplify SUCCESS!!!!!!
****************************************
onnxruntime with simplified onnx model running time: 0.017777415999807998


### 03. 创建TensorRT Engine

In [15]:
# 创建engine
model_file = "../bert-base-uncased/model-sim.onnx"
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)

config = builder.create_builder_config()

config.max_workspace_size = 512*1024*1024

explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(explicit_batch)
with trt.OnnxParser(network, TRT_LOGGER) as parser:
    with open(model_file, 'rb') as model:
        parsed = parser.parse(model.read())
        print("network.num_layers", network.num_layers)
        #last_layer = network.get_layer(network.num_layers - 1)
        #network.mark_output(last_layer.get_output(0))
        engine = builder.build_engine(network, config=config)
        
        
# save the paln model
BERT_PATH = '../bert-base-uncased'
plan_path = BERT_PATH +'/model.plan'
with open(plan_path, 'wb') as f:
    f.write(engine.serialize())

  config.max_workspace_size = 512*1024*1024


[02/02/2024-19:33:36] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
network.num_layers 1162


  engine = builder.build_engine(network, config=config)


In [28]:
# 也可以使用命令行的方式，执行trtexec进行trt engine的创建
# !trtexec --onnx=../bert-base-uncased/model-sim.onnx --saveEngine=../bert-base-uncased/model.trt  --explicitBatch

&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=bert-base-uncased/model-sim.onnx --saveEngine=bert-base-uncased/model.trt --explicitBatch
[02/02/2024-13:05:32] [W] --explicitBatch flag has been deprecated and has no effect!
[02/02/2024-13:05:32] [W] Explicit batch dim is automatically enabled if input model is ONNX or if dynamic shapes are provided when the engine is built.
[02/02/2024-13:05:32] [I] === Model Options ===
[02/02/2024-13:05:32] [I] Format: ONNX
[02/02/2024-13:05:32] [I] Model: bert-base-uncased/model-sim.onnx
[02/02/2024-13:05:32] [I] Output:
[02/02/2024-13:05:32] [I] === Build Options ===
[02/02/2024-13:05:32] [I] Max batch: explicit batch
[02/02/2024-13:05:32] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[02/02/2024-13:05:32] [I] minTiming: 1
[02/02/2024-13:05:32] [I] avgTiming: 8
[02/02/2024-13:05:32] [I] Precision: FP32
[02/02/2024-13:05:32] [I] LayerPrecisions: 
[02/02/2024-13:05:32] [I] Layer 

### 04. 利用TensorRT执行推理

In [16]:
# 读取engine执行推理
BERT_PATH = 'bert-base-uncased'
plan_path = BERT_PATH +'/model.plan'

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(TRT_LOGGER)
with open(plan_path, 'rb') as f:
    engine_bytes = f.read()
    engine = runtime.deserialize_cuda_engine(engine_bytes)

In [17]:
# 创建执行上下文
bert_context = engine.create_execution_context()

In [18]:
# 重新设置输入数据的格式（因为trt将onnx模型中的int64转换为int32）
input_ids = input_ids.astype(np.int32)
attention_mask = attention_mask.astype(np.int32)
token_type_ids = token_type_ids.astype(np.int32)
bert_output = np.empty((1, 16, 30522), dtype = np.float32)

In [42]:
# 在device上分配内存
d_input_ids = cuda.mem_alloc(input_ids.nbytes)
d_token_type_ids = cuda.mem_alloc(token_type_ids.nbytes)
d_attention_mask = cuda.mem_alloc(attention_mask.nbytes)
d_output = cuda.mem_alloc(bert_output.nbytes)

In [43]:
# 将内存缓冲区 与 执行上下文中的输入输出张量的地址 相绑定
bindings = [int(d_input_ids), int(d_attention_mask),int(d_token_type_ids), int(d_output)]

In [44]:
stream = cuda.Stream()


In [49]:
for i in range(5):
    bert_context.execute_async_v2( bindings, stream.handle, None)
start = time.perf_counter()
for i in range(10):
    # Transfer input data from python buffers to device(GPU)
    cuda.memcpy_htod_async(d_input_ids, input_ids, stream)
    cuda.memcpy_htod_async(d_token_type_ids, token_type_ids, stream)
    cuda.memcpy_htod_async(d_attention_mask, attention_mask, stream)
    bert_context.execute_async_v2( bindings, stream.handle, None)
    cuda.memcpy_dtoh_async(bert_output, d_output, stream)
end = time.perf_counter()
stream.synchronize()
print("tensorrt engine with plan model running time (plus data movement):", (end-start)*100, "ms")

start = time.perf_counter()
for i in range(10):
    bert_context.execute_async_v2( bindings, stream.handle, None)
end = time.perf_counter()
print('*' * 40)
print("tensorrt engine with plan model running time (without movement):", (end-start)*100, "ms")

tensorrt engine with plan model running time (plus data movement): 0.0030427050001890165
****************************************
tensorrt engine with plan model running time (without movement): 0.0003354013999341987


In [36]:
required_precission = 1e-1
precesion_loss = np.abs(bert_output  - data['logits'])
boolean_mask = precesion_loss > required_precission
if(len(np.where(boolean_mask)[0]) > 0):
    print("Simplify ERROR!")
else:
    print(f"Simplify SUCCESS!!!!!!, precision loss:{required_precission}")

Simplify SUCCESS!!!!!!, precision loss:0.1
