In [1]:
import torch
from torch.nn import functional as F
import numpy as np
import os
from transformers import BertTokenizer, BertForMaskedLM
import transformers
# from tqdm import tqdm
import onnxruntime as ort
import time
import onnx
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

  from .autonotebook import tqdm as notebook_tqdm


### 1. 测试 Bert Model
1. 初始化tokenizer和Bert model，设置用于测试的text
2. 基于pytorch执行bert推理，输出概率最高的10个词
3. 保存输出信息，用来和之后转换过的模型进行对比


In [3]:
BERT_PATH = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
model = BertForMaskedLM.from_pretrained(BERT_PATH, return_dict = True)
text = "The capital of France, " + tokenizer.mask_token + ", contains the Eiffel Tower."

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
encoded_input = tokenizer.encode_plus(text, return_tensors = "pt")
mask_index = torch.where(encoded_input["input_ids"][0] == tokenizer.mask_token_id)
print("input ids: \n",encoded_input["input_ids"])

# warm up
for i in range(5):
    output = model(**encoded_input)
start_time = time.perf_counter()
# 计算平均推理时间
for i in range(10):
    output = model(**encoded_input)
end_time = time.perf_counter()

print("output shape: ", output[0].shape)
logits = output.logits
softmax = F.softmax(logits, dim = -1)
mask_word = softmax[0, mask_index, :]
top_10 = torch.topk(mask_word, 10, dim = 1)[1][0]
print("model test topk10 output:")
for token in top_10:
    word = tokenizer.decode([token])
    new_sentence = text.replace(tokenizer.mask_token, word)
    print(new_sentence)
print('*' * 40)
print("pytorch with bin model running time:", (end_time-start_time)/10)

input ids: 
 tensor([[  101,  1996,  3007,  1997,  2605,  1010,   103,  1010,  3397,  1996,
          1041, 13355,  2884,  3578,  1012,   102]])
output shape:  torch.Size([1, 16, 30522])
model test topk10 output:
The capital of France, paris, contains the Eiffel Tower.
The capital of France, lyon, contains the Eiffel Tower.
The capital of France, lille, contains the Eiffel Tower.
The capital of France, toulouse, contains the Eiffel Tower.
The capital of France, marseille, contains the Eiffel Tower.
The capital of France, orleans, contains the Eiffel Tower.
The capital of France, strasbourg, contains the Eiffel Tower.
The capital of France, nice, contains the Eiffel Tower.
The capital of France, cannes, contains the Eiffel Tower.
The capital of France, versailles, contains the Eiffel Tower.
****************************************
pytorch with bin model running time: 0.022047897599986755


In [52]:
encoded_input['input_ids'].dtype

torch.int64

In [12]:
logits[0]

tensor([[ -6.6462,  -6.6775,  -6.6606,  ...,  -5.9660,  -5.7844,  -4.1951],
        [-14.7222, -15.2151, -15.0513,  ..., -13.5289, -11.3960, -14.5610],
        [-10.1223, -10.7297, -10.1163,  ...,  -9.2822,  -7.6954, -15.4930],
        ...,
        [-10.7090, -11.2617, -10.9946,  ...,  -8.4995,  -9.6521, -14.2806],
        [-12.2987, -12.0131, -12.5270,  ..., -10.8341, -11.2091,  -5.0134],
        [-12.7292, -13.4996, -13.1655,  ..., -13.2183, -10.6310, -12.8908]],
       grad_fn=<SelectBackward0>)

In [11]:
# save inputs and output
print("Saving inputs and output to case_data.npz ...")
position_ids = torch.arange(0, encoded_input['input_ids'].shape[1]).int().view(1, -1)
print("position id: ",position_ids)
input_ids=encoded_input['input_ids'].int().detach().numpy()
token_type_ids=encoded_input['token_type_ids'].int().detach().numpy()
print("input_id shape: ",input_ids.shape)
# save data
npz_file = BERT_PATH + '/case_data.npz'
np.savez(npz_file,
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            logits=output[0].detach().numpy())

data = np.load(npz_file)
print("saved input ids: \n", data['input_ids'])


Saving inputs and output to case_data.npz ...
position id:  tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]],
       dtype=torch.int32)
input_id shape:  (1, 16)
saved input ids: 
 [[  101  1996  3007  1997  2605  1010   103  1010  3397  1996  1041 13355
   2884  3578  1012   102]]


### 2. 将模型转换为ONNX格式
使用torch.onnx.export() 进行转换

In [10]:
# convert model to onnx
model.eval()
export_model_path = BERT_PATH + "/model.onnx"
opset_version = 16
symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
torch.onnx.export(  model,                                            
                    args=tuple(encoded_input.values()),               # model input (or a tuple for multiple inputs)
                    f=export_model_path,                              # where to save the model (can be a file or file-like object)
                    opset_version=opset_version,                      # the ONNX version to export the model to
                    do_constant_folding=False,                        # whether to execute constant folding for optimization
                    input_names=['input_ids',                         # the model's input names
                                'attention_mask',
                                'token_type_ids'],
                    output_names=['logits'],                          # the model's output names
                    dynamic_axes={'input_ids': symbolic_names,        # variable length axes
                                'attention_mask' : symbolic_names,
                                'token_type_ids' : symbolic_names,
                                'logits' : symbolic_names})
print("Model exported at ", export_model_path)


Model exported at  bert-base-uncased/model.onnx


### 3. 使用onnxruntime进行onnx推理
与pytorch和tensorrt的推理时间相对比

In [15]:
# 检查设备是否为GPU
print("onnxruntime version:", ort.__version__)
print("onnxruntime device:", ort.get_device())

onnxruntime version: 1.16.3
onnxruntime device: GPU


In [65]:
# 加载模型
session = ort.InferenceSession(export_model_path)
# 执行推理
# warmup
for i in range(5):
    outputs = session.run(['logits'], {'input_ids': encoded_input['input_ids'].numpy(),
                                    'attention_mask': encoded_input['attention_mask'].numpy(),
                                   'token_type_ids': encoded_input['token_type_ids'].numpy()})[0]
start_time = time.perf_counter()
for i in range(10):
    outputs = session.run(['logits'], {'input_ids': encoded_input['input_ids'].numpy(),
                                    'attention_mask': encoded_input['attention_mask'].numpy(),
                                   'token_type_ids': encoded_input['token_type_ids'].numpy()})[0]
end_time = time.perf_counter()

# 检查转换后的模型的精度损失情况
required_precission = 1e-4
precesion_loss = np.abs(outputs - data['logits'])
boolean_mask = precesion_loss > required_precission
if(len(np.where(boolean_mask)[0]) > 0):
    print("Convert ERROR!")
else:
    print("Convert SUCCESS!!!!!!")
print('*' * 40)
print("pytorch with bin model running time:", (end_time-start_time)/10)

Convert SUCCESS!!!!!!
****************************************
pytorch with bin model running time: 0.02029021099997408


In [18]:
outputs[0]

array([[ -6.6461678,  -6.677546 ,  -6.6606207, ...,  -5.966043 ,
         -5.7843575,  -4.195074 ],
       [-14.722224 , -15.215152 , -15.051269 , ..., -13.528884 ,
        -11.39604  , -14.560963 ],
       [-10.122324 , -10.729722 , -10.11626  , ...,  -9.282214 ,
         -7.695395 , -15.492979 ],
       ...,
       [-10.708985 , -11.261727 , -10.994601 , ...,  -8.499443 ,
         -9.652082 , -14.280533 ],
       [-12.298677 , -12.013136 , -12.527017 , ..., -10.834092 ,
        -11.209141 ,  -5.0133815],
       [-12.729224 , -13.499596 , -13.165469 , ..., -13.21833  ,
        -10.630962 , -12.8908415]], dtype=float32)

In [9]:
encoded_input['input_ids'].numpy().dtype

dtype('int64')

### 简化ONNX model
使用onnxsim库，进行常量折叠。

In [11]:
!onnxsim bert-base-uncased/model.onnx bert-base-uncased/model-sim.onnx --overwrite-input-shape input_ids:1,16 token_type_ids:1,16 attention_mask:1,16 

Simplifying[33m...[0m
Finish! Here is the difference:
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1m          [0m[1m [0m┃[1m [0m[1mOriginal Model[0m[1m [0m┃[1m [0m[1mSimplified Model[0m[1m [0m┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ Add        │ 177            │ 177              │
│ Cast       │ 1              │ 1                │
│ Concat     │ 48             │ [1;32m0               [0m │
│ Constant   │ 596            │ [1;32m213             [0m │
│ Div        │ 51             │ 51               │
│ Erf        │ 13             │ 13               │
│ Gather     │ 100            │ [1;32m2               [0m │
│ MatMul     │ 98             │ 98               │
│ Mul        │ 53             │ 53               │
│ Pow        │ 26             │ 26               │
│ ReduceMean │ 52             │ 52               │
│ Reshape    │ 48             │ 48               │
│ Shape      │ 97             │ [1;32m0               [0m │
│ Slice     

In [12]:
# 加载模型
session = ort.InferenceSession("bert-base-uncased/model-sim.onnx")
# 执行推理
# warmup
for i in range(5):
    outputs1 = session.run(['logits'], {'input_ids': encoded_input['input_ids'].numpy(),
                                    'attention_mask': encoded_input['attention_mask'].numpy(),
                                   'token_type_ids': encoded_input['token_type_ids'].numpy()})[0]
start_time = time.perf_counter()
for i in range(10):
    outputs1 = session.run(['logits'], {'input_ids': encoded_input['input_ids'].numpy(),
                                    'attention_mask': encoded_input['attention_mask'].numpy(),
                                   'token_type_ids': encoded_input['token_type_ids'].numpy()})[0]
end_time = time.perf_counter()

# 检查转换后的模型的精度损失情况
required_precission = 1e-4
precesion_loss = np.abs(outputs1 - data['logits'])
boolean_mask = precesion_loss > required_precission
if(len(np.where(boolean_mask)[0]) > 0):
    print("Simplify ERROR!")
else:
    print("Simplify SUCCESS!!!!!!")
print('*' * 40)
print("pytorch with bin model running time:", (end_time-start_time)/10)

Simplify SUCCESS!!!!!!
****************************************
pytorch with bin model running time: 0.01595932879990869


In [12]:
# def build_engine(model_file, max_ws=512*1024*1024, fp16=False):
#     print("building engine")
#     TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
#     builder = trt.Builder(TRT_LOGGER)

#     config = builder.create_builder_config()
#     if fp16:
#         config.set_flag(trt.BuilderFlag.FP16)
#     config.max_workspace_size = max_ws
    
#     explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
#     network = builder.create_network(explicit_batch)
#     with trt.OnnxParser(network, TRT_LOGGER) as parser:
#         with open(model_file, 'rb') as model:
#             parsed = parser.parse(model.read())
#             print("network.num_layers", network.num_layers)
#             #last_layer = network.get_layer(network.num_layers - 1)
#             #network.mark_output(last_layer.get_output(0))
#             engine = builder.build_engine(network, config=config)
#             return engine
            
# engine = build_engine("bert-base-uncased/model-sim.onnx")
# # save the paln model
# BERT_PATH = 'bert-base-uncased'
# plan_path = BERT_PATH +'/model.plan'
# with open(plan_path, 'wb') as f:
#     f.write(engine.serialize())

building engine


  config.max_workspace_size = max_ws


[02/02/2024-13:01:10] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
network.num_layers 1162


  engine = builder.build_engine(network, config=config)


In [28]:
# 创建engine
model_file = "bert-base-uncased/model-sim.onnx"
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)

config = builder.create_builder_config()

config.max_workspace_size = 512*1024*1024

explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(explicit_batch)
with trt.OnnxParser(network, TRT_LOGGER) as parser:
    with open(model_file, 'rb') as model:
        parsed = parser.parse(model.read())
        print("network.num_layers", network.num_layers)
        #last_layer = network.get_layer(network.num_layers - 1)
        #network.mark_output(last_layer.get_output(0))
        engine = builder.build_engine(network, config=config)
        
        
# save the paln model
# BERT_PATH = 'bert-base-uncased'
# plan_path = BERT_PATH +'/model.plan'
# with open(plan_path, 'wb') as f:
#     f.write(engine.serialize())

  config.max_workspace_size = 512*1024*1024


[02/02/2024-17:12:40] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
network.num_layers 1162


  engine = builder.build_engine(network, config=config)


In [15]:
for idx in range(engine.num_bindings):
    name = engine.get_tensor_name (idx)
    is_input = engine.get_tensor_mode (name)
    op_type = engine.get_tensor_dtype(name)
    shape = engine.get_tensor_shape(name)

    print('input id:',idx,'   is input: ', is_input,'  binding name:', name, '  shape:', shape, 'type: ', op_type)


input id: 0    is input:  TensorIOMode.INPUT   binding name: input_ids   shape: (1, 16) type:  DataType.INT32
input id: 1    is input:  TensorIOMode.INPUT   binding name: attention_mask   shape: (1, 16) type:  DataType.INT32
input id: 2    is input:  TensorIOMode.INPUT   binding name: token_type_ids   shape: (1, 16) type:  DataType.INT32
input id: 3    is input:  TensorIOMode.OUTPUT   binding name: logits   shape: (1, 16, 30522) type:  DataType.FLOAT


In [28]:
# !trtexec --onnx=bert-base-uncased/model-sim.onnx --saveEngine=bert-base-uncased/model.trt  --explicitBatch

&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=bert-base-uncased/model-sim.onnx --saveEngine=bert-base-uncased/model.trt --explicitBatch
[02/02/2024-13:05:32] [W] --explicitBatch flag has been deprecated and has no effect!
[02/02/2024-13:05:32] [W] Explicit batch dim is automatically enabled if input model is ONNX or if dynamic shapes are provided when the engine is built.
[02/02/2024-13:05:32] [I] === Model Options ===
[02/02/2024-13:05:32] [I] Format: ONNX
[02/02/2024-13:05:32] [I] Model: bert-base-uncased/model-sim.onnx
[02/02/2024-13:05:32] [I] Output:
[02/02/2024-13:05:32] [I] === Build Options ===
[02/02/2024-13:05:32] [I] Max batch: explicit batch
[02/02/2024-13:05:32] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[02/02/2024-13:05:32] [I] minTiming: 1
[02/02/2024-13:05:32] [I] avgTiming: 8
[02/02/2024-13:05:32] [I] Precision: FP32
[02/02/2024-13:05:32] [I] LayerPrecisions: 
[02/02/2024-13:05:32] [I] Layer 

In [16]:
# BERT_PATH = 'bert-base-uncased'
# plan_path = BERT_PATH +'/model.plan'

# TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# runtime = trt.Runtime(TRT_LOGGER)
# with open(plan_path, 'rb') as f:
#     engine_bytes = f.read()
#     engine = runtime.deserialize_cuda_engine(engine_bytes)

In [29]:
bert_context = engine.create_execution_context()

In [30]:
input_ids=encoded_input['input_ids'].numpy().astype(np.int32)
attention_mask = encoded_input['attention_mask'].numpy().astype(np.int32)
token_type_ids = encoded_input['token_type_ids'].numpy().astype(np.int32)
bert_output = np.empty((1, 16, 30522), dtype = np.float32)

In [40]:
print(encoded_input['input_ids'])
print(input_ids)

tensor([[  101,  1996,  3007,  1997,  2605,  1010,   103,  1010,  3397,  1996,
          1041, 13355,  2884,  3578,  1012,   102]])
[[  101  1996  3007  1997  2605  1010   103  1010  3397  1996  1041 13355
   2884  3578  1012   102]]


In [31]:
print(input_ids.dtype, input_ids.shape)
print(attention_mask.dtype, attention_mask.shape)
print(token_type_ids.dtype, token_type_ids.shape)
print(bert_output.dtype, bert_output.shape)

int32 (1, 16)
int32 (1, 16)
int32 (1, 16)
float32 (1, 16, 30522)


In [32]:

d_input_ids = cuda.mem_alloc(input_ids.nbytes)
d_token_type_ids = cuda.mem_alloc(token_type_ids.nbytes)
d_attention_mask = cuda.mem_alloc(attention_mask.nbytes)


In [33]:
d_output = cuda.mem_alloc(bert_output.nbytes)

In [41]:
bindings = [int(d_input_ids), int(d_attention_mask),int(d_token_type_ids), int(d_output)]

In [42]:
stream = cuda.Stream()
# Transfer input data from python buffers to device(GPU)
cuda.memcpy_htod_async(d_input_ids, input_ids, stream)
cuda.memcpy_htod_async(d_token_type_ids, token_type_ids, stream)
cuda.memcpy_htod_async(d_attention_mask, attention_mask, stream)

In [66]:
for i in range(5):
    bert_context.execute_async_v2( bindings, stream.handle, None)
start = time.perf_counter()
for i in range(100):
    bert_context.execute_async_v2( bindings, stream.handle, None)
end = time.perf_counter()
print((end-start)/100)

0.0018574402800004464


In [44]:
cuda.memcpy_dtoh_async(bert_output, d_output, stream)
stream.synchronize()

In [45]:
pred = torch.tensor(bert_output)
softmax = F.softmax(pred, dim = -1)
mask_word = softmax[0, mask_index, :]
top_10 = torch.topk(mask_word, 10, dim = 1)[1][0]
print("model test topk10 output:")
for token in top_10:
    word = tokenizer.decode([token])
    new_sentence = text.replace(tokenizer.mask_token, word)
    print(new_sentence)

model test topk10 output:
The capital of France, paris, contains the Eiffel Tower.
The capital of France, lyon, contains the Eiffel Tower.
The capital of France, lille, contains the Eiffel Tower.
The capital of France, toulouse, contains the Eiffel Tower.
The capital of France, marseille, contains the Eiffel Tower.
The capital of France, orleans, contains the Eiffel Tower.
The capital of France, strasbourg, contains the Eiffel Tower.
The capital of France, nice, contains the Eiffel Tower.
The capital of France, cannes, contains the Eiffel Tower.
The capital of France, versailles, contains the Eiffel Tower.


In [46]:
pred 

tensor([[[ -6.6460,  -6.6773,  -6.6604,  ...,  -5.9658,  -5.7841,  -4.1949],
         [-14.7249, -15.2179, -15.0539,  ..., -13.5311, -11.3981, -14.5633],
         [-10.1212, -10.7288, -10.1155,  ...,  -9.2821,  -7.6942, -15.4952],
         ...,
         [-10.7082, -11.2609, -10.9939,  ...,  -8.4998,  -9.6510, -14.2810],
         [-12.2971, -12.0116, -12.5240,  ..., -10.8314, -11.2067,  -5.0205],
         [-12.7315, -13.5021, -13.1680,  ..., -13.2217, -10.6320, -12.8934]]])

In [54]:
required_precission = 1e-1
precesion_loss = np.abs(pred  - data['logits'])
boolean_mask = precesion_loss > required_precission

np.where(boolean_mask)[0] > 0
if(len(np.where(boolean_mask)[0]) > 0):
    print("Simplify ERROR!")
else:
    print("Simplify SUCCESS!!!!!!")

Simplify SUCCESS!!!!!!


tensor([[0, 0, 0,  ..., 0, 0, 0]])