In [2]:
import torch
from torch.nn import functional as F
import numpy as np
import os
from transformers import BertTokenizer, BertForMaskedLM
import transformers
# from tqdm import tqdm
import onnxruntime as ort
import time
import onnx
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

  from .autonotebook import tqdm as notebook_tqdm


### 1. 测试 Bert Model
1. 初始化tokenizer和Bert model，设置用于测试的text
2. 基于pytorch执行bert推理，输出概率最高的10个词
3. 保存输出信息，用来和之后转换过的模型进行对比


In [6]:
BERT_PATH = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
model = BertForMaskedLM.from_pretrained(BERT_PATH, return_dict = True)
text = "The capital of France, " + tokenizer.mask_token + ", contains the Eiffel Tower."

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
encoded_input = tokenizer.encode_plus(text, return_tensors = "pt")
mask_index = torch.where(encoded_input["input_ids"][0] == tokenizer.mask_token_id)
print("input ids: \n",encoded_input["input_ids"])

# warm up
for i in range(5):
    output = model(**encoded_input)
start_time = time.perf_counter()
for i in range(10):
    output = model(**encoded_input)
end_time = time.perf_counter()

print("output shape: ", output[0].shape)
logits = output.logits
softmax = F.softmax(logits, dim = -1)
mask_word = softmax[0, mask_index, :]
top_10 = torch.topk(mask_word, 10, dim = 1)[1][0]
print("model test topk10 output:")
for token in top_10:
    word = tokenizer.decode([token])
    new_sentence = text.replace(tokenizer.mask_token, word)
    print(new_sentence)
print('*' * 40)
print("pytorch with bin model running time:", (end_time-start_time)/10)

input ids: 
 tensor([[  101,  1996,  3007,  1997,  2605,  1010,   103,  1010,  3397,  1996,
          1041, 13355,  2884,  3578,  1012,   102]])
output shape:  torch.Size([1, 16, 30522])
model test topk10 output:
The capital of France, paris, contains the Eiffel Tower.
The capital of France, lyon, contains the Eiffel Tower.
The capital of France, lille, contains the Eiffel Tower.
The capital of France, toulouse, contains the Eiffel Tower.
The capital of France, marseille, contains the Eiffel Tower.
The capital of France, orleans, contains the Eiffel Tower.
The capital of France, strasbourg, contains the Eiffel Tower.
The capital of France, nice, contains the Eiffel Tower.
The capital of France, cannes, contains the Eiffel Tower.
The capital of France, versailles, contains the Eiffel Tower.
****************************************
pytorch with bin model running time: 0.02531918330005283


In [9]:
# save inputs and output
print("Saving inputs and output to case_data.npz ...")
position_ids = torch.arange(0, encoded_input['input_ids'].shape[1]).int().view(1, -1)
print("position id: ",position_ids)
input_ids=encoded_input['input_ids'].int().detach().numpy()
token_type_ids=encoded_input['token_type_ids'].int().detach().numpy()
print("input_id shape: ",input_ids.shape)
# save data
npz_file = BERT_PATH + '/case_data.npz'
np.savez(npz_file,
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            logits=output[0].detach().numpy())

data = np.load(npz_file)
print("saved input ids: \n", data['input_ids'])


Saving inputs and output to case_data.npz ...
position id:  tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]],
       dtype=torch.int32)
input_id shape:  (1, 16)
saved input ids: 
 [[  101  1996  3007  1997  2605  1010   103  1010  3397  1996  1041 13355
   2884  3578  1012   102]]


### 2. 将模型转换为ONNX格式
使用torch.onnx.export() 进行转换

In [49]:
# convert model to onnx
model.eval()
export_model_path = BERT_PATH + "/model.onnx"
opset_version = 12
symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
torch.onnx.export(  model,                                            
                    args=tuple(encoded_input.values()),               # model input (or a tuple for multiple inputs)
                    f=export_model_path,                              # where to save the model (can be a file or file-like object)
                    opset_version=opset_version,                      # the ONNX version to export the model to
                    do_constant_folding=False,                        # whether to execute constant folding for optimization
                    input_names=['input_ids',                         # the model's input names
                                'attention_mask',
                                'token_type_ids'],
                    output_names=['logits'],                          # the model's output names
                    dynamic_axes={'input_ids': symbolic_names,        # variable length axes
                                'attention_mask' : symbolic_names,
                                'token_type_ids' : symbolic_names,
                                'logits' : symbolic_names})
print("Model exported at ", export_model_path)


Model exported at  bert-base-uncased/model.onnx


### 3. 使用onnxruntime进行onnx推理
与pytorch和tensorrt的推理时间相对比

In [51]:
# 检查设备是否为GPU
print("onnxruntime version:", ort.__version__)
print("onnxruntime device:", ort.get_device())

onnxruntime version: 1.16.3
onnxruntime device: GPU


In [112]:
# 加载模型
session = ort.InferenceSession(export_model_path)
# 执行推理
# warmup
for i in range(5):
    outputs = session.run(['logits'], {'input_ids': encoded_input['input_ids'].numpy(),
                                    'attention_mask': encoded_input['attention_mask'].numpy(),
                                   'token_type_ids': encoded_input['token_type_ids'].numpy()})[0]
start_time = time.perf_counter()
for i in range(10):
    outputs = session.run(['logits'], {'input_ids': encoded_input['input_ids'].numpy(),
                                    'attention_mask': encoded_input['attention_mask'].numpy(),
                                   'token_type_ids': encoded_input['token_type_ids'].numpy()})[0]
end_time = time.perf_counter()

# 检查转换后的模型的精度损失情况
required_precission = 1e-4
precesion_loss = np.abs(outputs - data['logits'])
boolean_mask = precesion_loss > required_precission
if(len(np.where(boolean_mask)[0]) > 0):
    print("Convert ERROR!")
else:
    print("Convert SUCCESS!!!!!!")
print('*' * 40)
print("pytorch with bin model running time:", (end_time-start_time)/10)

Convert SUCCESS!!!!!!
****************************************
pytorch with bin model running time: 0.017354312099632806


### 简化ONNX model
使用onnxsim库，进行常量折叠。

In [None]:
!onnxsim bert-base-uncased/model.onnx bert-base-uncased/model-sim.onnx --overwrite-input-shape input_ids:1,12 token_type_ids:1,12 attention_mask:1,12 

In [40]:
def build_engine(model_file, max_ws=512*1024*1024, fp16=True):
    print("building engine")
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(TRT_LOGGER)

    config = builder.create_builder_config()
    if fp16:
        config.set_flag(trt.BuilderFlag.FP16)
    config.max_workspace_size = max_ws
    
    explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicit_batch)
    with trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(model_file, 'rb') as model:
            parsed = parser.parse(model.read())
            print("network.num_layers", network.num_layers)
            #last_layer = network.get_layer(network.num_layers - 1)
            #network.mark_output(last_layer.get_output(0))
            engine = builder.build_engine(network, config=config)
            return engine
            
engine = build_engine("bert-base-uncased/model-sim.onnx")
# save the paln model
BERT_PATH = 'bert-base-uncased'
plan_path = BERT_PATH +'/model.plan'
with open(plan_path, 'wb') as f:
    f.write(engine.serialize())

building engine


  config.max_workspace_size = max_ws


[02/01/2024-15:13:31] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
network.num_layers 1174


  engine = builder.build_engine(network, config=config)


[02/01/2024-15:14:01] [TRT] [W] TensorRT encountered issues when converting weights between types and that could affect accuracy.
[02/01/2024-15:14:01] [TRT] [W] If this is not the desired behavior, please modify the weights or retrain with regularization to adjust the magnitude of the weights.
[02/01/2024-15:14:01] [TRT] [W] Check verbose logs for the list of affected weights.
[02/01/2024-15:14:01] [TRT] [W] - 133 weights are affected by this issue: Detected subnormal FP16 values.
[02/01/2024-15:14:01] [TRT] [W] - 53 weights are affected by this issue: Detected values less than smallest positive FP16 subnormal value and converted them to the FP16 minimum subnormalized value.
[02/01/2024-15:14:01] [TRT] [W] - 1 weights are affected by this issue: Detected finite FP32 values which would overflow in FP16 and converted them to the closest finite FP16 value.


In [42]:
BERT_PATH = 'bert-base-uncased'
plan_path = BERT_PATH +'/model.plan'

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(TRT_LOGGER)
with open(plan_path, 'rb') as f:
    engine_bytes = f.read()
    engine = runtime.deserialize_cuda_engine(engine_bytes)

493568

In [43]:
bert_context = engine.create_execution_context()

In [51]:
input_ids=encoded_input['input_ids'].numpy().astype(np.int32)
attention_mask = encoded_input['attention_mask'].numpy().astype(np.int32)
token_type_ids = encoded_input['token_type_ids'].numpy().astype(np.int32)
bert_output = np.empty((1, 16, 30522), dtype = np.float32)

In [52]:
print(input_ids.dtype)
print(attention_mask.dtype)
print(token_type_ids.dtype)
print(bert_output.dtype)

int32
int32
int32
float32


In [53]:
batch_size = 1
d_input_ids = cuda.mem_alloc(batch_size * input_ids.nbytes)
d_token_type_ids = cuda.mem_alloc(batch_size * token_type_ids.nbytes)
d_attention_mask = cuda.mem_alloc(batch_size * attention_mask.nbytes)


In [54]:
input_ids.nbytes


64

In [17]:
d_output = cuda.mem_alloc(batch_size * bert_output.nbytes)

In [55]:
bert_output.nbytes

1953408

In [56]:
bindings = [int(d_input_ids), int(d_token_type_ids), int(d_attention_mask), int(d_output)]

In [57]:
stream = cuda.Stream()
# Transfer input data from python buffers to device(GPU)
cuda.memcpy_htod_async(d_input_ids, input_ids, stream)
cuda.memcpy_htod_async(d_token_type_ids, token_type_ids, stream)
cuda.memcpy_htod_async(d_attention_mask, attention_mask, stream)

In [58]:
bert_context.execute_async(batch_size, bindings, stream.handle, None)

[02/01/2024-15:23:40] [TRT] [W] The enqueue() method has been deprecated when used with engines built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. Please use enqueueV2() instead.
[02/01/2024-15:23:40] [TRT] [W] Also, the batchSize argument passed into this function has no effect on changing the input shapes. Please use setBindingDimensions() function to change input shapes instead.


  bert_context.execute_async(batch_size, bindings, stream.handle, None)


True

In [59]:
cuda.memcpy_dtoh_async(bert_output, d_output, stream)
stream.synchronize()

(1, 16, 30522)

In [60]:
pred = torch.tensor(bert_output)
pred_output_softmax = torch.nn.Softmax()(pred)
_, predicted = torch.max(pred_output_softmax, 1)

  return self._call_impl(*args, **kwargs)


In [63]:
bert_output

array([[[ -7.0117188,  -7.1171875,  -7.03125  , ...,  -6.5976562,
          -6.3359375,  -4.7929688],
        [-11.7734375, -12.1796875, -12.0078125, ..., -11.       ,
          -9.875    , -10.9296875],
        [ -9.109375 ,  -9.8984375,  -9.2734375, ...,  -9.34375  ,
          -7.7851562, -10.765625 ],
        ...,
        [  0.       ,   0.       ,   0.       , ...,   0.       ,
           0.       ,   0.       ],
        [  0.       ,   0.       ,   0.       , ...,   0.       ,
           0.       ,   0.       ],
        [  0.       ,   0.       ,   0.       , ...,   0.       ,
           0.       ,   0.       ]]], dtype=float32)

In [61]:
torch.where(predicted != 0)

(tensor([], dtype=torch.int64), tensor([], dtype=torch.int64))