### 01. 测试 Bert Model
1. 初始化tokenizer和Bert model，设置用于测试的text
2. 基于pytorch执行bert推理，输出概率最高的10个词
3. 保存输出信息，用来和之后转换过的模型进行对比


In [1]:
import torch
from torch.nn import functional as F
from transformers import BertTokenizer, BertForMaskedLM
import time
import numpy as np
import onnxruntime as ort

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local directory that holds the pretrained bert-base-uncased checkpoint
BERT_PATH = '../bert-base-uncased'

# The masked-LM model and its tokenizer are loaded from the same checkpoint.
# return_dict=True lets later cells read the result through named fields
# (e.g. `output.logits`).
model = BertForMaskedLM.from_pretrained(BERT_PATH, return_dict=True)
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

# Test sentence with a single masked token for the model to fill in
text = f"The capital of France, {tokenizer.mask_token}, contains the Eiffel Tower."

Some weights of the model checkpoint at ../bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Tokenize the test sentence and locate the masked position(s).
# torch.where returns a tuple with one index tensor per dimension.
encoded_input = tokenizer.encode_plus(text, return_tensors = "pt")
mask_index = torch.where(encoded_input["input_ids"][0] == tokenizer.mask_token_id)
print("input ids: \n",encoded_input["input_ids"])

warmup_runs = 5
timed_runs = 10

# Inference only: disable autograd so the benchmark does not include the cost
# of recording a backward graph and the outputs carry no gradient history.
with torch.no_grad():
    # warm up
    for i in range(warmup_runs):
        output = model(**encoded_input)
    # timed runs, used to compute the average inference time
    start_time = time.perf_counter()
    for i in range(timed_runs):
        output = model(**encoded_input)
    end_time = time.perf_counter()

# Average latency per forward pass, converted from seconds to milliseconds
avg_time_ms = (end_time - start_time) / timed_runs * 1000

print("output shape: ", output[0].shape)
logits = output.logits
softmax = F.softmax(logits, dim = -1)
# Index with the position tensor itself (not the enclosing tuple) to select
# the probability rows of the masked token(s): shape (num_masks, vocab_size)
mask_word = softmax[0, mask_index[0], :]
# Vocabulary ids of the 10 most probable tokens for the first masked position
top_10 = torch.topk(mask_word, 10, dim = 1)[1][0]
print("model test topk10 output:")
for token_id in top_10.tolist():
    word = tokenizer.decode([token_id])
    new_sentence = text.replace(tokenizer.mask_token, word)
    print(new_sentence)
print('*' * 40)
print("pytorch with bin model running time:", avg_time_ms, "ms")

input ids: 
 tensor([[  101,  1996,  3007,  1997,  2605,  1010,   103,  1010,  3397,  1996,
          1041, 13355,  2884,  3578,  1012,   102]])
output shape:  torch.Size([1, 16, 30522])
model test topk10 output:
The capital of France, paris, contains the Eiffel Tower.
The capital of France, lyon, contains the Eiffel Tower.
The capital of France, lille, contains the Eiffel Tower.
The capital of France, toulouse, contains the Eiffel Tower.
The capital of France, marseille, contains the Eiffel Tower.
The capital of France, orleans, contains the Eiffel Tower.
The capital of France, strasbourg, contains the Eiffel Tower.
The capital of France, nice, contains the Eiffel Tower.
The capital of France, cannes, contains the Eiffel Tower.
The capital of France, versailles, contains the Eiffel Tower.
****************************************
pytorch with bin model running time: 0.02549151040002471


In [8]:
# Persist the model inputs and the PyTorch logits as a reference case, so that
# converted models can later be compared against the same data.
print("Saving inputs and output to case_data.npz ...")

# Position ids 0..seq_len-1 as an int32 row vector of shape (1, seq_len)
seq_len = encoded_input['input_ids'].shape[1]
position_ids = torch.arange(seq_len, dtype=torch.int32).unsqueeze(0)
print("position id: ",position_ids)

# Integer inputs are stored as int32 numpy arrays
input_ids = encoded_input['input_ids'].detach().to(torch.int32).numpy()
token_type_ids = encoded_input['token_type_ids'].detach().to(torch.int32).numpy()
print("input_id shape: ",input_ids.shape)

# Write every array into a single .npz archive next to the checkpoint
npz_file = f"{BERT_PATH}/case_data.npz"
reference_case = {
    'input_ids': input_ids,
    'token_type_ids': token_type_ids,
    'position_ids': position_ids.numpy(),
    'logits': output[0].detach().numpy(),
}
np.savez(npz_file, **reference_case)

# Read the archive back to confirm the round trip; `data` is reused by the
# precision check in the ONNX Runtime cell
data = np.load(npz_file)
print("saved input ids: \n", data['input_ids'])

Saving inputs and output to case_data.npz ...
position id:  tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]],
       dtype=torch.int32)
input_id shape:  (1, 16)
saved input ids: 
 [[  101  1996  3007  1997  2605  1010   103  1010  3397  1996  1041 13355
   2884  3578  1012   102]]


In [6]:
# Show the shape of the attention mask produced by the tokenizer
encoded_input["attention_mask"].size()

torch.Size([1, 16])

### 02. 将模型转换为ONNX格式
使用torch.onnx.export() 进行转换

In [12]:
# Convert the PyTorch model to ONNX
model.eval()
export_model_path = BERT_PATH + "/model.onnx"
opset_version = 16
# Axes 0 and 1 are exported as dynamic so batch size and sequence length may vary
symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}

# Graph input names, in the positional order they are handed to the model.
input_names = ['input_ids', 'attention_mask', 'token_type_ids']
# Build the example inputs explicitly in that same order. Passing
# tuple(encoded_input.values()) would rely on the tokenizer's dict ordering,
# which does not have to match the declared names and can feed the attention
# mask and the token type ids into each other's slot during tracing.
export_args = tuple(encoded_input[name] for name in input_names)

torch.onnx.export(  model,
                    args=export_args,                                 # example inputs, ordered like input_names
                    f=export_model_path,                              # where to save the model (can be a file or file-like object)
                    opset_version=opset_version,                      # the ONNX version to export the model to
                    do_constant_folding=False,                        # whether to execute constant folding for optimization
                    input_names=input_names,                          # the model's input names
                    output_names=['logits'],                          # the model's output names
                    dynamic_axes={'input_ids': symbolic_names,        # variable length axes
                                'attention_mask' : symbolic_names,
                                'token_type_ids' : symbolic_names,
                                'logits' : symbolic_names})
print("Model exported at ", export_model_path)


Model exported at  bert-base-uncased/model.onnx


### 03. 使用onnxruntime进行onnx推理
与pytorch和tensorrt的推理时间相对比

In [10]:
# Check whether ONNX Runtime reports a GPU device, and print its version
ort_info = (
    ("version", ort.__version__),
    ("device", ort.get_device()),
)
for label, value in ort_info:
    print(f"onnxruntime {label}:", value)

onnxruntime version: 1.16.3
onnxruntime device: GPU


In [13]:
# Load the exported ONNX model
session = ort.InferenceSession(export_model_path)

# Build the input feed once and reuse it for every run, so the timed loop
# measures inference only.
onnx_inputs = {name: encoded_input[name].numpy()
               for name in ('input_ids', 'attention_mask', 'token_type_ids')}

warmup_runs = 5
timed_runs = 10

# Run inference
# warm up
for i in range(warmup_runs):
    outputs = session.run(['logits'], onnx_inputs)[0]
start_time = time.perf_counter()
for i in range(timed_runs):
    outputs = session.run(['logits'], onnx_inputs)[0]
end_time = time.perf_counter()

# Average latency per run, converted from seconds to milliseconds
avg_time_ms = (end_time - start_time) / timed_runs * 1000

# Check the precision loss of the converted model: every logit must stay
# within the tolerance of the PyTorch logits saved in case_data.npz
required_precission = 1e-4
precesion_loss = np.abs(outputs - data['logits'])
boolean_mask = precesion_loss > required_precission
if boolean_mask.any():
    print("Convert ERROR!")
else:
    print("Convert SUCCESS!!!!!!")
print('*' * 40)
# This cell times ONNX Runtime, so the label must not say "pytorch"
print("onnxruntime with onnx model running time:", avg_time_ms, "ms")

Convert SUCCESS!!!!!!
****************************************
pytorch with bin model running time: 0.018559776899928694
