In [46]:
# Locations of the fine-tuned BERT checkpoint directory and of the ONNX file
# used by the rest of the notebook. Raw strings keep the Windows separators
# readable without doubling every backslash.
bert_model_path = r'C:\Users\12390\Documents\projects\yolo\data\action_model\checkpoint-318'
onnx_model_path = r'C:\Users\12390\Documents\projects\ai_person\data\bert_action_0908.onnx'

### 将bert模型导出为onnx格式，可以减小模型体积，供边缘侧调用

In [47]:
import os
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
)

In [48]:
# Restore the sequence-classification model and its tokenizer from the same
# checkpoint directory. The two loads are independent of each other.
model = BertForSequenceClassification.from_pretrained(bert_model_path)
tokenizer = BertTokenizer.from_pretrained(bert_model_path)

In [49]:
# Build one example encoding for the ONNX export step: truncated/padded to
# 128 tokens, returned as PyTorch tensors and moved to the model's device.
sample_text = "这部电影太棒了！特效非常震撼！"
tokenize_options = dict(
    truncation=True,
    max_length=128,
    padding="max_length",
    return_tensors="pt",
)
inputs = tokenizer(sample_text, **tokenize_options).to(model.device)

In [50]:
# Export the classifier to ONNX.
#
# The example tensors are selected by key and passed in the same order as
# `input_names` (input_ids, attention_mask, token_type_ids), which is the
# positional order of the BERT forward() signature. Passing
# `tuple(inputs.values())` instead would depend on the tokenizer's key order;
# BERT tokenizers typically emit token_type_ids *before* attention_mask, so
# the last two example tensors would be fed to the wrong parameters.
example_args = (
    inputs['input_ids'],
    inputs['attention_mask'],
    inputs['token_type_ids'],
)

torch.onnx.export(
    model,
    example_args,
    onnx_model_path,
    export_params=True,        # embed the trained weights in the ONNX file
    opset_version=14,
    do_constant_folding=True,  # pre-compute constant sub-graphs at export time
    input_names=['input_ids', 'attention_mask', 'token_type_ids'],
    output_names=['logits'],
    # Declare batch and sequence dimensions as dynamic so the exported graph
    # is not tied to the shape of the example; the output's batch dimension
    # follows the inputs' batch dimension.
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'seq_length'},
        'attention_mask': {0: 'batch_size', 1: 'seq_length'},
        'token_type_ids': {0: 'batch_size', 1: 'seq_length'},
        'logits': {0: 'batch_size'},
    },
)

### 加载并使用onnx模型

In [52]:
import onnxruntime as ort
import numpy as np
from transformers import BertTokenizer
import time

In [19]:
# Action classes listed in class-id order: a label's position in this list is
# its integer id.
# NOTE(review): presumably this must match the label mapping used when the
# checkpoint was fine-tuned — verify against the training configuration.
action_labels = ['点头', '挥手', '其他', '思考', '摇头']

# Forward (name -> id) and reverse (id -> name) lookups.
label2id = {name: idx for idx, name in enumerate(action_labels)}
id2label = dict(enumerate(action_labels))

In [25]:
# Open the exported model for inference. The execution provider is pinned to
# CPU explicitly: onnxruntime builds that ship more than one provider may
# refuse to create a session (or choose one implicitly) when `providers` is
# omitted, and the timings in this notebook are meant to be CPU timings.
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=['CPUExecutionProvider'],
)

In [26]:
# Load the tokenizer from the checkpoint directory so the inference section
# can be run without the export cells.
tokenizer = BertTokenizer.from_pretrained(bert_model_path)

In [65]:
# Utterance to classify. The commented-out lines are alternative sample
# inputs; uncomment exactly one assignment to try a different sentence.
# query = "抱歉，都怪我没有说清楚"
# query = "你好啊，很高兴见到你"
# query = "再见，很高兴能帮到你"
# query = "好啊好啊，就这么说定了"
query = "我也很高兴再次见到你。"

In [67]:
# Time one end-to-end prediction: tokenization + ONNX inference +
# post-processing. perf_counter() is the clock intended for measuring short
# intervals (monotonic, highest available resolution).
st = time.perf_counter()

# Tokenize straight to NumPy arrays so that inference does not need PyTorch.
# NOTE(review): the exported graph declares seq_length as a dynamic axis, so
# padding to the longest sequence instead of max_length would probably reduce
# latency for short queries — measure before changing.
inputs = tokenizer(
    query,
    truncation=True,
    max_length=128,
    padding="max_length",
    return_tensors="np",
)

# Feed the three graph inputs by name. The cast to int64 is explicit because
# NumPy's default integer width is platform dependent, while the model was
# exported from PyTorch integer tensors (presumably int64 — TODO confirm with
# ort_session.get_inputs()).
inputs_onnx = {
    name: np.asarray(inputs[name], dtype=np.int64)
    for name in ('input_ids', 'attention_mask', 'token_type_ids')
}
outputs = ort_session.run(None, inputs_onnx)
logits = outputs[0]

# Numerically stable softmax: subtracting the per-row maximum leaves the
# result unchanged mathematically but keeps exp() from overflowing on large
# logits; the exponentials are computed only once.
shifted = logits - np.max(logits, axis=1, keepdims=True)
exp_logits = np.exp(shifted)
probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
predicted_class = np.argmax(probs, axis=1)

# Look the label up with a plain Python int rather than a NumPy scalar.
label = id2label[int(predicted_class[0])]

print(f"预测类别: {predicted_class[0]}, {label}")
print(f"各类别概率: {probs[0]}")
print(f"cost time: {time.perf_counter() - st}")

预测类别: 0, 点头
各类别概率: [0.71585363 0.26741728 0.00907105 0.00435702 0.00330099]
cost time: 0.4465160369873047


## 结论

1. 模型的确变小了，原始bert模型大概1.4G，转为onnx后，仅0.4G
2. 但在cpu环境下，基于onnx模型进行预测在时间上并没有特别大的优势，一次预测大概耗时0.5s