In [12]:
import paddle2onnx
import os
# Common path prefix of the exported Paddle files; the ".pdmodel" and
# ".pdiparams" suffixes are appended to it below.
model_path_prefix = "./model/export/float32"

# The ONNX file is written next to the Paddle files, in the prefix's directory.
infer_model_dir = os.path.dirname(model_path_prefix)
float_onnx_file = os.path.join(infer_model_dir, "model.onnx")

# Convert the Paddle model to ONNX (opset 13, with the ONNX checker enabled).
onnx_model = paddle2onnx.export(
    model_file=f"{model_path_prefix}.pdmodel",
    params_file=f"{model_path_prefix}.pdiparams",
    opset_version=13,
    enable_onnx_checker=True,
)

# Persist the object returned by the exporter verbatim, in binary mode.
with open(float_onnx_file, "wb") as f:
    f.write(onnx_model)

[Paddle2ONNX] Start to parse PaddlePaddle model...
[Paddle2ONNX] Model file path: ./model/export/float32.pdmodel
[Paddle2ONNX] Paramters file path: ./model/export/float32.pdiparams
[Paddle2ONNX] Start to parsing Paddle model...
[Paddle2ONNX] Use opset_version = 13 for ONNX export.
[Paddle2ONNX] PaddlePaddle model is exported as ONNX format now.


In [13]:
import onnxruntime as ort
import psutil
# psutil.cpu_count(logical=False) returns None when the number of physical
# cores cannot be determined. None is not a valid thread count for the
# session options, so fall back to the logical core count and finally to 1.
num_threads = (
    psutil.cpu_count(logical=False)
    or psutil.cpu_count(logical=True)
    or 1
)

# Limit the threads ONNX Runtime uses inside a single operator.
sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = num_threads

# Build a CPU-only inference session from the `onnx_model` object produced by
# the export cell.
predictor = ort.InferenceSession(
    onnx_model, sess_options=sess_options, providers=["CPUExecutionProvider"]
)

In [17]:
from paddlenlp.transformers import AutoTokenizer
# Load the tokenizer stored alongside the fine-tuned checkpoint, requesting
# the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "./model/checkpoint/",
    use_fast=True,
)

[32m[2023-05-17 09:51:02,332] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load './model/checkpoint/'.[0m


In [80]:
import numpy as np
# Single example text to classify.
sentence = "【涉事主体】：工地【主体地址】：荔湾区鹤洞路观鹤小区旁工地【诉求内容】：现希望有关部门尽快跟进处理上述夜间施工问题【事项标签组内容】：发生时间：2020年8月2日22:30开始影响情况：影响周边居民生活【补充信息】：反映2020年8月2日22:30开始，位于荔湾区鹤洞路观鹤小区旁工地。该工地投入施工，产生大量噪音，严重影响附近居民正常生活。【市民回复方式】：电话"
max_seq_length = 300
input_data = [sentence]

# Tokenize the one-element batch, truncating to `max_seq_length`; position ids
# and the attention mask are not requested.
data = tokenizer(
    input_data,
    truncation=True,
    padding=True,
    max_length=max_seq_length,
    return_attention_mask=False,
    return_position_ids=False,
)

# Convert every tokenizer field to an int64 NumPy array.
tokenized_data = {
    field_name: np.array(data[field_name], dtype="int64") for field_name in data
}
preprocess_result = tokenized_data

In [47]:
# Display the raw tokenizer output.
# NOTE(review): this cell's execution count (47) is lower than that of the
# tokenization cell (80), so the rendered output may come from an earlier
# input -- re-run to confirm.
data

{'input_ids': [[1, 47, 10, 7, 27, 558, 525, 29, 5, 405, 1056, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [81]:
# Keep only the first example of every tokenizer field, wrapped in a
# one-element list; this dict is the input feed for the ONNX session.
preprocess_result_batch = {
    field_name: [field_values[0]]
    for field_name, field_values in preprocess_result.items()
}

In [82]:
# Display the single-example feed dict that is passed to the ONNX session.
preprocess_result_batch

{'input_ids': [array([    1, 12078,  1157,   104,    57,    82, 12076,    74,    35,
            31, 12078,    57,    82,    31,  1392, 12076,    74,  3367,
          1108,   121,  2244,  1084,   216,   394,  2244,    96,   121,
          1607,    35,    31, 12078,  1005,   323,   103,   390, 12076,
            74,    87,   905,   668,     9,   129,    64,   232,   720,
           532,  1057,    71,   239,    38,    28,   779,   914,   143,
           322,    35,   358,   281, 12078,   104,   236,   275,  1188,
           186,   103,   390, 12076,    74,    34,    21,    36,   143,
            74,  3615,    17,   585,   136,   249,   139,  1592, 12049,
           853,    88,   440,   347,   639,   182,   617,    74,   347,
           639,   544,   554,   529,   119,    21,   205, 12078,   807,
           684,   212,   399, 12076,    74,   451,  1204,  3615,    17,
           585,   136,   249,   139,  1592, 12049,   853,    88,   440,
             4,   144,    37,  3367,  1108,   121, 

In [93]:
# Run the ONNX session on the feed dict; output_names=None requests every
# model output, and only the first one is kept.
result = predictor.run(
    output_names=None,
    input_feed=preprocess_result_batch,
)[0]

In [84]:
# Display the model output flattened to one dimension.
# NOTE(review): this cell's execution count (84) is lower than that of the
# inference cell (93), so the rendered output may predate the latest run.
result.reshape([-1])

array([  7.799016,  -6.846868,  -9.159973, ..., -18.293648, -18.144815,
       -19.214441], dtype=float32)

In [94]:
# Display the first output returned by the ONNX session (unflattened).
result

array([[  7.799016,  -6.846868,  -9.159973, ..., -18.293648, -18.144815,
        -19.214441]], dtype=float32)

In [95]:
def sigmoid_(x):
    """
    Compute the logistic sigmoid 1 / (1 + exp(-x)) element-wise.

    The value is evaluated through exp(-|x|), which never exceeds 1, so
    large-magnitude negative inputs do not overflow inside np.exp.

    Args:
        x: A real scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x``: a NumPy scalar for scalar input, otherwise an
        array of the same shape. Floating-point inputs keep their dtype;
        any other input is computed in float64.
    """
    values = np.asarray(x)
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)
    decay = np.exp(-np.abs(values))  # always within [0, 1]
    # For x >= 0: 1 / (1 + exp(-x)); for x < 0 the equivalent exp(x) / (1 + exp(x)).
    out = np.where(values >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return out[()]
# Probability cut-off: a label is selected when its sigmoid score exceeds it.
threshold = 0.5
# sigmoid_ is built from NumPy operations and already works element-wise, so
# the np.vectorize wrapper (essentially a Python-level loop) is unnecessary;
# `sigmoid` is kept as a plain alias for it.
sigmoid = sigmoid_
# Cast to float64 so the probabilities keep the dtype the vectorized version
# produced, then flatten to one score per label.
prob = sigmoid(np.asarray(result, dtype=np.float64)).reshape([-1])


In [96]:
# Display the flattened sigmoid probabilities.
prob

array([9.99590030e-01, 1.06165077e-03, 1.05154664e-04, ...,
       1.13545445e-08, 1.31767034e-08, 4.52141290e-09])

In [97]:
# Path of the label file.
label_dir = os.path.join("data", "label.txt")

# Read one label per line, stripping surrounding whitespace and keeping the
# file's line order. The `with` block closes the file on exit.
with open(label_dir, "r", encoding="utf-8") as f:
    label_list = [line.strip() for line in f]

# Accumulator for the labels selected by the thresholding step.
label = []

In [98]:
# Select the labels whose probability exceeds `threshold`. The list is rebuilt
# from scratch so that re-running this cell does not append duplicates.
label = [label_list[i] for i, p in enumerate(prob) if p > threshold]

In [99]:
# Display the labels whose probability exceeded `threshold`.
label

['荔湾区政府', '荔湾区政府##白鹤洞街道', '荔湾区政府##白鹤洞街道##街道综合行政执法办一分队']