In [1]:
import tensorrt as trt
from typing import Optional, List, Tuple
from cuda import cuda, cudart
import numpy as np
import time
import timeit
from tqdm import tqdm
import logging
import collections
import json

import transformers
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator, EvalPrediction
from transformers.trainer_pt_utils import nested_concat, nested_truncate
from accelerate import Accelerator
import evaluate
import torch
import os
# os.remove('./int8.cache')
import common
from torch.utils.data import DataLoader

from squad_dataset import get_squad_dataset, post_processing_function, postprocess_qa_predictions

accelerator = Accelerator()
logger = trt.Logger(trt.Logger.INFO) 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_checkpoint = "distilbert-base-uncased-distilled-squad"

onnx_model_path = "./onnx/distilbert-squad.onnx"
engine_name = "./engine/distilbert-int8.engine"
cacheFile = "./int8.cache"

min_batch_size = 1
norm_batch_size = 16
max_batch_size = 64

max_length = 384 # 输入数据的最大长度
doc_stride = 128 # 当切分时，重叠的长度

norm_shape = (norm_batch_size, max_length)

## 创建 Engine

In [3]:
eval_examples, eval_dataset = get_squad_dataset(model_checkpoint, for_model = False)
eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"])

data_collator = default_data_collator

eval_dataloader = DataLoader(
    eval_dataset_for_model, collate_fn=data_collator, batch_size=norm_batch_size
)

Using the latest cached version of the module from /root/.cache/huggingface/modules/datasets_modules/datasets/squad/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453 (last modified on Sun May 28 02:10:10 2023) since it couldn't be found locally at squad., or remotely on the Hugging Face Hub.
Found cached dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 465.00it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-07fafce74b00717e.arrow


In [4]:
class SquadCalibrator(trt.IInt8EntropyCalibrator2):
    def __init__(self, dataset, input_shape, cache_file, num_calib = 100):
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.num_calib = num_calib
        self.dataset = dataset[:num_calib]
        self.input_shape = input_shape
        self.batch_size = input_shape[0]
        self.cacheFile = cache_file
        
        self.buffer_size = trt.volume(self.input_shape) * trt.float32.itemsize
        _, self.device_input_ids = cudart.cudaMalloc(self.buffer_size)
        _, self.device_attention_mask = cudart.cudaMalloc(self.buffer_size)
        
        self.batches = self.batchGenerator()
    
    def __del__(self):
        cudart.cudaFree(self.device_input_ids)
        cudart.cudaFree(self.device_attention_mask)
        
    def batchGenerator(self):
        for i in range(0, self.num_calib, self.batch_size):
            input_ids_batch = self.dataset["input_ids"][i: i + self.batch_size]
            attention_mask_batch = self.dataset["attention_mask"][i: i + self.batch_size]
            input_ids = np.asarray(input_ids_batch, dtype=np.int32)
            attention_mask = np.asarray(attention_mask_batch, dtype=np.int32)
            yield input_ids.ravel(), attention_mask.ravel()
            
    def get_batch_size(self):  # necessary API
        return self.batch_size  
    
    def get_batch(self, names=[], inputNodeName=None): # necessary API
        try:
            input_ids, attention_mask= next(self.batches)
            cudart.cudaMemcpy(
                self.device_input_ids, 
                input_ids.ctypes.data, 
                self.buffer_size, 
                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
            )
            cudart.cudaMemcpy(
                self.device_attention_mask, 
                attention_mask.ctypes.data, 
                self.buffer_size, 
                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
            )
            return [int(self.device_input_ids), int(self.device_attention_mask)]
        except StopIteration:
            return None 
        
    # 其他方法参照MyCalibrator
    def read_calibration_cache(self):  # necessary API
        if os.path.exists(self.cacheFile):
            print("Succeed finding cahce file: %s" % (self.cacheFile))
            with open(self.cacheFile, "rb") as f:
                cache = f.read()
                return cache
        else:
            print("Failed finding int8 cache!")
            return

    def write_calibration_cache(self, cache):  # necessary API
        with open(self.cacheFile, "wb") as f:
            f.write(cache)
        print("Succeed saving int8 cache!")
        return

In [5]:
logger = trt.Logger(trt.Logger.INFO) 
builder = trt.Builder(logger)                                           # create Builder
config = builder.create_builder_config()                                # create BuidlerConfig to set meta data of the network

config.set_flag(trt.BuilderFlag.INT8)
config.int8_calibrator = SquadCalibrator(eval_dataset, norm_shape, cacheFile)

# 创建 Network 使用 Explicit Batch 模式，所有的维度都是显式的并且是动态的，意思是在执行的时候，每一维度的长度都可以变化
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

# 对 ONNX 进行模型解析
parser = trt.OnnxParser(network, logger)
with open(onnx_model_path, "rb") as model:
    if not parser.parse(model.read()):
        print("Failed parsing .onnx file!")
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        exit()
    print("Succeeded parsing .onnx file!")
    
network_inputs = [network.get_input(i) for i in range(network.num_inputs)]
input_names = [_input.name for _input in network_inputs]  # ex: ['input_ids', 'attention_mask']

config.max_workspace_size = 1 << 50

# 由于使用 dynamic shape 需要 profile 指定输入范围，并让 profile 优化不同 shape 对应不同的 kernel
profile = builder.create_optimization_profile()

# 设置优化配置文件中输入张量的形状，包括最小、最优和最大形状
for i in range(len(input_names)):
    profile.set_shape(input_names[i], (min_batch_size, max_length), (norm_batch_size, max_length), (max_batch_size, max_length))
    
# 将优化配置文件添加到TensorRT配置中
config.add_optimization_profile(profile) 

engine = builder.build_engine(network, config)

[06/01/2023-03:51:03] [TRT] [I] [MemUsageChange] Init CUDA: CPU +11, GPU +0, now: CPU 142, GPU 262 (MiB)
[06/01/2023-03:51:09] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1225, GPU +268, now: CPU 1443, GPU 530 (MiB)
[06/01/2023-03:51:09] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
Succeeded parsing .onnx file!
[06/01/2023-03:51:09] [TRT] [I] Graph optimization time: 0.0544468 seconds.
Failed finding int8 cache!
[06/01/2023-03:51:09] [TRT] [I] Timing cache disabled. Turning it on will improve builder speed.
[06/01/2023-03:51:09] [TRT] [W] Calibration Profile is not defined. Calibrating with Profile 0


  config.max_workspace_size = 1 << 50
  engine = builder.build_engine(network, config)


[06/01/2023-03:51:12] [TRT] [I] Detected 2 inputs and 2 output network tensors.
[06/01/2023-03:51:15] [TRT] [I] Total Host Persistent Memory: 368928
[06/01/2023-03:51:15] [TRT] [I] Total Device Persistent Memory: 0
[06/01/2023-03:51:15] [TRT] [I] Total Scratch Memory: 0
[06/01/2023-03:51:15] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 9 MiB, GPU 256 MiB
[06/01/2023-03:51:15] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 468 steps to complete.
[06/01/2023-03:51:15] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 48.7988ms to assign 28 blocks to 468 nodes requiring 356303872 bytes.
[06/01/2023-03:51:15] [TRT] [I] Total Activation Memory: 356303872
[06/01/2023-03:51:15] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +339, now: CPU 0, GPU 593 (MiB)
[06/01/2023-03:51:15] [TRT] [I] Starting Calibration.
[06/01/2023-03:51:15] [TRT] [I]   Calibrated batch 0 in 0.268789

In [6]:
with open(engine_name, "wb") as f:
    f.write(engine.serialize())

## TRT 推理

In [4]:
with open(engine_name, "rb") as f, trt.Runtime(logger) as runtime, runtime.deserialize_cuda_engine(
    f.read()
) as engine, engine.create_execution_context() as context:
    input_len = 0
    print("***** Engine IO *****")
    for idx in range(engine.num_bindings):
        name = engine.get_tensor_name (idx)
        is_input = engine.get_tensor_mode (name)
        if is_input == trt.TensorIOMode.INPUT:
            input_len += 1
        op_type = engine.get_tensor_dtype(name)
        shape = engine.get_tensor_shape(name)
        print('input id:',idx,'   is input: ', is_input,'  binding name:', name, '  shape:', shape, 'type: ', op_type)
    print("*****           *****")
        
    for i in range(input_len):
        context.set_binding_shape(i, (norm_batch_size, max_length))
    assert context.all_binding_shapes_specified
    
    inputs, outputs, bindings, stream = common.allocate_buffers(engine, context, 0)
    
    # Evaluation
    print("***** Running Evaluation *****")
    print(f"  Num examples = {len(eval_dataset)}")
    print(f"  Batch size = {norm_batch_size}")

    total_time = 0.0
    niter = 0
    start_time = timeit.default_timer()

    all_preds = None
        
    for step, batch in tqdm(enumerate(eval_dataloader)):
        input_ids = np.asarray(batch["input_ids"], dtype=np.int32)
        attention_mask = np.asarray(batch["attention_mask"], dtype=np.int32)

        inputs[0].host = input_ids.ravel()
        inputs[1].host = attention_mask.ravel()
        
        trt_outputs, infer_time = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

        start_logits, end_logits = trt_outputs
        start_logits = torch.tensor(start_logits).reshape(norm_batch_size, max_length)
        end_logits = torch.tensor(end_logits).reshape(norm_batch_size, max_length)
        
        total_time += infer_time
        niter += 1

        # necessary to pad predictions and labels for being gathered
        start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100)
        end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100)

        logits = (accelerator.gather(start_logits).cpu().numpy(), accelerator.gather(end_logits).cpu().numpy())
        all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)

    if all_preds is not None:
        all_preds = nested_truncate(all_preds, len(eval_dataset))
        
    evalTime = timeit.default_timer() - start_time
    print(f"Evaluation done in total {evalTime:.3f} secs ({evalTime / len(eval_dataset):.3f} sec per example)")
    # Inference time from TRT
    print("Average Inference Time = {:.3f} ms".format(total_time * 1000 / niter))
    print("Total Inference Time =  {:.3f} ms".format(total_time * 1000))
    print(f"Total Number of Inference =  {niter}")

[06/01/2023-03:55:21] [TRT] [I] Loaded engine size: 136 MiB
[06/01/2023-03:55:21] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +132, now: CPU 0, GPU 132 (MiB)
[06/01/2023-03:55:21] [TRT] [I] [MS] Running engine with multi stream info
[06/01/2023-03:55:21] [TRT] [I] [MS] Number of aux streams is 2
[06/01/2023-03:55:21] [TRT] [I] [MS] Number of total worker streams is 3
[06/01/2023-03:55:21] [TRT] [I] [MS] The main stream provided by execute/enqueue calls is the first worker stream
[06/01/2023-03:55:21] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +1440, now: CPU 0, GPU 1572 (MiB)
input id: 0    is input:  TensorIOMode.INPUT   binding name: input_ids   shape: (-1, 384) type:  DataType.INT32
input id: 1    is input:  TensorIOMode.INPUT   binding name: attention_mask   shape: (-1, 384) type:  DataType.INT32
input id: 2    is input:  TensorIOMode.OUTPUT   binding name: output_start_logits  

  context.set_binding_shape(i, (norm_batch_size, max_length))


***** Running Evaluation *****
  Num examples = 10784
  Batch size = 16


674it [00:16, 41.84it/s]

  Evaluation done in total 16.137154850177467 secs (0.0014963978904096317 sec per example)
Average Inference Time = 12.565 ms
Total Inference Time =  8468.917 ms
Total Number of Inference =  %d 674





## 验证

In [5]:
squad_v2 = False
metric = evaluate.load("squad_v2" if squad_v2 else "squad")

prediction = post_processing_function(eval_examples, eval_dataset, all_preds)

eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
print(f"Evaluation metrics: {eval_metric}")

100%|██████████████████████████████████████████████████████████████████████| 10570/10570 [00:26<00:00, 406.00it/s]


Evaluation metrics: {'exact_match': 67.43614001892148, 'f1': 78.40160484497551}
