In [1]:
import ppq.lib as PFL
from ppq.api import ENABLE_CUDA_KERNEL, load_onnx_graph
from ppq.core import TargetPlatform
from ppq.executor import TorchExecutor
from ppq.quantization.optim import (LayerwiseEqualizationPass,
                                    LearnedStepSizePass, ParameterQuantizePass,
                                    RuntimeCalibrationPass)
from ppq.quantization.quantizer import TensorRTQuantizer
from ppq.core import (ChannelwiseTensorQuantizationConfig, OperationMeta,
                      OperationQuantizationConfig, QuantizationPolicy,
                      QuantizationProperty, QuantizationStates, RoundingPolicy,
                      TargetPlatform)
from ppq.IR import BaseGraph

import torch

import tensorrt as trt
from typing import Optional, List, Tuple
from cuda import cuda, cudart

import numpy as np

from squad_dataset import get_squad_dataset, post_processing_function, postprocess_qa_predictions
from transformers import default_data_collator, EvalPrediction
from transformers.trainer_pt_utils import nested_concat, nested_truncate
from torch.utils.data import DataLoader

from typing import Union
import common
import timeit
from tqdm import tqdm
from accelerate import Accelerator
import evaluate
from squad_dataset import get_squad_dataset, post_processing_function, postprocess_qa_predictions

# Shared Accelerator instance; the evaluation loop further down calls its
# pad_across_processes() and gather() methods on the start/end logits.
accelerator = Accelerator()


      ____  ____  __   ____                    __              __
     / __ \/ __ \/ /  / __ \__  ______ _____  / /_____  ____  / /
    / /_/ / /_/ / /  / / / / / / / __ `/ __ \/ __/ __ \/ __ \/ /
   / ____/ ____/ /__/ /_/ / /_/ / /_/ / / / / /_/ /_/ / /_/ / /
  /_/   /_/   /_____\___\_\__,_/\__,_/_/ /_/\__/\____/\____/_/




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier handed to get_squad_dataset() when the evaluation data is built.
model_checkpoint = "distilbert-base-uncased-distilled-squad"

# File locations: source ONNX model, PPQ-exported ONNX model,
# INT8 scale/config JSON written by the exporter, and the serialized TensorRT engine.
onnxFile = "./onnx/distilbert-squad.onnx"
ppq_onnxFile = "./onnx/distilbert-squad_int8(PPQ).onnx"
int8_scale_file = "./distilbert-squad_int8(PPQ).json"
engine_file = "./engine/distilbert-squad_int8(PPQ).engine"

# Batch sizes. Only norm_batch_size is consumed by the cells visible in this
# notebook; min/max are presumably meant for a dynamic-shape profile (TODO confirm).
min_batch_size, norm_batch_size, max_batch_size = 1, 16, 64

max_length = 384  # maximum length of the input data
doc_stride = 128  # length of the overlap when a long input is split

# (batch, sequence) shape used for tracing, calibration and inference.
norm_shape = norm_batch_size, max_length

In [3]:
# Fetch the raw evaluation examples together with the processed evaluation dataset.
# Both are needed again by post_processing_function() once inference has finished.
eval_examples, eval_dataset = get_squad_dataset(model_checkpoint, for_model=False)

# These two columns are stripped from the copy that is batched for the model;
# the full `eval_dataset` (columns intact) is kept for post-processing.
_columns_not_fed_to_model = ["example_id", "offset_mapping"]
eval_dataset_for_model = eval_dataset.remove_columns(_columns_not_fed_to_model)

data_collator = default_data_collator

# Batches of `norm_batch_size` items, collated with the default transformers collator.
eval_dataloader = DataLoader(
    eval_dataset_for_model,
    batch_size=norm_batch_size,
    collate_fn=data_collator,
)

Using the latest cached version of the module from /root/.cache/huggingface/modules/datasets_modules/datasets/squad/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453 (last modified on Sun May 28 02:10:10 2023) since it couldn't be found locally at squad., or remotely on the Hugging Face Hub.
Found cached dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 427.71it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-07fafce74b00717e.arrow


In [None]:
# Load the FP32 ONNX model into a PPQ graph and attach a TensorRT INT8 quantizer to it.
graph = load_onnx_graph(onnx_import_file=onnxFile)
quantizer = PFL.Quantizer(platform=TargetPlatform.TRT_INT8, graph=graph)

# Only operations whose type is listed here are handed to the quantizer;
# all other operations are skipped by the loop below.
_int8_op_types = {
    'Conv', 'ConvTranspose', 'MatMul', 'Gemm',
    'PPQBiasFusedMatMul', 'LayerNormalization',
}
for name, op in graph.operations.items():
    if op.type not in _int8_op_types:
        continue
    quantizer.quantize_operation(name, platform=TargetPlatform.TRT_INT8)

# Optimization passes, executed in list order when pipeline.optimize() is called.
pipeline = PFL.Pipeline([ParameterQuantizePass(), RuntimeCalibrationPass()])

# Build the executor and trace operation metadata with two all-zero int32
# tensors of shape `norm_shape` placed on the GPU (one per model input).
executor = TorchExecutor(graph=graph)
executor.tracing_operation_meta(
    [torch.zeros(norm_shape, dtype=torch.int32).cuda() for _ in range(2)]
)
executor.load_graph(graph=graph)

def collate_fn(batch):
    """Return a new dict holding every entry of ``batch`` moved to CUDA.

    Args:
        batch: Mapping from a key to an object exposing ``.to(device)``
            (the calibration loop passes the dataloader batches here).

    Returns:
        dict: Same keys, in the same order, each value being the result of
        ``value.to('cuda')``. The input mapping itself is not modified.
    """
    return {key: batch[key].to('cuda') for key in batch}

# Run the passes registered on `pipeline` against the graph. Batches come from
# the evaluation dataloader and go through collate_fn (moved to CUDA) first;
# calib_steps=16 is the calibration step count handed to PPQ.
pipeline.optimize(
    graph=graph,
    executor=executor,
    dataloader=eval_dataloader,
    collate_fn=collate_fn,
    calib_steps=16,
    verbose=True,
)

[04:11:15] PPQ Parameter Quantization Pass Running ... Finished.
[04:11:15] PPQ Runtime Calibration Pass Running ...    

Calibration Progress(Phase 1):  56%|██████████████████████████▍                    | 9/16 [01:44<01:22, 11.78s/it]

In [None]:
# Export for the TensorRT INT8 platform: the graph is written to `ppq_onnxFile`
# and the accompanying config to `int8_scale_file` (consumed by the engine build).
exporter = PFL.Exporter(platform=TargetPlatform.TRT_INT8)
exporter.export(
    graph=graph,
    file_path=ppq_onnxFile,
    config_path=int8_scale_file,
)

In [None]:
import os

from ppq.utils.TensorRTUtil import build_engine

# Nothing else in this notebook creates the engine output directory, so make
# sure it exists up front instead of failing on a missing folder only after
# the (slow) engine build has finished.
os.makedirs(os.path.dirname(os.path.abspath(engine_file)), exist_ok=True)

# Build an INT8 TensorRT engine from the PPQ-exported ONNX model and its
# scale file, and write it to `engine_file`.
build_engine(
    onnx_file=ppq_onnxFile,
    int8_scale_file=int8_scale_file,
    engine_file=engine_file,
    int8=True,
)

In [None]:
# TensorRT logger created with INFO severity; it is handed to trt.Runtime()
# when the serialized engine is loaded for evaluation.
logger = trt.Logger(trt.Logger.INFO) 

In [None]:
# Deserialize the engine, run it over the whole evaluation dataloader and
# collect the start/end logits in `all_preds` for the metric cell.
with open(engine_file, "rb") as f, trt.Runtime(logger) as runtime, runtime.deserialize_cuda_engine(
    f.read()
) as engine, engine.create_execution_context() as context:
    # ---- Describe the engine I/O and remember which tensors are inputs ----
    # num_io_tensors is the counterpart of the name-based get_tensor_* calls;
    # num_bindings counts bindings across every optimization profile and can
    # therefore exceed the range accepted by get_tensor_name().
    input_names = []
    print("***** Engine IO *****")
    for idx in range(engine.num_io_tensors):
        name = engine.get_tensor_name(idx)
        io_mode = engine.get_tensor_mode(name)
        if io_mode == trt.TensorIOMode.INPUT:
            input_names.append(name)
        op_type = engine.get_tensor_dtype(name)
        shape = engine.get_tensor_shape(name)
        print('input id:',idx,'   is input: ', io_mode,'  binding name:', name, '  shape:', shape, 'type: ', op_type)
    print("*****           *****")
    input_len = len(input_names)

    # Fix every input to the (batch, sequence) shape used for evaluation.
    # Addressing inputs by name avoids assuming they occupy the first slots.
    for input_name in input_names:
        context.set_input_shape(input_name, (norm_batch_size, max_length))
    assert context.all_binding_shapes_specified

    inputs, outputs, bindings, stream = common.allocate_buffers(engine, context, 0)

    # ---- Evaluation ----
    print("***** Running Evaluation *****")
    print(f"  Num examples = {len(eval_dataset)}")
    print(f"  Batch size = {norm_batch_size}")

    total_time = 0.0  # accumulated inference time reported by do_inference_v2
    niter = 0         # number of engine invocations
    start_time = timeit.default_timer()

    all_preds = None

    # Wrapping the dataloader directly (not enumerate(...)) lets tqdm read its
    # length and show a real progress bar.
    for batch in tqdm(eval_dataloader):
        input_ids = np.asarray(batch["input_ids"], dtype=np.int32)
        attention_mask = np.asarray(batch["attention_mask"], dtype=np.int32)

        # The engine runs at a fixed batch of `norm_batch_size`. When the final
        # batch is shorter, fill it up by repeating its last row so the host
        # buffers always match the binding size; the extra rows are dropped
        # from the outputs below.
        batch_rows = input_ids.shape[0]
        missing_rows = norm_batch_size - batch_rows
        if missing_rows > 0:
            pad_spec = ((0, missing_rows), (0, 0))
            input_ids = np.pad(input_ids, pad_spec, mode="edge")
            attention_mask = np.pad(attention_mask, pad_spec, mode="edge")

        # NOTE(review): assumes buffer 0 is input_ids and buffer 1 is
        # attention_mask, as in the original cell; confirm against the
        # "Engine IO" listing printed above.
        inputs[0].host = input_ids.ravel()
        inputs[1].host = attention_mask.ravel()

        trt_outputs, infer_time = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

        start_logits, end_logits = trt_outputs
        # Keep only the rows that belong to real examples of this batch.
        start_logits = torch.tensor(start_logits).reshape(norm_batch_size, max_length)[:batch_rows]
        end_logits = torch.tensor(end_logits).reshape(norm_batch_size, max_length)[:batch_rows]

        total_time += infer_time
        niter += 1

        # necessary to pad predictions and labels for being gathered
        start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100)
        end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100)

        logits = (accelerator.gather(start_logits).cpu().numpy(), accelerator.gather(end_logits).cpu().numpy())
        all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)

    if all_preds is not None:
        all_preds = nested_truncate(all_preds, len(eval_dataset))

    evalTime = timeit.default_timer() - start_time
    # max(..., 1) keeps the summary printable when the dataset/loader is empty.
    print(f"Evaluation done in total {evalTime:.3f} secs ({evalTime / max(len(eval_dataset), 1):.3f} sec per example)")
    # Inference time from TRT
    print("Average Inference Time = {:.3f} ms".format(total_time * 1000 / max(niter, 1)))
    print("Total Inference Time =  {:.3f} ms".format(total_time * 1000))
    print(f"Total Number of Inference =  {niter}")

In [None]:
# Choose which metric script to load: "squad_v2" when the flag is set, else "squad".
squad_v2 = False
metric_name = "squad_v2" if squad_v2 else "squad"
metric = evaluate.load(metric_name)

# Convert the collected logits into predictions/references via the project's
# post-processing helper.
prediction = post_processing_function(eval_examples, eval_dataset, all_preds)

eval_metric = metric.compute(
    references=prediction.label_ids,
    predictions=prediction.predictions,
)
print(f"Evaluation metrics: {eval_metric}")