In [1]:
import tensorrt as trt
from typing import Optional, List, Tuple
from cuda import cuda, cudart
import numpy as np
import time
import timeit
from tqdm import tqdm
import logging
import collections
import json

In [2]:
import transformers
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator, EvalPrediction
from transformers.trainer_pt_utils import nested_concat, nested_truncate
from accelerate import Accelerator
import evaluate
import torch
import os
import common
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
onnx_model_path = "./onnx/distilbert-squad.onnx"
engine_name = "./engine/distilbert-fp32.engine"

min_batch_size = 1
norm_batch_size = 16
max_batch_size = 64

max_length = 384 # 输入数据的最大长度
doc_stride = 128 # 当切分时，重叠的长度

In [10]:
from accelerate import Accelerator
import datasets
accelerator = Accelerator()

if accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

## 创建 Engine

In [11]:
logger = trt.Logger(trt.Logger.ERROR) 
builder = trt.Builder(logger)                                           # create Builder
config = builder.create_builder_config()                                # create BuidlerConfig to set meta data of the network

# 创建 Network 使用 Explicit Batch 模式，所有的维度都是显式的并且是动态的，意思是在执行的时候，每一维度的长度都可以变化
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

# 对 ONNX 进行模型解析
parser = trt.OnnxParser(network, logger)
with open(onnx_model_path, "rb") as model:
    if not parser.parse(model.read()):
        print("Failed parsing .onnx file!")
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        exit()
    print("Succeeded parsing .onnx file!")
    
config.max_workspace_size = 1 << 50

# 由于使用 dynamic shape 需要 profile 指定输入范围，并让 profile 优化不同 shape 对应不同的 kernel
profile = builder.create_optimization_profile()

network_inputs = [network.get_input(i) for i in range(network.num_inputs)]
input_names = [_input.name for _input in network_inputs]  # ex: ['input_ids', 'attention_mask']

# 设置优化配置文件中输入张量的形状，包括最小、最优和最大形状
for i in range(len(input_names)):
    profile.set_shape(input_names[i], (min_batch_size, max_length), (norm_batch_size, max_length), (max_batch_size, max_length))
    
# 将优化配置文件添加到TensorRT配置中
config.add_optimization_profile(profile) 

engine = builder.build_engine(network, config)

with open(engine_name, "wb") as f:
    f.write(engine.serialize())

Succeeded parsing .onnx file!


  config.max_workspace_size = 1 << 50
  engine = builder.build_engine(network, config)


In [12]:
for idx in range(engine.num_bindings):
    name = engine.get_tensor_name (idx)
    is_input = engine.get_tensor_mode (name)
    op_type = engine.get_tensor_dtype(name)
    shape = engine.get_tensor_shape(name)

    print('input id:',idx,'   is input: ', is_input,'  binding name:', name, '  shape:', shape, 'type: ', op_type)

input id: 0    is input:  TensorIOMode.INPUT   binding name: input_ids   shape: (-1, 384) type:  DataType.INT32
input id: 1    is input:  TensorIOMode.INPUT   binding name: attention_mask   shape: (-1, 384) type:  DataType.INT32
input id: 2    is input:  TensorIOMode.OUTPUT   binding name: output_start_logits   shape: (-1, 384) type:  DataType.FLOAT
input id: 3    is input:  TensorIOMode.OUTPUT   binding name: output_end_logits   shape: (-1, 384) type:  DataType.FLOAT


## Eval_dataset 处理

In [13]:
model_checkpoint = "distilbert-base-uncased-distilled-squad"

squad_v2 = False
datasets = load_dataset("squad_v2" if squad_v2 else "squad")

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
pad_on_right = tokenizer.padding_side == "right" # 考虑到可能 “context” 出现在左边的情况，一般在右边

Using the latest cached version of the module from /root/.cache/huggingface/modules/datasets_modules/datasets/squad/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453 (last modified on Sun May 28 02:10:10 2023) since it couldn't be found locally at squad., or remotely on the Hugging Face Hub.
Found cached dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 337.24it/s]
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased-distilled-squad/snapshots/bb133e834d7dab8aa8eb3f04e0435db7a3a1ddc8/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-distilled-squad",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout

In [14]:
def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [15]:
eval_examples = datasets["validation"]

eval_dataset = datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-ef22020a0f067642.arrow


In [16]:
data_collator = default_data_collator

eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"])

eval_dataloader = DataLoader(
    eval_dataset_for_model, collate_fn=data_collator, batch_size=norm_batch_size
)

## TRT 推理

In [17]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

In [23]:
with open(engine_name, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(
    f.read()
) as engine, engine.create_execution_context() as context:
    for i in range(len(input_names)):
        context.set_binding_shape(i, (norm_batch_size, max_length))
    assert context.all_binding_shapes_specified
    
    inputs, outputs, bindings, stream = common.allocate_buffers(engine, context, 0)
    
    # Evaluation
    print("***** Running Evaluation *****")
    print(f"  Num examples = {len(eval_dataset)}")
    print(f"  Batch size = {norm_batch_size}")

    total_time = 0.0
    niter = 0
    start_time = timeit.default_timer()

    all_preds = None
        
    for step, batch in tqdm(enumerate(eval_dataloader)):
        input_ids = np.asarray(batch["input_ids"], dtype=np.int32)
        attention_mask = np.asarray(batch["attention_mask"], dtype=np.int32)

        inputs[0].host = input_ids.ravel()
        inputs[1].host = attention_mask.ravel()
        
        trt_outputs, infer_time = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

        start_logits, end_logits = trt_outputs
        start_logits = torch.tensor(start_logits).reshape(norm_batch_size, max_length)
        end_logits = torch.tensor(end_logits).reshape(norm_batch_size, max_length)
        
        total_time += infer_time
        niter += 1

        # necessary to pad predictions and labels for being gathered
        start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100)
        end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100)

        logits = (accelerator.gather(start_logits).cpu().numpy(), accelerator.gather(end_logits).cpu().numpy())
        all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)

    if all_preds is not None:
        all_preds = nested_truncate(all_preds, len(eval_dataset))
        
    evalTime = timeit.default_timer() - start_time
    print(f"Evaluation done in total {evalTime:.3f} secs ({evalTime / len(eval_dataset):.3f} sec per example)")
    # Inference time from TRT
    print("Average Inference Time = {:.3f} ms".format(total_time * 1000 / niter))
    print("Total Inference Time =  {:.3f} ms".format(total_time * 1000))
    print(f"Total Number of Inference =  {niter}")

  context.set_binding_shape(i, (norm_batch_size, max_length))


***** Running Evaluation *****
  Num examples = 10784
  Batch size = 16


674it [00:21, 30.92it/s]

Evaluation done in total 21.803 secs (0.002 sec per example)
Average Inference Time = 22.903 ms
Total Inference Time =  15436.783 ms
Total Number of Inference =  674





In [24]:
common.free_buffers(inputs, outputs, stream)

## 验证

In [25]:
metric = evaluate.load("squad_v2" if squad_v2 else "squad")

In [26]:
logger = logging.getLogger(__name__)

def postprocess_qa_predictions(
    examples,
    features,
    predictions: Tuple[np.ndarray, np.ndarray],
    version_2_with_negative: bool = False,
    n_best_size: int = 20,
    max_answer_length: int = 30,
    null_score_diff_threshold: float = 0.0,
    output_dir: Optional[str] = None,
    prefix: Optional[str] = None,
    log_level: Optional[int] = logging.WARNING,
):
    """
    Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
    original contexts. This is the base postprocessing functions for models that only return start and end logits.

    Args:
        examples: The non-preprocessed dataset (see the main script for more information).
        features: The processed dataset (see the main script for more information).
        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
            first dimension must match the number of elements of :obj:`features`.
        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the underlying dataset contains examples with no answers.
        n_best_size (:obj:`int`, `optional`, defaults to 20):
            The total number of n-best predictions to generate when looking for an answer.
        max_answer_length (:obj:`int`, `optional`, defaults to 30):
            The maximum length of an answer that can be generated. This is needed because the start and end predictions
            are not conditioned on one another.
        null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
            The threshold used to select the null answer: if the best answer has a score that is less than the score of
            the null answer minus this threshold, the null answer is selected for this example (note that the score of
            the null answer for an example giving several features is the minimum of the scores for the null answer on
            each feature: all features must be aligned on the fact they `want` to predict a null answer).

            Only useful when :obj:`version_2_with_negative` is :obj:`True`.
        output_dir (:obj:`str`, `optional`):
            If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
            :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
            answers, are saved in `output_dir`.
        prefix (:obj:`str`, `optional`):
            If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
            ``logging`` log level (e.g., ``logging.WARNING``)
    """
    if len(predictions) != 2:
        raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).")
    all_start_logits, all_end_logits = predictions

    if len(predictions[0]) != len(features):
        raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")

    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    if version_2_with_negative:
        scores_diff_json = collections.OrderedDict()

    # Logging.
    logger.setLevel(log_level)
    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_prediction = None
        prelim_predictions = []

        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]
            # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
            # available in the current feature.
            token_is_max_context = features[feature_index].get("token_is_max_context", None)

            # Update minimum null prediction.
            feature_null_score = start_logits[0] + end_logits[0]
            if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
                min_null_prediction = {
                    "offsets": (0, 0),
                    "score": feature_null_score,
                    "start_logit": start_logits[0],
                    "end_logit": end_logits[0],
                }

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or len(offset_mapping[start_index]) < 2
                        or offset_mapping[end_index] is None
                        or len(offset_mapping[end_index]) < 2
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    # Don't consider answer that don't have the maximum context available (if such information is
                    # provided).
                    if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
                        continue

                    prelim_predictions.append(
                        {
                            "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
                            "score": start_logits[start_index] + end_logits[end_index],
                            "start_logit": start_logits[start_index],
                            "end_logit": end_logits[end_index],
                        }
                    )
        if version_2_with_negative:
            # Add the minimum null prediction
            prelim_predictions.append(min_null_prediction)
            null_score = min_null_prediction["score"]

        # Only keep the best `n_best_size` predictions.
        predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]

        # Add back the minimum null prediction if it was removed because of its low score.
        if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions):
            predictions.append(min_null_prediction)

        # Use the offsets to gather the answer text in the original context.
        context = example["context"]
        for pred in predictions:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0] : offsets[1]]

        # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
        # failure.
        if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
            predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})

        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
        # the LogSumExp trick).
        scores = np.array([pred.pop("score") for pred in predictions])
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()

        # Include the probabilities in our predictions.
        for prob, pred in zip(probs, predictions):
            pred["probability"] = prob

        # Pick the best prediction. If the null answer is not possible, this is easy.
        if not version_2_with_negative:
            all_predictions[example["id"]] = predictions[0]["text"]
        else:
            # Otherwise we first need to find the best non-empty prediction.
            i = 0
            while predictions[i]["text"] == "":
                i += 1
            best_non_null_pred = predictions[i]

            # Then we compare to the null prediction using the threshold.
            score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
            scores_diff_json[example["id"]] = float(score_diff)  # To be JSON-serializable.
            if score_diff > null_score_diff_threshold:
                all_predictions[example["id"]] = ""
            else:
                all_predictions[example["id"]] = best_non_null_pred["text"]

        # Make `predictions` JSON-serializable by casting np.float back to float.
        all_nbest_json[example["id"]] = [
            {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
            for pred in predictions
        ]

    # If we have an output_dir, let's save all those dicts.
    if output_dir is not None:
        if not os.path.isdir(output_dir):
            raise EnvironmentError(f"{output_dir} is not a directory.")

        prediction_file = os.path.join(
            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
        )
        nbest_file = os.path.join(
            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
        )
        if version_2_with_negative:
            null_odds_file = os.path.join(
                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
            )

        logger.info(f"Saving predictions to {prediction_file}.")
        with open(prediction_file, "w") as writer:
            writer.write(json.dumps(all_predictions, indent=4) + "\n")
        logger.info(f"Saving nbest_preds to {nbest_file}.")
        with open(nbest_file, "w") as writer:
            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
        if version_2_with_negative:
            logger.info(f"Saving null_odds to {null_odds_file}.")
            with open(null_odds_file, "w") as writer:
                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions

In [27]:
# Post-processing:
def post_processing_function(examples, features, predictions, stage="eval"):
    # Post-processing: we match the start logits and end logits to answers in the original context.
    predictions = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=predictions,
        version_2_with_negative=squad_v2,
        n_best_size=20,
        max_answer_length=30,
        null_score_diff_threshold=0.0,
        output_dir="./results",
        prefix=stage,
    )
    # Format the result to the format the metric expects.
    if squad_v2:
        formatted_predictions = [
            {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
        ]
    else:
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)

In [29]:
! mkdir -p ./results
prediction = post_processing_function(eval_examples, eval_dataset, all_preds)

100%|██████████████████████████████████████████████████████████████████████| 10570/10570 [00:26<00:00, 400.99it/s]


In [30]:
eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
print(f"Evaluation metrics: {eval_metric}")

Evaluation metrics: {'exact_match': 79.0728476821192, 'f1': 86.86331096544208}
