In [3]:
import os
import sys

ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

# huggingface
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    GPT2Config,
)

import tensorrt as trt
from tensorrt.tensorrt import ICudaEngine, Logger, Runtime

from transformer_deploy.backends.trt_utils import build_engine, load_engine, save_engine
from onnxruntime.transformers.gpt2_helper import Gpt2Helper, MyGPT2LMHeadModel
from transformers import AutoConfig
from transformers import PretrainedConfig
from transformer_deploy.backends.ort_utils import create_model_for_provider
from typing import Dict, Callable
import numpy as np
import time
from transformer_deploy.backends.pytorch_utils import get_model_size
from transformer_deploy.backends.ort_utils import optimize_onnx
from transformer_deploy.backends.pytorch_utils import convert_to_onnx
from transformers import BatchEncoding
import torch
from torch.nn import Module
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
from transformers.generation_utils import GenerationMixin
from transformers import AutoTokenizer

<a id="1"></a>

## 1. Download HuggingFace GPT-2 model 

First, we download the original HuggingFace PyTorch GPT-2 model from the HuggingFace model hub, together with its associated tokenizer.

The GPT-2 variants supported by TensorRT 8 are: gpt2 (117M), gpt2-large (774M).

In [4]:
# download model and tokenizer
GPT2_VARIANT = "gpt2"  # choices: gpt2 | gpt2-large

model: GPT2LMHeadModel = GPT2LMHeadModel.from_pretrained(GPT2_VARIANT)

# GPT2Config's first positional parameter is `vocab_size`, so `GPT2Config(GPT2_VARIANT)`
# would create a default configuration whose vocabulary size is the string "gpt2".
# Load the configuration that is published with the checkpoint instead.
config = GPT2Config.from_pretrained(GPT2_VARIANT)
tokenizer = AutoTokenizer.from_pretrained(GPT2_VARIANT)

In [5]:
# save model locally
pytorch_model_dir = "./models/{}/pytorch".format(GPT2_VARIANT)
# Create the target directory from Python rather than shelling out to `mkdir -p`,
# so the cell does not depend on a POSIX shell being available.
os.makedirs(pytorch_model_dir, exist_ok=True)

model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

Pytorch Model saved to ./models/gpt2/pytorch


### Inference with PyTorch model

#### Single example inference

In [6]:
# Run one forward pass of the PyTorch model on a single tokenized prompt.
inputs = tokenizer("Hello, my dog is ", return_tensors="pt")
print(inputs, "----", sep="\n")

model.eval()
with torch.no_grad():
    # the input ids are also passed as labels, as in the original cell
    outputs = model(labels=inputs["input_ids"], **inputs)

logits = outputs.logits
print(logits)

{'input_ids': tensor([[15496,    11,   616,  3290,   318,   220]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
----
tensor([[[ -35.2362,  -35.3266,  -38.9753,  ...,  -44.4645,  -43.9974,
           -36.4580],
         [-112.6171, -114.5831, -116.5724,  ..., -119.0128, -118.8059,
          -111.6917],
         [ -88.7435,  -89.8643,  -93.1977,  ...,  -92.3839,  -96.1782,
           -92.1273],
         [ -85.1646,  -88.3379,  -92.8703,  ...,  -99.8017,  -94.7657,
           -90.9330],
         [-116.7280, -119.3950, -121.7259,  ..., -129.1003, -124.6102,
          -121.6092],
         [ -61.9847,  -63.7082,  -65.6898,  ...,  -76.0924,  -71.7898,
           -66.1154]]])


For benchmarking purposes, we will employ a helper function `gpt2_inference` which executes the inference on a single batch repeatedly and measures end to end execution time. Let's take note of this execution time for later comparison with TensorRT. 
 
`TimingProfile` is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here).

In [7]:
from HuggingFace.GPT2.measurements import gpt2_inference
from HuggingFace.NNDF.networks import TimingProfile

# Benchmark the PyTorch model on a single batch; this is the baseline kept for the
# later comparison with TensorRT. Both the model and the input ids are moved to
# the first GPU ("cuda:0") before being handed to the helper.
output, decoder_e2e_median_time = gpt2_inference(
    model.to("cuda:0"), inputs.input_ids.to("cuda:0"), TimingProfile(iterations=10, number=1, warmup=1)
)
# last expression of the cell, so the measured time is displayed as the cell output
decoder_e2e_median_time

0.010031929000433593

#### Open-end text generation
Next, we will employ the PyTorch model for the open-end text generation task, which GPT-2 is particularly good at. 

In [8]:
from HuggingFace.GPT2.GPT2ModelConfig import GPT2ModelTRTConfig

# Beam-sample three continuations of the prompt on the GPU.
# The maximum length is looked up for the variant that was actually loaded
# (GPT2_VARIANT) instead of a hard-coded "gpt2" key, so switching the variant
# at the top of the notebook keeps this cell consistent.
sample_output = model.to("cuda:0").generate(
    inputs.input_ids.to("cuda:0"),
    max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT],
    num_beams=5,
    num_return_sequences=3,
    do_sample=True,
)

# de-tokenize model output to raw text
for s in sample_output:
    print(tokenizer.decode(s, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  next_indices = next_tokens // vocab_size


Hello, my dog is icky and I don't know how to deal with it. I know it's a pain in the ass but I can't take it anymore. I'm going to have to go to the vet to see if I can get rid of it. I'm going to have to take it to
Hello, my dog is icky. I'm not going to tell you what to do. I'm not going to tell you what to do. I'm not going to tell you what to do. I'm not going to tell you what to do. I'm not going to tell you what to do. I
Hello, my dog is icky.

I don't know about you, but I'm not sure what to do.

I don't know what to do.

I don't know what to do.

I don't know what to do.

I don't know what to


In [9]:
# Display the concrete class of `model` as the cell output.
type(model)

transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

For benchmarking purposes, we will employ a helper function `full_inference_greedy` which executes the inference repeatedly and measures end to end execution time. Let's take note of this execution time for later comparison with TensorRT. 
 
TimingProfile is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here).

In [10]:
from HuggingFace.GPT2.measurements import full_inference_greedy

# Time the complete greedy decoding loop of the PyTorch model and keep both the
# generated ids and the measured runtime.
greedy_timing_profile = TimingProfile(iterations=10, number=1, warmup=1)
sample_output, full_e2e_median_runtime = full_inference_greedy(
    model.to("cuda:0"), inputs.input_ids, greedy_timing_profile, max_length=64
)
# shown as the cell output
full_e2e_median_runtime

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

0.5501147425002273

<a id="2"></a>

## 2. Convert to ONNX format

Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format: ONNX.

ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.

At a high level, the steps to convert a PyTorch model to TensorRT are as follows:
- Convert the pretrained GPT-2 PyTorch model into ONNX.
- Import the ONNX model into TensorRT.
- Apply optimizations and generate an engine.
- Perform inference on the GPU with the TensorRT engine.

In [11]:
# Rebind `tokenizer` to an explicit GPT2Tokenizer for the same checkpoint
# (the name was previously bound to the result of AutoTokenizer.from_pretrained).
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)

In [12]:
# Tokenize the sample sentence; note that the result is a BatchEncoding
# (a mapping of tensors), not a bare tensor of ids.
input_ids: BatchEncoding = tokenizer(
    "Here is some text to encode Hello World",
    add_special_tokens=True,
    return_tensors="pt",
)
print(type(input_ids))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [13]:
# Move the model back to the CPU, switch to evaluation mode and print the
# raw output of one forward pass on the encoded sample.
model.to("cpu").eval()
with torch.no_grad():
    cpu_forward_output = model(**input_ids)
print(cpu_forward_output)

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ -34.3027,  -33.9891,  -37.5683,  ...,  -42.6734,  -42.0399,
           -34.6136],
         [ -83.3065,  -82.9769,  -86.1204,  ...,  -89.8062,  -89.4546,
           -83.6084],
         [ -91.4901,  -92.5655,  -95.6423,  ...,  -96.6183,  -98.1545,
           -91.5266],
         ...,
         [ -92.8820,  -94.8433,  -98.9224,  ..., -101.4426, -103.2702,
           -95.7642],
         [ -72.6140,  -76.3407,  -79.7973,  ...,  -87.3300,  -85.7930,
           -77.7521],
         [-103.6147, -108.7898, -109.6276,  ..., -116.8557, -116.5565,
          -107.4467]]]), past_key_values=((tensor([[[[-1.2580,  1.5852,  1.0896,  ..., -1.5187, -0.0358,  1.1204],
          [-1.8348,  2.4955,  1.7497,  ..., -1.5397, -2.3685,  2.4482],
          [-2.3188,  2.1258,  1.6742,  ..., -0.6896, -1.4082,  1.8576],
          ...,
          [-1.7020,  2.4332,  1.0700,  ..., -1.6933, -0.7572,  0.9417],
          [-2.1612,  1.8802,  0.7015,  ..., -0.2824,

In [14]:
class GPTExportOnnx(Module):
    """Thin module around a GPT-2 LM head model that only maps input ids to logits.

    It reuses the `transformer` and `lm_head` sub-modules of the wrapped model and
    returns an output object in which only `logits` is populated.
    """

    def __init__(self, model: GPT2LMHeadModel):
        """Keep references to the wrapped model's sub-modules, config and device."""
        super().__init__()
        self.config = model.config
        # snapshot of the device the wrapped model is on at construction time
        self.device = model.device
        self.transformer = model.transformer
        self.lm_head = model.lm_head

    def forward(self, input_ids, **_):
        """Compute logits for `input_ids`; any other keyword argument is ignored."""
        # first element of the transformer output is fed to the LM head
        hidden_states = self.transformer(input_ids=input_ids)[0]
        return CausalLMOutputWithCrossAttentions(logits=self.lm_head(hidden_states))

In [15]:
# Sanity check: run the export wrapper on the CPU copy of the first prompt and
# print what it returns.
gpt2_model = GPTExportOnnx(model=model)
model.eval()
with torch.no_grad():
    wrapper_output = gpt2_model(inputs.input_ids.to("cpu"))
    print(wrapper_output)

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ -35.2362,  -35.3266,  -38.9753,  ...,  -44.4645,  -43.9974,
           -36.4580],
         [-112.6171, -114.5831, -116.5724,  ..., -119.0128, -118.8059,
          -111.6917],
         [ -88.7435,  -89.8643,  -93.1977,  ...,  -92.3839,  -96.1782,
           -92.1273],
         [ -85.1646,  -88.3379,  -92.8703,  ...,  -99.8017,  -94.7657,
           -90.9330],
         [-116.7280, -119.3950, -121.7259,  ..., -129.1003, -124.6102,
          -121.6092],
         [ -61.9847,  -63.7082,  -65.6898,  ...,  -76.0924,  -71.7898,
           -66.1154]]]), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)


In [16]:
# Encode the sample sentence without an attention mask: the exported graph
# only takes `input_ids`.
input_ids: BatchEncoding = tokenizer(
    "Here is some text to encode Hello World",
    add_special_tokens=True,
    return_attention_mask=False,
    return_tensors="pt",
)
# Downcast every int64 tensor of the encoding to int32 before the export.
for tensor_name in list(input_ids.keys()):
    if input_ids[tensor_name].dtype == torch.int64:
        input_ids[tensor_name] = input_ids[tensor_name].type(torch.int32)


gpt2_model = GPTExportOnnx(model=model)
convert_to_onnx(
    model_pytorch=gpt2_model,
    output_path="test-gpt2.onnx",
    inputs_pytorch=dict(input_ids),
    quantization=False,
    var_output_seq=True,
)

# put the model back in evaluation mode; the assignment hides the cell output
_ = model.eval()

  attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)


In [17]:
import logging

# Make INFO-level records visible so the optimizer's log lines show up in the cell output.
logging.basicConfig()
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# Attention head count and hidden size are looked up for the selected variant
# and forwarded to the optimizer.
num_attention_heads, hidden_size = get_model_size(path=GPT2_VARIANT)
optimize_onnx(
    onnx_path="test-gpt2.onnx",
    onnx_optim_model_path="test-gpt2-opt.onnx",
    architecture="gpt2",
    num_attention_heads=num_attention_heads,
    hidden_size=hidden_size,
    fp16=True,
    use_cuda=True,
)

INFO:fusion_base:Fused LayerNormalization count: 25
INFO:fusion_base:Fused FastGelu count: 12
INFO:fusion_utils:Remove reshape node Reshape_9 since its input shape is same as output: ['batch_size', 'sequence']
INFO:fusion_utils:Remove reshape node Reshape_19 since its input shape is same as output: [1, 'sequence']
INFO:fusion_utils:Remove reshape node Reshape_2700 since its input shape is same as output: ['batch_size', 'sequence', 768]
INFO:onnx_model:Graph pruned: 0 inputs, 0 outputs and 23 nodes are removed
INFO:onnx_model:Graph pruned: 0 inputs, 0 outputs and 864 nodes are removed
INFO:onnx_model_gpt2:postprocess: remove Reshape count:72
INFO:fusion_base:Fused FastGelu(add bias) count: 12
INFO:onnx_model_bert:opset verion: 13
INFO:onnx_model_bert:Optimized operators:{'EmbedLayerNormalization': 0, 'Attention': 0, 'Gelu': 0, 'FastGelu': 12, 'BiasGelu': 0, 'LayerNormalization': 25, 'SkipLayerNormalization': 0}
INFO:root:optimizations applied: {'EmbedLayerNormalization': 0, 'Attention':

In [18]:
trt_logger: Logger = trt.Logger(trt.Logger.INFO)
runtime: Runtime = trt.Runtime(trt_logger)
# The serialized plan is loaded unconditionally further down, so it has to exist
# after a fresh "Restart & Run All". Build it only when it is missing, which keeps
# re-runs cheap; delete "test-gpt2.plan" to force a rebuild after changing the
# ONNX file or the shapes below.
if not os.path.exists("test-gpt2.plan"):
    engine: ICudaEngine = build_engine(
        runtime=runtime,
        onnx_file_path="test-gpt2.onnx",
        logger=trt_logger,
        min_shape=(2, 1),
        optimal_shape=(2, 128),  # num beam -> batch size
        max_shape=(2, 384),  # num beam -> batch size
        workspace_size=12000 * 1024 * 1024,
        fp16=True,
        int8=False,
    )
    save_engine(engine, "test-gpt2.plan")

[01/23/2022-00:29:53] [TRT] [I] [MemUsageChange] Init CUDA: CPU +451, GPU +0, now: CPU 6060, GPU 10080 (MiB)


In [21]:
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from onnxruntime import InferenceSession, IOBinding, OrtValue, SessionOptions


class GPTModelWrapper(Module, GenerationMixin):
    """Adapter that lets `GenerationMixin.generate` drive an arbitrary inference callable.

    The callable receives the `input_ids` tensor and must return the logits tensor.
    The duration of every call is recorded in `infer_time` (seconds) so the time
    spent in the backend can be separated from the rest of the generation loop.

    Only `input_ids` is forwarded to the callable: no attention mask and no past
    key/values are passed.
    """

    def __init__(
        self, config: PretrainedConfig, device: torch.device, inference: Callable[[torch.Tensor], torch.Tensor]
    ):
        """
        :param config: model configuration exposed to the generation code
        :param device: device exposed as `self.device` and used for `self.to(...)`
        :param inference: callable mapping an `input_ids` tensor to a logits tensor
        """
        super().__init__()
        self.config: PretrainedConfig = config
        self.device: torch.device = device
        self.inference: Callable[[torch.Tensor], torch.Tensor] = inference
        # one entry per forward call: duration of `inference` in seconds
        self.infer_time = list()
        self.to(device=device)

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """Return the forward arguments for one generation step (only `input_ids`)."""
        return {
            "input_ids": input_ids,
        }

    def forward(self, input_ids, **_):
        """Run the inference callable, record its duration and wrap the logits."""
        # perf_counter is monotonic and high resolution, unlike the wall clock
        # returned by time.time(), which can jump when the system clock is adjusted
        start = time.perf_counter()
        logits = self.inference(input_ids)
        self.infer_time.append(time.perf_counter() - start)
        return CausalLMOutputWithCrossAttentions(logits=logits)

    def timing(self) -> float:
        """Return the total time spent in the inference callable, in seconds."""
        # cast so that a builtin float is returned, as the annotation states
        return float(np.sum(self.infer_time))


# Prompt used by every benchmark below; the attention mask is not requested,
# so the encoding only carries `input_ids`.
inputs = tokenizer(
    "Here is some text to encode Hello World",
    add_special_tokens=True,
    return_attention_mask=False,
    return_tensors="pt",
)

# ONNX Runtime model created from the optimized ONNX file, using the CUDA execution provider.
model_onnx = create_model_for_provider(path="test-gpt2-opt.onnx", provider_to_use="CUDAExecutionProvider")
# to avoid error message or passing some args to each generate call
model.config.pad_token_id = tokenizer.eos_token_id
model.config.use_cache = True


def list_np_to_tensor(data: np.ndarray) -> torch.Tensor:
    """Convert numpy data (in this file: the list of arrays returned by an ONNX run) to a tensor.

    The data is first stacked into a single array; a leading dimension of size 1
    is then dropped, any other leading size is left untouched.
    """
    stacked = torch.from_numpy(np.array(data))
    return stacked.squeeze(dim=0)


def inference_onnx(input_ids: torch.Tensor) -> torch.Tensor:
    """Run the ONNX Runtime model on `input_ids`, going through numpy arrays on the host.

    The ids are detached, copied to the CPU and cast to int32 before the run;
    the outputs are converted back to a tensor with `list_np_to_tensor`.
    """
    ids_as_numpy = input_ids.detach().cpu().numpy().astype(np.int32)
    onnx_outputs = model_onnx.run(None, {"input_ids": ids_as_numpy})
    return list_np_to_tensor(data=onnx_outputs)


def inference_torch(input_ids: torch.Tensor) -> torch.Tensor:
    """Compute logits with the PyTorch model: transformer body followed by the LM head."""
    body_output: BaseModelOutputWithPastAndCrossAttentions = model.transformer(input_ids=input_ids)
    last_hidden_state = body_output.last_hidden_state
    return model.lm_head(last_hidden_state)


def inference_onnx_binding(input_ids: torch.Tensor) -> torch.Tensor:
    """Run the ONNX Runtime model with IO binding so inputs and outputs stay on the GPU.

    The input tensor and a pre-allocated output tensor are bound through their
    raw device pointers, which avoids the host round trip of `inference_onnx`.

    :param input_ids: token ids of shape (batch_size, nb_tokens)
    :return: float32 logits tensor of shape (batch_size, nb_tokens, vocab_size) on CUDA
    """
    device_name = "cuda"
    # int32 mandatory as input of bindings, int64 not supported.
    # The binding only receives a raw pointer plus a shape, so the memory has to be
    # laid out contiguously; `.contiguous()` is a no-op when that is already the case.
    input_ids = input_ids.type(torch.int32).to(device_name).contiguous()
    binding: IOBinding = model_onnx.io_binding()
    binding.bind_input(
        name="input_ids",
        device_type=device_name,
        device_id=0,
        element_type=np.int32,
        shape=tuple(input_ids.shape),
        buffer_ptr=input_ids.data_ptr(),
    )

    batch_size, nb_tokens = tuple(input_ids.shape)
    # The width of the logits is defined by the model (config.vocab_size), not by
    # the tokenizer, whose `vocab_size` may differ from the model's embedding size.
    vocab_size = model.config.vocab_size
    output = torch.empty((batch_size, nb_tokens, vocab_size), dtype=torch.float32, device=device_name)
    binding.bind_output(
        name=model_onnx.get_outputs()[0].name,
        device_type=device_name,
        device_id=0,
        element_type=np.float32,
        shape=tuple(output.shape),
        buffer_ptr=output.data_ptr(),
    )

    model_onnx.run_with_iobinding(binding)
    return output


# Load the serialized TensorRT engine; "test-gpt2.plan" is expected to already exist on disk.
# NOTE(review): the annotation says the callable returns a tensor, but the caller
# below indexes its result with [0] -- confirm the actual return type of load_engine.
tensorrt_model: Callable[[Dict[str, torch.Tensor]], torch.Tensor] = load_engine(
    engine_file_path="test-gpt2.plan", runtime=runtime
)


def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:
    """Run the TensorRT engine on `input_ids` and return the first element of its result."""
    engine_outputs = tensorrt_model({"input_ids": input_ids})
    return engine_outputs[0]


def _benchmark_generation(label, device, inference, use_cache=False, warmup_runs=2, timed_runs=10):
    """Benchmark beam-search generation for one backend and print the results.

    The shared `inputs` encoding is moved to `device`, one short generation is run
    and decoded as a sanity check, `warmup_runs` more short generations are run,
    and then `timed_runs` long generations (max_length=256) are timed.

    Printed values are in minutes (elapsed seconds divided by 60):
    total time of the timed loop, time spent in the inference callable only,
    and the number of inference calls made during the timed loop.

    :param label: prefix of the printed total-time line
    :param device: device given to the wrapper and to `inputs.to(...)`
    :param inference: callable mapping `input_ids` to logits
    :param use_cache: value forwarded to `generate` during the timed runs
    :param warmup_runs: number of untimed short generations after the sanity check
    :param timed_runs: number of timed long generations
    :return: tuple (wrapper, ids generated by the sanity-check run)
    """
    wrapper = GPTModelWrapper(config=model.config, device=device, inference=inference)
    inputs.to(device)
    sample = wrapper.generate(inputs.input_ids, max_length=64, num_beams=2)
    print(tokenizer.decode(sample[0], skip_special_tokens=True))
    for _ in range(warmup_runs):
        wrapper.generate(inputs.input_ids, max_length=64, num_beams=2)
    # drop the durations recorded by the sanity-check and warm-up runs
    wrapper.infer_time.clear()
    start = time.perf_counter()
    for _ in range(timed_runs):
        wrapper.generate(inputs.input_ids, max_length=256, num_beams=2, use_cache=use_cache)
    print(f"{label}: {(time.perf_counter() - start)/60:.2f}")
    print(f"infer timing: {wrapper.timing()/60:.2f}")
    print(f"# inf: {len(wrapper.infer_time)}")
    return wrapper, sample


# ONNX Runtime, inputs and outputs exchanged through host memory
gpt2_model, sample_output = _benchmark_generation("onnx", torch.device("cpu"), inference_onnx)


# ONNX Runtime with IO binding, tensors stay on the GPU
gpt2_model, sample_output = _benchmark_generation("onnx binding", torch.device("cuda"), inference_onnx_binding)
# the ONNX session is not needed by the remaining benchmarks
del model_onnx


# PyTorch on GPU
model.cuda()
model.eval()
with torch.inference_mode():
    gpt2_model, sample_output = _benchmark_generation("torch", model.device, inference_torch, use_cache=True)
_ = model.cpu()


# TensorRT engine
gpt2_model, sample_output = _benchmark_generation("tensorrt", torch.device("cuda"), inference_tensorrt)

[01/23/2022-00:38:32] [TRT] [I] Loaded engine size: 843 MiB
[01/23/2022-00:38:32] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +1, GPU +8, now: CPU 10039, GPU 17208 (MiB)
[01/23/2022-00:38:32] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +420, now: CPU 0, GPU 1006 (MiB)
[01/23/2022-00:38:32] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 9195, GPU 17208 (MiB)
[01/23/2022-00:38:33] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +167, now: CPU 0, GPU 1173 (MiB)
Here is some text to encode Hello World:

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello
onnx: 1.30
infer timing: 1.24
# inf: 2480
Here is some text to encode Hello World.

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello World

Hello Wor

In [None]:
from transformers import pipeline, TextGenerationPipeline

# Reference timing with the stock text-generation pipeline.
# The checkpoint follows GPT2_VARIANT so this cell benchmarks the same model as the
# rest of the notebook, and the padding id is read from the pipeline's tokenizer
# instead of being hard-coded as 50256.
generator: TextGenerationPipeline = pipeline("text-generation", model=GPT2_VARIANT)
start = time.time()
for _ in range(10):
    with torch.inference_mode():
        print(
            generator(
                "Here is some text to encode Hello World",
                max_length=256,
                num_return_sequences=1,
                num_beams=2,
                pad_token_id=generator.tokenizer.eos_token_id,
            )
        )
# elapsed time of the ten generations, in minutes
print(f"torch: {(time.time() - start)/60:.2f}")