# Accelerating GPT-2 model

### *(and any decoder based transformer models)*

In [1]:
import logging
import time
from typing import Callable, Dict

import numpy as np
import tensorrt as trt
import torch
from tensorrt import ICudaEngine
from tensorrt.tensorrt import Logger, Runtime
from torch.nn import Module
from transformers import AutoTokenizer, BatchEncoding, GPT2LMHeadModel, PretrainedConfig
from transformers.generation_utils import GenerationMixin
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions

from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding, optimize_onnx
from transformer_deploy.backends.pytorch_utils import convert_to_onnx, get_model_size
from transformer_deploy.backends.trt_utils import build_engine, load_engine, save_engine

## Load model

As a reminder:

* gpt2 (117M)
* gpt2-large (774M)

In [2]:
model_name = "gpt2"  # choices: gpt2 | gpt2-large

# use GPT2LMHeadModel instead of AutoModel: it exposes the language modeling head (`lm_head`)
# whose logits are needed to predict the next token
model: GPT2LMHeadModel = GPT2LMHeadModel.from_pretrained(model_name)
# switch to evaluation mode
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)
# reuse the end-of-sequence token as padding token,
# to avoid error message or passing some args to each generate call
model.config.pad_token_id = tokenizer.eos_token_id

Output predictions for the next token. Those values will be used during the decoding part.

In [3]:
# carry out inference with a single sample
inputs = tokenizer("Hello, my dog is ", return_tensors="pt")
print(inputs)
print("----")

# plain forward pass, gradient tracking is disabled because we only run inference
with torch.no_grad():
    outputs = model(**inputs)

# scores over the vocabulary, one row per input token (see the shape printed in the next cell)
logits = outputs.logits
print(logits)

{'input_ids': tensor([[15496,    11,   616,  3290,   318,   220]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
----
tensor([[[ -35.2362,  -35.3266,  -38.9753,  ...,  -44.4645,  -43.9974,
           -36.4580],
         [-112.6171, -114.5831, -116.5724,  ..., -119.0128, -118.8059,
          -111.6917],
         [ -88.7435,  -89.8643,  -93.1977,  ...,  -92.3839,  -96.1782,
           -92.1273],
         [ -85.1646,  -88.3379,  -92.8703,  ...,  -99.8017,  -94.7657,
           -90.9330],
         [-116.7280, -119.3950, -121.7259,  ..., -129.1003, -124.6102,
          -121.6092],
         [ -61.9847,  -63.7082,  -65.6898,  ...,  -76.0924,  -71.7898,
           -66.1154]]])


Shape: [batch size, nb tokens, vocabulary size]

In [4]:
# logits layout: [batch size, nb tokens, vocabulary size]
outputs.logits.shape

torch.Size([1, 6, 50257])

## Build ONNX graph

Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format: ONNX.

ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.

At a high level, the steps to convert a PyTorch model to TensorRT are as follows:
- Convert the pretrained PyTorch model (here, GPT-2) into ONNX.
- Import the ONNX model into TensorRT.
- Apply optimizations and generate an engine.
- Perform inference on the GPU with the TensorRT engine.

In [5]:
# sample input handed to the ONNX export; only `input_ids` are requested (no attention mask)
input_ids: BatchEncoding = tokenizer(
    "Here is some text to encode Hello World", add_special_tokens=True, return_attention_mask=False, return_tensors="pt"
)
# some inference engines don't support int64 tensor as inputs, we convert all input tensors
for k, v in input_ids.items():  # type: str, torch.Tensor
    input_ids[k] = v.type(dtype=torch.int32)

# write the ONNX graph to "test-gpt2.onnx" (no quantization)
convert_to_onnx(
    model_pytorch=model,
    output_path="test-gpt2.onnx",
    inputs_pytorch=dict(input_ids),
    quantization=False,
    var_output_seq=True,  # we inform ONNX export tool that the output shape will vary with the input shape
)

# NOTE(review): presumably restores evaluation mode in case the export changed it -- confirm
_ = model.eval()

  attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)


### Optimize ONNX graph

In [6]:
# show INFO level logs so that the optimizations applied to the graph are printed
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
# number of attention heads and hidden size are required by the optimizer call below
num_attention_heads, hidden_size = get_model_size(path=model_name)
# read the exported graph and write the optimized one to a separate file ("test-gpt2-opt.onnx")
optimize_onnx(
    onnx_path="test-gpt2.onnx",
    onnx_optim_model_path="test-gpt2-opt.onnx",
    fp16=True,
    use_cuda=True,
    num_attention_heads=num_attention_heads,
    hidden_size=hidden_size,
    architecture="gpt2",
)

INFO:fusion_base:Fused LayerNormalization count: 25
INFO:fusion_base:Fused FastGelu count: 12
INFO:fusion_utils:Remove reshape node Reshape_9 since its input shape is same as output: ['batch_size', 'sequence']
INFO:fusion_utils:Remove reshape node Reshape_19 since its input shape is same as output: [1, 'sequence']
INFO:fusion_utils:Remove reshape node Reshape_2700 since its input shape is same as output: ['batch_size', 'sequence', 768]
INFO:onnx_model:Graph pruned: 0 inputs, 0 outputs and 23 nodes are removed
INFO:onnx_model:Graph pruned: 0 inputs, 0 outputs and 864 nodes are removed
INFO:onnx_model_gpt2:postprocess: remove Reshape count:72
INFO:fusion_base:Fused FastGelu(add bias) count: 12
INFO:onnx_model_bert:opset verion: 13
INFO:onnx_model_bert:Optimized operators:{'EmbedLayerNormalization': 0, 'Attention': 0, 'Gelu': 0, 'FastGelu': 12, 'BiasGelu': 0, 'LayerNormalization': 25, 'SkipLayerNormalization': 0}
INFO:root:optimizations applied: {'EmbedLayerNormalization': 0, 'Attention':

## Build TensorRT engine

In [10]:
from pathlib import Path

trt_logger: Logger = trt.Logger(trt.Logger.INFO)
runtime: Runtime = trt.Runtime(trt_logger)
# file in which the serialized TensorRT engine is stored
trt_model_name = "test-gpt2.plan"

# build the engine only if the plan file does not exist yet, because building is slow...
# NOTE(review): an existing plan file is reused as is, even if the ONNX model changed since it was built
if not Path(trt_model_name).exists():
    # NOTE(review): the engine is built from the raw export ("test-gpt2.onnx"), not from
    # "test-gpt2-opt.onnx" -- presumably because the fused operators are ONNX Runtime specific; confirm
    engine: ICudaEngine = build_engine(
        runtime=runtime,
        onnx_file_path="test-gpt2.onnx",
        logger=trt_logger,
        min_shape=(1, 1),
        optimal_shape=(1, 128),  # num beam -> batch size
        max_shape=(1, 384),  # num beam -> batch size
        workspace_size=12000 * 1024 * 1024,  # 12000 * 1024 * 1024 bytes, i.e. 12000 MiB if the unit is bytes -- confirm
        fp16=True,
        int8=False,
    )
    save_engine(engine, trt_model_name)

[01/27/2022-20:53:13] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.

[01/27/2022-20:53:13] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 2034, GPU 861 (MiB)
[01/27/2022-20:53:13] [TRT] [I] The logger passed into createInferBuilder differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.

[01/27/2022-20:53:13] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 2034, GPU 861 (MiB)
[01/27/2022-20:53:14] [TRT] [I] [MemUsageSnapshot] Begin constructing builder kernel library: CPU 2034 MiB, GPU 861 MiB
[01/27/2022-20:53:14] [TRT] [I] [MemUsageSnapshot] End constructing builder kernel library: CPU 2188 MiB, GPU 905 MiB




[01/27/2022-20:53:15] [TRT] [W] onnx2trt_utils.cpp:366: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[01/27/2022-20:53:15] [TRT] [W] onnx2trt_utils.cpp:392: One or more weights outside the range of INT32 was clamped
[01/27/2022-20:53:15] [TRT] [W] onnx2trt_utils.cpp:392: One or more weights outside the range of INT32 was clamped
[01/27/2022-20:53:15] [TRT] [W] ShapedWeights.cpp:173: Weights transformer.h.0.attn.c_attn.weight has been transposed with permutation of (1, 0)! If you plan on overwriting the weights with the Refitter API, the new weights must be pre-transposed.
[01/27/2022-20:53:15] [TRT] [W] onnx2trt_utils.cpp:392: One or more weights outside the range of INT32 was clamped
[01/27/2022-20:53:15] [TRT] [W] ShapedWeights.cpp:173: Weights transformer.h.0.attn.c_proj.weight has been transposed with permutation of (1, 0)! If you plan on overwriting the weights with the Refitter API, the ne

# Inference

## Wrappers

In [14]:
class GPTModelWrapper(Module, GenerationMixin):
    """Expose a logits-producing callable through the `generate()` API of `GenerationMixin`.

    The wrapper registers no weights itself: each forward pass is delegated to `inference`
    and the duration of every call is recorded in `infer_time`, so that inference backends
    can be compared with the very same decoding code.
    """

    def __init__(
        self, config: PretrainedConfig, device: torch.device, inference: Callable[[torch.Tensor], torch.Tensor]
    ):
        """
        :param config: model configuration, stored as `self.config`
        :param device: device the wrapper is moved to, stored as `self.device`
        :param inference: callable taking the `input_ids` tensor and returning the logits tensor
        """
        super().__init__()
        self.config: PretrainedConfig = config
        self.device: torch.device = device
        self.inference: Callable[[torch.Tensor], torch.Tensor] = inference
        # duration (in seconds) of each call to `inference`, in call order
        self.infer_time = []
        self.to(device=device)

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """Build the keyword arguments of `forward`: only `input_ids` is kept, other kwargs are dropped."""
        return {
            "input_ids": input_ids,
        }

    def forward(self, input_ids, **_):
        """Run `inference` on `input_ids`, record how long it took and wrap the logits.

        :param input_ids: token ids, passed unchanged to `inference`
        :return: model output carrying only the logits
        """
        # perf_counter is monotonic and high resolution, unlike the wall clock returned by time.time
        start = time.perf_counter()
        logits = self.inference(input_ids)
        # CUDA kernels are launched asynchronously: wait for them to finish before stopping the clock,
        # otherwise only the launch time would be measured
        if isinstance(logits, torch.Tensor) and logits.is_cuda:
            torch.cuda.synchronize(device=logits.device)
        self.infer_time.append(time.perf_counter() - start)
        return CausalLMOutputWithCrossAttentions(logits=logits)

    def timing(self) -> float:
        """Return the total time (in seconds) spent in `inference` since `infer_time` was last cleared."""
        return float(np.sum(self.infer_time))


# prompt reused by the benchmark cells below; the attention mask is not requested,
# only `inputs.input_ids` is passed to `generate()`
inputs = tokenizer(
    "Here is some text to encode Hello World",
    add_special_tokens=True,
    return_attention_mask=False,
    return_tensors="pt",
)

## PyTorch inference

In [29]:
def inference_torch(input_ids: torch.Tensor) -> torch.Tensor:
    """Compute the logits with the PyTorch model loaded in this notebook.

    :param input_ids: token ids fed to the transformer
    :return: output of the language modeling head applied to the last hidden state
    """
    hidden_states = model.transformer(input_ids=input_ids).last_hidden_state
    logits = model.lm_head(hidden_states)
    return logits


# PyTorch baseline: the model runs on GPU for the duration of this cell only
model.cuda()
model.eval()
inputs.to("cuda")
with torch.inference_mode():
    gpt2_model = GPTModelWrapper(config=model.config, device=model.device, inference=inference_torch)
    # sanity check: print one generated sample
    sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
    print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
    # warm-up runs, their timings are discarded by the clear() below
    for _ in range(2):
        _ = gpt2_model.generate(inputs.input_ids, max_length=64)
    gpt2_model.infer_time.clear()
    start = time.time()
    for _ in range(10):
        # use_cache=False like the other benchmarks: the wrapper neither forwards nor returns
        # past key/values, so no cache is ever used and the flag must not suggest otherwise
        _ = gpt2_model.generate(inputs.input_ids, max_length=256, use_cache=False)
    # durations are in seconds, they are divided by 60 to be reported in minutes
    print(f"torch: {(time.time() - start)/60:.2f}")
    print(f"infer timing: {gpt2_model.timing()/60:.2f}")
    print(f"# inf: {len(gpt2_model.infer_time)}")
# move the model back to CPU once the benchmark is done
_ = model.cpu()

Here is some text to encode Hello World.

Hello World

Hello World is a simple program that takes a string and returns a string.

The program is written in C.

The program is written in C. The program is written in C. The program is written in C. The program
torch: 0.36
infer timing: 0.34
# inf: 2480


## Naive ONNX Runtime inference

In [30]:
# load the optimized ONNX graph with the CUDA execution provider
model_onnx = create_model_for_provider(path="test-gpt2-opt.onnx", provider_to_use="CUDAExecutionProvider")


def inference_onnx_naive(input_ids: torch.Tensor) -> torch.Tensor:
    """Run the ONNX model through numpy arrays.

    The input tensor is copied to CPU and converted to an int32 numpy array; the outputs
    returned by the model are stacked in a numpy array and converted back to a tensor.

    :param input_ids: token ids
    :return: model outputs as a tensor, with the leading dimension squeezed when it is of size 1
    """
    np_input_ids = input_ids.detach().cpu().numpy().astype(np.int32)
    onnx_outputs = model_onnx.run(None, {"input_ids": np_input_ids})
    stacked_outputs = torch.from_numpy(np.array(onnx_outputs))
    return stacked_outputs.squeeze(dim=0)


# the wrapper and the inputs live on CPU: the naive inference function exchanges numpy arrays
gpt2_model = GPTModelWrapper(config=model.config, device=torch.device("cpu"), inference=inference_onnx_naive)
inputs.to("cpu")
# sanity check: print one generated sample
sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
# warm-up runs, their timings are discarded by the clear() below
for _ in range(2):
    _ = gpt2_model.generate(inputs.input_ids, max_length=64)
gpt2_model.infer_time.clear()
start = time.time()
for _ in range(10):
    _ = gpt2_model.generate(inputs.input_ids, max_length=256, use_cache=False)
# durations are in seconds, they are divided by 60 to be reported in minutes
print(f"onnx: {(time.time() - start)/60:.2f}")
print(f"infer timing: {gpt2_model.timing()/60:.2f}")
print(f"# inf: {len(gpt2_model.infer_time)}")

# drop the reference to the ONNX model (presumably to release its resources -- confirm)
del model_onnx

Here is some text to encode Hello World.

Hello World

Hello World is a simple program that takes a string and returns a string.

The program is written in C.

The program is written in C. The program is written in C. The program is written in C. The program
onnx: 0.66
infer timing: 0.64
# inf: 2480


## Optimized ONNX Runtime inference

In [31]:
# load the optimized ONNX graph again (the previous instance has been deleted), with the CUDA execution provider
model_onnx = create_model_for_provider(path="test-gpt2-opt.onnx", provider_to_use="CUDAExecutionProvider")


def inference_onnx_optimized(input_ids: torch.Tensor) -> torch.Tensor:
    """Run the ONNX model through `inference_onnx_binding`, passing the tensor as is.

    :param input_ids: token ids
    :return: the entry named "output" of the binding results
    """
    binding_outputs = inference_onnx_binding(model_onnx=model_onnx, inputs={"input_ids": input_ids}, device="cuda")
    return binding_outputs["output"]


# here the wrapper and the inputs live on GPU: tensors are passed as is to the inference function
gpt2_model = GPTModelWrapper(config=model.config, device=torch.device("cuda"), inference=inference_onnx_optimized)
inputs.to("cuda")
# sanity check: print one generated sample
sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
# warm-up runs, their timings are discarded by the clear() below
for _ in range(2):
    _ = gpt2_model.generate(inputs.input_ids, max_length=64)
gpt2_model.infer_time.clear()
start = time.time()
for _ in range(10):
    _ = gpt2_model.generate(inputs.input_ids, max_length=256, use_cache=False)
# durations are in seconds, they are divided by 60 to be reported in minutes
print(f"onnx binding: {(time.time() - start)/60:.2f}")
print(f"infer timing: {gpt2_model.timing()/60:.2f}")
print(f"# inf: {len(gpt2_model.infer_time)}")

# drop the reference to the ONNX model (presumably to release its resources -- confirm)
del model_onnx

Here is some text to encode Hello World.

Hello World

Hello World is a simple program that takes a string and returns a string.

The program is written in C.

The program is written in C. The program is written in C. The program is written in C. The program
onnx binding: 0.15
infer timing: 0.13
# inf: 2480


## TensorRT Inference

In [36]:
# load the engine from the plan file; the result is called with a dict of named input tensors
tensorrt_model: Callable[[Dict[str, torch.Tensor]], torch.Tensor] = load_engine(
    engine_file_path="test-gpt2.plan", runtime=runtime
)


def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:
    """Run the TensorRT engine on the token ids.

    :param input_ids: token ids
    :return: first output of the engine
    """
    engine_outputs = tensorrt_model({"input_ids": input_ids})
    return engine_outputs[0]


# the wrapper and the inputs live on GPU: tensors are passed as is to the TensorRT engine
gpt2_model = GPTModelWrapper(config=model.config, device=torch.device("cuda"), inference=inference_tensorrt)
inputs.to("cuda")
# sanity check: print one generated sample
sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
# warm-up runs, their timings are discarded by the clear() below
for _ in range(2):
    _ = gpt2_model.generate(inputs.input_ids, max_length=64)
gpt2_model.infer_time.clear()
start = time.time()
for _ in range(10):
    _ = gpt2_model.generate(inputs.input_ids, max_length=256, use_cache=False)
# durations are in seconds, they are divided by 60 to be reported in minutes
print(f"tensorrt: {(time.time() - start)/60:.2f}")
print(f"infer timing: {gpt2_model.timing()/60:.2f}")
print(f"# inf: {len(gpt2_model.infer_time)}")

# drop the reference to the TensorRT engine (presumably to release its resources -- confirm)
del tensorrt_model

[01/24/2022-14:13:52] [TRT] [I] Loaded engine size: 978 MiB
[01/24/2022-14:13:52] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 15811, GPU 10528 (MiB)
[01/24/2022-14:13:52] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +487, now: CPU 0, GPU 1560 (MiB)
[01/24/2022-14:13:52] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 14832, GPU 10528 (MiB)
[01/24/2022-14:13:53] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +87, now: CPU 0, GPU 1647 (MiB)
Here is some text to encode Hello World.

Hello World

Hello World is a simple program that takes a string and returns a string.

The program is written in C.

The program is written in C. The program is written in C. The program is written in C. The program
tensorrt: 0.09
infer timing: 0.08
# inf: 2480
