[Reference](https://medium.com/@claudia.yao2012/performance-and-accuracy-comparison-of-pytorch-models-using-torch-tensorrt-acceleration-f2d077bc85eb)

In [1]:
!pip install -U "nvidia_modelopt[hf]"
!pip install torch-tensorrt

Collecting nvidia_modelopt[hf]
  Downloading nvidia_modelopt-0.35.1-py3-none-any.whl.metadata (9.0 kB)
Collecting ninja (from nvidia_modelopt[hf])
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Collecting pulp (from nvidia_modelopt[hf])
  Downloading pulp-3.3.0-py3-none-any.whl.metadata (8.4 kB)
Collecting torchprofile>=0.0.4 (from nvidia_modelopt[hf])
  Downloading torchprofile-0.0.4-py3-none-any.whl.metadata (303 bytes)
Collecting transformers<4.56,>=4.48 (from nvidia_modelopt[hf])
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deepspeed>=0.9.6 (from nvidia_modelopt[hf])
  Downloading deepspeed-0.17.6.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... 

In [2]:
import time
import torch
from transformers import AutoTokenizer, AutoModel
import torch_tensorrt

# 1. Load model & tokenizer
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).eval().cuda()

# 2. Example batch of sentences
text = """ A good story encourages us to turn the next page and read more. We want to find out what happens next and what the main characters do and what they say to each other.
We may feel excited, sad, afraid, angry or really happy. This is because the experience of reading or listening to a story is much more likely to make us 'feel' that we are part
of the story, too. Just like in our 'real' lives, we might love or hate different characters in the story. Perhaps we recognise ourselves or others in some of them. Perhaps we
have similar problems. Because of this natural empathy with the characters, our brains process the reading of stories differently from the way we read factual information.
Our brains don't always recognise the difference between an imagined situation and a real one so the characters become 'alive' to us. What they say and do is therefore more meaningful.
This is why the words and structures that relate a story's events, descriptions and conversations are processed in this deeper way. In fact, cultures all around the world have always
used storytelling to pass knowledge from one generation to another. Our ancestors understood very well that this was the best way to make sure our histories and information about
how to relate to others and to our world was not only understood, but remembered too. (Notice that the word ‘history’ contains the word ‘story’ – More accurately, the word ‘story’
derives from ‘history’.) Encouraging your child to read or listen to stories should therefore help them to learn a second language in a way that is not only fun, but memorable.
Let's take a quick look at learning vocabulary within a factual text or within a story. Imagine the readers are eight-year-olds interested in animals. In your opinion, are they more
likely to remember AND want to continue reading the first or second text? """

texts = [item.strip() for item in text.split(".")][:16]  # adjust batch size here
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=32).to("cuda")

input_ids = inputs["input_ids"].to(torch.int32)
attention_mask = inputs["attention_mask"].to(torch.int32)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# 3. Baseline PyTorch inference
with torch.no_grad():
    start = time.time()
    baseline_outputs = model(input_ids, attention_mask)
    torch.cuda.synchronize()
    end = time.time()
    baseline_time = end - start
    print(f"PyTorch latency: {baseline_time:.4f} sec")

print("Output shape (PyTorch):", baseline_outputs.last_hidden_state.shape)

PyTorch latency: 0.6908 sec
Output shape (PyTorch): torch.Size([16, 32, 768])


In [4]:
# Convert model using Torch-TensorRT with enabled_precision of torch.float16
trt_model_float16 = torch_tensorrt.compile(
    model,
    inputs=[
        torch_tensorrt.Input(min_shape=[1, 32], opt_shape=[8, 32], max_shape=[16, 32], dtype=torch.int32),  # input_ids
        torch_tensorrt.Input(min_shape=[1, 32], opt_shape=[8, 32], max_shape=[16, 32], dtype=torch.int32),  # attention_mask
    ],
    enabled_precisions={torch.float16},
)
trt_inputs = {
    "input_ids": input_ids,
    "attention_mask": attention_mask
}

print("Convert to TensorRT float16.")


# Convert model using Torch-TensorRT with enabled_precision of torch.float32
trt_model_float32 = torch_tensorrt.compile(
    model,
    inputs=[
        torch_tensorrt.Input(min_shape=[1, 32], opt_shape=[8, 32], max_shape=[16, 32], dtype=torch.int32),  # input_ids
        torch_tensorrt.Input(min_shape=[1, 32], opt_shape=[8, 32], max_shape=[16, 32], dtype=torch.int32),  # attention_mask
    ],
    enabled_precisions={torch.float32},
)
trt_inputs_float32 = {
    "input_ids": input_ids,
    "attention_mask": attention_mask
}
print("Convert to TensorRT float32.")

W1001 14:47:29.384000 577 torch/fx/experimental/symbolic_shapes.py:6823] _maybe_guard_rel() was called on non-relation expression Eq(s43, 1) | Eq(s72, s43)


Convert to TensorRT float16.


W1001 14:49:11.875000 577 torch/fx/experimental/symbolic_shapes.py:6823] _maybe_guard_rel() was called on non-relation expression Eq(s43, 1) | Eq(s72, s43)


Convert to TensorRT float32.


In [5]:
# run model trt_model_float16
start = time.time()
trt_outputs_float16 = trt_model_float16(**trt_inputs)
torch.cuda.synchronize()
end = time.time()
trt_time_float16 = end - start
print(f"Torch-TensorRT latency: {trt_time_float16:.4f} sec")
print("Output shape (TensorRT):", trt_outputs_float16.last_hidden_state.shape)


# run model trt_model_float32
start = time.time()
trt_outputs_float32 = trt_model_float32(**trt_inputs_float32)
torch.cuda.synchronize()
end = time.time()
trt_time_float32 = end - start
print(f"Torch-TensorRT latency: {trt_time_float32:.4f} sec")

print("Output shape (TensorRT):", trt_outputs_float32.last_hidden_state.shape)

Torch-TensorRT latency: 0.0382 sec
Output shape (TensorRT): torch.Size([16, 32, 768])
Torch-TensorRT latency: 0.0446 sec
Output shape (TensorRT): torch.Size([16, 32, 768])


In [6]:
import torch
threshold = 0.01

diff_tensor = torch.abs(baseline_outputs.last_hidden_state - trt_outputs_float16.last_hidden_state)
max_diff_float16 = diff_tensor.max().item()
min_diff_float16 = diff_tensor.min().item()
percent_over_threshold_float16 = (diff_tensor > threshold).float().mean().item() * 100
print(f"Max absolute difference: {max_diff_float16}")
print(f"Min absolute difference: {min_diff_float16}")
print(f"Percentage of elements > {threshold}: {percent_over_threshold_float16:.3f}%")


diff_tensor = torch.abs(baseline_outputs.last_hidden_state - trt_outputs_float32.last_hidden_state)
max_diff_float32 = diff_tensor.max().item()
min_diff_float32 = diff_tensor.min().item()
percent_over_threshold_float32 = (diff_tensor > threshold).float().mean().item() * 100
print(f"Max absolute difference: {max_diff_float32}")
print(f"Min absolute difference: {min_diff_float32}")
print(f"Percentage of elements > {threshold}: {percent_over_threshold_float32:.3f}%")

Max absolute difference: 0.05847358703613281
Min absolute difference: 0.0
Percentage of elements > 0.01: 0.119%
Max absolute difference: 1.33514404296875e-05
Min absolute difference: 0.0
Percentage of elements > 0.01: 0.000%
