In [None]:
!python -m pip install --upgrade pip
!pip install torch torchvision torchaudio
!pip install -U "nvidia_modelopt[hf]"
!pip install onnx
!pip install onnxruntime-gpu

In [None]:
!pip install transformers

In [None]:
import time
import torch
from transformers import AutoTokenizer, AutoModel

# 1. Load the tokenizer and the model weights for the chosen checkpoint.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Put the model in evaluation mode and move it to the GPU.
model = AutoModel.from_pretrained(model_name)
model = model.eval().cuda()

# 2. Example passage; it is split on "." further down to build the batch of
#    sentences that is fed to the model.
text = """ A good story encourages us to turn the next page and read more. We want to find out what happens next and what the main characters do and what they say to each other.
We may feel excited, sad, afraid, angry or really happy. This is because the experience of reading or listening to a story is much more likely to make us 'feel' that we are part
of the story, too. Just like in our 'real' lives, we might love or hate different characters in the story. Perhaps we recognise ourselves or others in some of them. Perhaps we
have similar problems. Because of this natural empathy with the characters, our brains process the reading of stories differently from the way we read factual information.
Our brains don't always recognise the difference between an imagined situation and a real one so the characters become 'alive' to us. What they say and do is therefore more meaningful.
This is why the words and structures that relate a story's events, descriptions and conversations are processed in this deeper way. In fact, cultures all around the world have always
used storytelling to pass knowledge from one generation to another. Our ancestors understood very well that this was the best way to make sure our histories and information about
how to relate to others and to our world was not only understood, but remembered too. (Notice that the word ‘history’ contains the word ‘story’ – More accurately, the word ‘story’
derives from ‘history’.) Encouraging your child to read or listen to stories should therefore help them to learn a second language in a way that is not only fun, but memorable.
Let's take a quick look at learning vocabulary within a factual text or within a story. Imagine the readers are eight-year-olds interested in animals. In your opinion, are they more
likely to remember AND want to continue reading the first or second text? """

# Break the passage on periods and keep the first 16 fragments as the batch
# (change the slice bound to adjust the batch size).
sentences = text.split(".")
texts = [sentence.strip() for sentence in sentences[:16]]

# Tokenize with padding/truncation to at most 32 tokens, then move to the GPU.
inputs = tokenizer(
    texts,
    max_length=32,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# int32 copies of the two tensors; these are what the model and the ONNX
# export below are called with.
input_ids, attention_mask = (
    inputs[key].to(torch.int32) for key in ("input_ids", "attention_mask")
)


In [None]:
# 3. Baseline PyTorch inference
with torch.no_grad():
    # Untimed warm-up pass: the first CUDA forward pays one-time costs
    # (kernel loading, library handle creation) that would otherwise be
    # reported as model latency.
    model(input_ids, attention_mask)
    # CUDA kernels are launched asynchronously; drain the queue so the timer
    # starts from an idle GPU.
    torch.cuda.synchronize()

    start = time.time()
    baseline_outputs = model(input_ids, attention_mask)
    # Wait for the GPU to finish before stopping the clock.
    torch.cuda.synchronize()
    end = time.time()
    baseline_time = end - start
    print(f"PyTorch latency: {baseline_time:.4f} sec")

print("Output shape (PyTorch):", baseline_outputs.last_hidden_state.shape)

In [None]:
# Export the PyTorch model to an ONNX file.
output_onnx_file = "roberta.onnx"

# Axis 0 (batch) and axis 1 (sequence length) stay symbolic for both inputs
# and for the first output; the second output only has a symbolic batch axis.
# A fresh dict is built per entry so no two entries share the same object.
dynamic_axes = {
    tensor_name: {0: "batch_size", 1: "seq_len"}
    for tensor_name in ("input_ids", "attention_mask", "logits")
}
dynamic_axes["other"] = {0: "batch_size"}

torch.onnx.export(
    model.float(),
    (input_ids, attention_mask),
    output_onnx_file,
    opset_version=17,
    input_names=["input_ids", "attention_mask"],
    # NOTE(review): the first output is named "logits" here, but later cells
    # compare output 0 against `last_hidden_state` -- confirm the naming.
    output_names=["logits", "other"],
    dynamic_axes=dynamic_axes,
)

print("save to onnx file:", output_onnx_file)

In [None]:
import onnxruntime as ort
import numpy as np

def run_onnx(onnx_model_pth, provider: str, warmup_runs: int = 1):
    """Run an ONNX model once with onnxruntime and print its latency.

    The model is fed the notebook-level ``inputs`` batch (``input_ids`` and
    ``attention_mask``), cast to int32 and copied to host memory as NumPy
    arrays.

    Args:
        onnx_model_pth: Path of the ONNX file to load.
        provider: Name of the onnxruntime execution provider to request,
            e.g. ``"CUDAExecutionProvider"`` or ``"CPUExecutionProvider"``.
        warmup_runs: Number of untimed runs executed before the timed one.
            The first run of a fresh session includes one-time setup work,
            so timing it would report cold-start cost rather than inference
            latency. Pass 0 to time the very first run.

    Returns:
        The list of output arrays returned by ``InferenceSession.run``.
    """
    # Build the feed dict once so warm-up and timed runs use identical inputs.
    feeds = {
        "input_ids": inputs["input_ids"].to(torch.int32).cpu().numpy(),
        "attention_mask": inputs["attention_mask"].to(torch.int32).cpu().numpy(),
    }

    # 4. Onnx inference
    sess = ort.InferenceSession(onnx_model_pth, providers=[provider])

    # Show which providers exist and which ones the session actually uses,
    # so a silent fallback to another provider is visible in the output.
    print("Available providers:", ort.get_available_providers())
    print("Current provider:", sess.get_providers())

    for _ in range(warmup_runs):
        sess.run(None, feeds)

    start = time.time()
    onnx_outputs = sess.run(None, feeds)
    end = time.time()
    onnx_time = end - start
    print(f"onnx runtime latency: {onnx_time:.4f} sec")
    return onnx_outputs

def measure_numeric_diff(onnx_tensor, pytorch_base_tensor, threshold=0.1):
    """Print summary statistics of the element-wise absolute difference.

    Args:
        onnx_tensor: Array produced by the ONNX model.
        pytorch_base_tensor: Reference array from the PyTorch model; must be
            broadcast-compatible with ``onnx_tensor``.
        threshold: Absolute-difference cut-off used for the "percentage of
            values" line. Defaults to 0.1, the value the computation has
            always used.

    Prints the mean absolute difference, the max absolute difference and the
    percentage of elements whose absolute difference exceeds ``threshold``.
    """
    diff = np.abs(pytorch_base_tensor - onnx_tensor)
    diff_mean = diff.mean()
    diff_max = diff.max()
    diff_percent = (diff > threshold).mean() * 100

    print("Mean absolute difference:", diff_mean)
    print("Max absolute difference:", diff_max)
    # Report the threshold that was actually applied; the label used to say
    # 0.01 while the comparison used 0.1.
    print(f"Percentage of values with diff > {threshold}:", diff_percent, "%")

In [None]:
# Run the exported model on the CUDA execution provider.
onnx_outputs = run_onnx(onnx_model_pth="roberta.onnx", provider="CUDAExecutionProvider")

# Compare the first ONNX output with the PyTorch last_hidden_state.
pytorch_base_tensor = baseline_outputs.last_hidden_state.cpu().numpy()
onnx_tensor = onnx_outputs[0]
measure_numeric_diff(onnx_tensor=onnx_tensor, pytorch_base_tensor=pytorch_base_tensor)

In [None]:
# Run the exported model on the CPU execution provider.
onnx_outputs_cpu = run_onnx(onnx_model_pth="roberta.onnx", provider="CPUExecutionProvider")

# Compare the first ONNX output with the PyTorch last_hidden_state.
pytorch_base_tensor = baseline_outputs.last_hidden_state.cpu().numpy()
onnx_tensor_cpu = onnx_outputs_cpu[0]
measure_numeric_diff(onnx_tensor=onnx_tensor_cpu, pytorch_base_tensor=pytorch_base_tensor)

In [None]:
!apt-get install -y build-essential cmake
!pip install --upgrade pip setuptools wheel
!pip install onnxsim --use-pep517 --no-build-isolation



In [None]:
# Run the onnxsim command-line tool on the exported model:
# input file roberta.onnx, output file roberta_simplified.onnx.
!onnxsim roberta.onnx roberta_simplified.onnx

In [None]:
# Run the simplified model on the CUDA execution provider. The result gets
# its own name so it does not overwrite the CPU outputs held in
# `onnx_outputs_cpu`.
onnx_outputs_simplified = run_onnx("roberta_simplified.onnx", "CUDAExecutionProvider")

# Check that simplification did not change the numerics: compare the first
# output with the PyTorch baseline, as done for the unsimplified model.
onnx_tensor_simplified = onnx_outputs_simplified[0]  # for last_hidden_state
measure_numeric_diff(
    onnx_tensor_simplified,
    baseline_outputs.last_hidden_state.cpu().numpy(),
)