# BERT with ONNX

In [1]:
import time
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import onnxruntime as ort

# Checkpoint identifier passed to `from_pretrained` below: a DistilBERT
# sequence classifier fine-tuned on SST-2 (the notebook later decodes its two
# output classes as negative / positive).
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample sentence that is reused for benchmarking and for the ONNX export.
text = "This movie was absolutely fantastic! I loved every moment of it."

# Fetch the tokenizer and the classifier for `model_name`; `.eval()` returns
# the module itself after switching it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval()

# Tokenize the sample sentence into PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")



In [3]:
def benchmark_pytorch_inference(model, inputs, num_runs=100, warmup_runs=10):
    """Measure the average latency of one forward pass of a PyTorch model.

    Args:
        model: Callable invoked as ``model(**inputs)``.
        inputs: Mapping of keyword arguments unpacked into each call.
        num_runs: Number of timed calls; must be at least 1.
        warmup_runs: Number of untimed calls made before timing starts
            (defaults to 10, the value that used to be hard-coded).

    Returns:
        Average time per timed call, in seconds.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1 (previously this caused
            a ZeroDivisionError for 0 and a negative "latency" for negatives).
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be a positive integer, got {num_runs}")

    with torch.no_grad():
        # Untimed warm-up calls so one-off start-up costs are not measured.
        for _ in range(warmup_runs):
            model(**inputs)

        # perf_counter() is the clock intended for measuring short durations;
        # time.time() can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        for _ in range(num_runs):
            model(**inputs)
        elapsed = time.perf_counter() - start_time

    return elapsed / num_runs



### PyTorch inference benchmarking

In [4]:
# Time the PyTorch model on the sample input and report the per-run average.
pytorch_time = benchmark_pytorch_inference(model=model, inputs=inputs, num_runs=100)
print(f"PyTorch Inference Time: {pytorch_time:.6f} sec/run")

PyTorch Inference Time: 0.025096 sec/run


## Export the Model to ONNX


In [5]:
# Tokenize the sample sentence again to obtain the example tensors that are
# handed to the ONNX exporter.
dummy_inputs = tokenizer(text, return_tensors="pt")
dummy_input_ids, dummy_attention_mask = (
    dummy_inputs["input_ids"],
    dummy_inputs["attention_mask"],
)

In [6]:
# Declare which dimensions may vary at inference time: batch and sequence
# length for both inputs, batch only for the logits.
onnx_dynamic_axes = {
    "input_ids": {0: "batch_size", 1: "sequence_length"},
    "attention_mask": {0: "batch_size", 1: "sequence_length"},
    "logits": {0: "batch_size"},
}
onnx_path = "distilbert_sst2.onnx"

# Export the model to ONNX, feeding the example tensors positionally.
torch.onnx.export(
    model,
    (dummy_input_ids, dummy_attention_mask),
    onnx_path,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes=onnx_dynamic_axes,
    opset_version=11,
)
print("✅ Model exported to ONNX!")

  mask, torch.tensor(torch.finfo(scores.dtype).min)


verbose: False, log level: Level.ERROR

✅ Model exported to ONNX!


In [7]:
# Open the exported graph with ONNX Runtime, restricted to the CPU provider
# so both benchmarks run on the same device.
session = ort.InferenceSession(
    "distilbert_sst2.onnx",
    providers=["CPUExecutionProvider"],
)

# ONNX Runtime is fed NumPy arrays, so convert the example tensors.
onnx_inputs = dict(
    input_ids=dummy_input_ids.numpy(),
    attention_mask=dummy_attention_mask.numpy(),
)

In [8]:
def benchmark_onnx_inference(session, inputs_np, num_runs=100, warmup_runs=10):
    """Measure the average latency of one ``session.run`` call.

    Args:
        session: Object exposing ``run(output_names, input_feed)``; it is
            called as ``session.run(None, inputs_np)``.
        inputs_np: Input feed passed unchanged to every call.
        num_runs: Number of timed calls; must be at least 1.
        warmup_runs: Number of untimed calls made before timing starts
            (defaults to 10, the value that used to be hard-coded).

    Returns:
        Average time per timed call, in seconds.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1 (previously this caused
            a ZeroDivisionError for 0 and a negative "latency" for negatives).
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be a positive integer, got {num_runs}")

    # Untimed warm-up calls so one-off start-up costs are not measured.
    for _ in range(warmup_runs):
        session.run(None, inputs_np)

    # perf_counter() is the clock intended for measuring short durations;
    # time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    for _ in range(num_runs):
        session.run(None, inputs_np)
    elapsed = time.perf_counter() - start_time

    return elapsed / num_runs

## ONNX benchmarking

In [9]:
# Time the ONNX Runtime session on the same sample input.
onnx_time = benchmark_onnx_inference(session=session, inputs_np=onnx_inputs, num_runs=100)
print(f"ONNX Inference Time: {onnx_time:.6f} sec/run")


ONNX Inference Time: 0.006174 sec/run


In [10]:
# Compare the speeds: ratio of the two per-run averages measured above.
# A value above 1 means the ONNX Runtime run was faster than the PyTorch run.
speedup = pytorch_time / onnx_time
print(f"Speedup (PyTorch / ONNX): {speedup:.2f}x faster")

Speedup (PyTorch / ONNX): 4.06x faster


In [11]:
# Re-open the exported model on the CPU provider. This repeats the session
# construction from the benchmarking section with identical arguments.
session = ort.InferenceSession("distilbert_sst2.onnx", providers=['CPUExecutionProvider'])

def run_onnx_inference(text):
    """Classify ``text`` with the exported ONNX model and return probabilities.

    Uses the notebook-level ``tokenizer`` and ``session`` objects.

    Args:
        text: Input passed to the tokenizer.

    Returns:
        NumPy array of softmax probabilities computed over axis 1 of the
        model's first output (presumably shaped ``(batch, num_labels)`` --
        verify against the exported graph).
    """
    # Tokenize straight to NumPy arrays, which is what ONNX Runtime consumes.
    encoded = tokenizer(text, return_tensors="np")

    # The exported graph declares exactly these two named inputs.
    onnx_inputs = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }

    # The first (and only named) output of the exported graph is "logits".
    logits = session.run(None, onnx_inputs)[0]

    # Numerically stable softmax: subtracting the per-row maximum does not
    # change the result mathematically, but keeps np.exp from overflowing to
    # inf (which would turn the division into nan) for large logits.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)

### Example use

In [12]:
# Run the ONNX pipeline end to end on the sample review and show the result.
text = "This movie was absolutely fantastic! I loved every moment of it."
probabilities = run_onnx_inference(text=text)
print("Probabilities:", probabilities)


Probabilities: [[1.1796260e-04 9.9988204e-01]]


In [15]:
# Take the highest-probability class of the first row; index 1 is reported
# as positive, anything else as negative.
predicted_index = probabilities.argmax(axis=1)[0]
if predicted_index == 1:
    label = "positive"
else:
    label = "negative"
print(label)

positive
