In [1]:
!pip install -r requirements.txt

Ignoring colorama: markers 'platform_system == "Windows" and python_full_version >= "3.6.0" and python_version >= "3.6"' don't match your environment
Ignoring numpy: markers 'python_version >= "3.8" and python_full_version >= "3.6.0"' don't match your environment
Ignoring pyreadline3: markers 'sys_platform == "win32" and python_version >= "3.8" and (python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0")' don't match your environment
Collecting click==8.0.4
  Downloading click-8.0.4-py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 4.3 MB/s 
[?25hCollecting coloredlogs==15.0.1
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[K     |████████████████████████████████| 46 kB 3.4 MB/s 
Collecting huggingface-hub==0.4.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
[?25hCollecting humanfriendly==10.0
  Downloading humanfriendly-10.0-py2.py3-n

In [5]:
from transformers.onnx.features import FeaturesManager

# List the ONNX export features that transformers reports for the "bert" model type.
# The variable is named after the model type actually queried.
bert_features = list(FeaturesManager.get_supported_features_for_model_type("bert").keys())
print(bert_features)

['default', 'masked-lm', 'causal-lm', 'sequence-classification', 'token-classification', 'question-answering']


In [6]:
# Export the dslim/bert-base-NER checkpoint with the token-classification feature; the ONNX file is written under onnx/.
!python -m transformers.onnx --model=dslim/bert-base-NER --feature=token-classification onnx/

Downloading: 100% 59.0/59.0 [00:00<00:00, 40.1kB/s]
Downloading: 100% 829/829 [00:00<00:00, 669kB/s]
Downloading: 100% 208k/208k [00:00<00:00, 661kB/s]
Downloading: 100% 2.00/2.00 [00:00<00:00, 1.56kB/s]
Downloading: 100% 112/112 [00:00<00:00, 85.1kB/s]
Downloading: 100% 413M/413M [00:10<00:00, 41.9MB/s]
Using framework PyTorch: 1.10.2+cu102
Overriding 1 configuration item(s)
	- use_cache -> False
Validating ONNX model...
	-[✓] ONNX model output names match reference model ({'logits'})
	- Validating ONNX Model output "logits":
		-[✓] (2, 8, 9) matches (2, 8, 9)
		-[✓] all values close (atol: 1e-05)
All good, model saved at: onnx/model.onnx


In [7]:
import torch

from onnxruntime import (
    InferenceSession, SessionOptions, GraphOptimizationLevel
)
from transformers import (
    TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification
)

In [9]:
# Session configuration: request every graph optimization level ONNX Runtime offers.
options = SessionOptions()
options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

# Load the exported graph and restrict execution to the CPU provider.
session = InferenceSession(
    "onnx/model.onnx",
    sess_options=options,
    providers=["CPUExecutionProvider"],
)

# Turn off the session.run() fallback mechanism so that the execution provider
# selected above is not reset.
session.disable_fallback()

In [10]:
from onnxruntime import get_all_providers

# Display every execution-provider name returned by get_all_providers().
# NOTE(review): this appears to be the full list of providers ONNX Runtime knows about,
# not necessarily the ones usable in this install — confirm whether
# get_available_providers() was the intended call.
get_all_providers()

['TensorrtExecutionProvider',
 'CUDAExecutionProvider',
 'MIGraphXExecutionProvider',
 'ROCMExecutionProvider',
 'OpenVINOExecutionProvider',
 'DnnlExecutionProvider',
 'NupharExecutionProvider',
 'VitisAIExecutionProvider',
 'NnapiExecutionProvider',
 'CoreMLExecutionProvider',
 'ArmNNExecutionProvider',
 'ACLExecutionProvider',
 'DmlExecutionProvider',
 'RknpuExecutionProvider',
 'CPUExecutionProvider']

In [11]:
class OnnxTokenClassificationPipeline(TokenClassificationPipeline):
    """Token-classification pipeline whose forward pass runs on ONNX Runtime.

    Only ``preprocess`` and ``_forward`` are overridden; everything else is
    inherited from ``TokenClassificationPipeline``. Inference goes through the
    module-level ``session`` (an ``onnxruntime.InferenceSession``), which must
    be defined before the pipeline is called.
    """

    def _forward(self, model_inputs):
        """
        Forward pass through the model. This method is not to be called by the user directly and is only used
        by the pipeline to perform the actual predictions.

        Instead of calling the PyTorch model, the tokenized inputs are converted to numpy arrays and run
        through the global ONNX Runtime ``session``.

        Args:
            model_inputs: the mapping produced by ``preprocess``. It must contain ``"special_tokens_mask"``
                and ``"sentence"``; ``"offset_mapping"`` is optional.

        Returns:
            dict: ``"logits"`` (a torch tensor built from the first session output),
            ``"special_tokens_mask"``, ``"offset_mapping"``, ``"sentence"``, plus every remaining
            entry of ``model_inputs``.
        """

        # This comes from the original implementation of the pipeline: these entries are
        # bookkeeping for post-processing, not model inputs.
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")

        # Feed only the tensors that the exported graph declares as inputs, so that an extra
        # tokenizer output (for a model exported without that input) does not make run() fail.
        graph_input_names = {node.name for node in session.get_inputs()}
        inputs = {
            name: tensor.cpu().detach().numpy()
            for name, tensor in model_inputs.items()
            if name in graph_input_names
        }  # dict of numpy arrays
        outputs_name = session.get_outputs()[0].name  # name of the first output tensor

        logits = session.run(output_names=[outputs_name], input_feed=inputs)[0]  # run the session
        logits = torch.tensor(logits)  # torch tensor, to stay compatible with the original implementation

        return {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            **model_inputs,
        }

    # We need to override the preprocess method because the ONNX model expects the attention mask
    # as an input along with the input ids.
    def preprocess(self, sentence, offset_mapping=None):
        """
        Tokenize ``sentence`` into the inputs consumed by ``_forward``.

        Args:
            sentence: the text to tokenize; it is also stored under ``"sentence"`` in the result.
            offset_mapping: optional precomputed offsets; when truthy they replace the tokenizer's own
                ``"offset_mapping"`` entry.

        Returns:
            The tokenizer output (tensors for ``self.framework``) including the attention mask and the
            special-tokens mask, extended with the ``"sentence"`` entry.
        """
        max_length = self.tokenizer.model_max_length
        # Truncate only when the tokenizer advertises a positive maximum length.
        truncation = bool(max_length and max_length > 0)
        model_inputs = self.tokenizer(
            sentence,
            return_attention_mask=True,  # This is the only difference from the original implementation
            return_tensors=self.framework,
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
        )
        if offset_mapping:
            model_inputs["offset_mapping"] = offset_mapping

        model_inputs["sentence"] = sentence

        return model_inputs

In [17]:
# Hub checkpoint that provides both the tokenizer and the PyTorch model.
model_name_from_hub = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(model_name_from_hub)
model = AutoModelForTokenClassification.from_pretrained(model_name_from_hub)

# NER pipeline built on the ONNX-backed subclass; entities are merged with the "simple" strategy.
onnx_pipeline = OnnxTokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    task="ner",
    aggregation_strategy="simple",
)

In [13]:
# Sample sentence for a quick check of the ONNX pipeline (adjacent literals form one string).
sequence = (
    "Apple was founded in 1976 by Steve Jobs, Steve Wozniak and Ronald Wayne "
    "to develop and sell Wozniak's Apple I personal computer"
)

onnx_pipeline(sequence)

[{'end': 5,
  'entity_group': 'ORG',
  'score': 0.9978969,
  'start': 0,
  'word': 'Apple'},
 {'end': 39,
  'entity_group': 'PER',
  'score': 0.9981243,
  'start': 29,
  'word': 'Steve Jobs'},
 {'end': 54,
  'entity_group': 'PER',
  'score': 0.9741297,
  'start': 41,
  'word': 'Steve Wozniak'},
 {'end': 71,
  'entity_group': 'PER',
  'score': 0.99970996,
  'start': 59,
  'word': 'Ronald Wayne'},
 {'end': 99,
  'entity_group': 'PER',
  'score': 0.86664414,
  'start': 92,
  'word': 'Wozniak'},
 {'end': 109,
  'entity_group': 'MISC',
  'score': 0.99852806,
  'start': 102,
  'word': 'Apple I'}]

In [14]:
# Reference pipeline: same model, tokenizer and settings, but using the stock PyTorch forward pass.
pytorch_pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    task="ner",
    aggregation_strategy="simple",
)

In [15]:
# Benchmark inputs of increasing length, keyed by a descriptive label.
sequences = {
    "short_sequence": "Hello my name is Thomas and I love HuggingFace.",
    "medium_sequence": (
        "Winston Churchill was born in 1874 in Stoke-on-Trent, England, "
        "to a German father, William and Elizabeth Churchill."
    ),
    # Built from adjacent string literals so that source indentation, line breaks and
    # trailing spaces do not end up inside the benchmark text.
    "long_sequence": (
        "The first person to reach the summit of Everest was the South Nepalese Everest Gurun, "
        "who was a member of the Royal Nepal Expedition, led by the Nepalese Mountaineer, Sir Edmund Hillary. "
        "Hilary lived in the Himalayas for a time. He sadly died in 1953 at the age of 88."
    ),
}

In [18]:
import timeit

# Number of calls per measurement; each reported figure is the total time in seconds for this many calls.
N_RUNS = 300

# First row holds the column headers; one row per benchmarked sequence follows.
results = [["Sequence Length", "PyTorch", "ONNX"]]
for k, v in sequences.items():
    # One untimed call per pipeline so that one-off start-up work is not charged to the timed runs.
    pytorch_pipeline(v)
    onnx_pipeline(v)

    pytorch_seconds = timeit.timeit(lambda: pytorch_pipeline(v), number=N_RUNS)
    onnx_seconds = timeit.timeit(lambda: onnx_pipeline(v), number=N_RUNS)
    results.append([k, pytorch_seconds, onnx_seconds])

In [19]:
from tabulate import tabulate

# Render the timings as a plain-text table; headers="firstrow" takes the column headers from results[0].
print(tabulate(results, headers="firstrow"))

Sequence Length      PyTorch     ONNX
-----------------  ---------  -------
short_sequence       31.3778  21.4062
medium_sequence      44.9581  30.7068
long_sequence        83.5834  58.7696


In [20]:
# Speed-up = PyTorch time / ONNX time, read from rows 1-3 of `results` (row 0 holds the headers).
for row_index, label in enumerate(("short", "medium", "long"), start=1):
    pytorch_time, onnx_time = results[row_index][1], results[row_index][2]
    print(f"For a {label} sequence: ONNX is {pytorch_time/onnx_time:.2f}x faster than PyTorch")

For a short sequence: ONNX is 1.47x faster than PyTorch
For a medium sequence: ONNX is 1.46x faster than PyTorch
For a long sequence: ONNX is 1.42x faster than PyTorch
