In [1]:
!pip install transformers
!pip install onnxruntime-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 30.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [2]:
# base inference code
import torch
from transformers import AutoModel, AutoTokenizer

# Pretrained BERT encoder used by the PyTorch baseline measurements.
model = AutoModel.from_pretrained("bert-base-uncased")
# AutoTokenizer hands back the Rust-backed "fast" tokenizer by default. Ask for
# the pure-Python implementation explicitly so that `tokenizer` really is the
# slow baseline that the BertTokenizerFast measurements are compared against.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

# 510 words plus the special tokens added by the tokenizer; the later benchmark
# cell refers to this input as the "max token length" case.
encoded_input = tokenizer(" ".join(["hello"]*510), return_tensors='pt')
# Inference only: disable autograd so no graph is recorded for the forward pass.
with torch.no_grad():
  output = model(**encoded_input)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [3]:
# Baseline performance: tokenizer from the first cell + PyTorch model.
# Tokenization is deliberately inside the timed loop, so the figure covers the
# whole text -> hidden-states path for 10 runs.
import time

# perf_counter is the monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
with torch.no_grad():  # inference only: do not record an autograd graph
  for _ in range(10):
    encoded_input = tokenizer(" ".join(["hello"]*510), return_tensors='pt')
    output = model(**encoded_input)
end = time.perf_counter()

print("cpu:", end-start)

cpu: 15.62102460861206


In [4]:
# Performance with the fast tokenizer: same PyTorch model, but the input is
# produced by BertTokenizerFast. Tokenization stays inside the timed loop.
from transformers import BertTokenizerFast

fast_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# perf_counter is the monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
with torch.no_grad():  # inference only: do not record an autograd graph
  for _ in range(10):
    encoded_input = fast_tokenizer(" ".join(["hello"]*510), return_tensors='pt')
    output = model(**encoded_input)
end = time.perf_counter()

print("cpu + fast tokenizer:", end-start)

In [6]:
# serialization with onnx
! python -m transformers.onnx --model=bert-base-uncased onnx

2022-07-10 14:48:45.223863: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceCla

In [7]:
# Load the exported ONNX model and benchmark it on CPU (fast tokenizer +
# ONNX Runtime). Tokenization stays inside the timed loop.
import onnxruntime as ort

# Restrict ONNX Runtime to the CPU execution provider for this measurement.
session = ort.InferenceSession("onnx/model.onnx", providers=['CPUExecutionProvider'])

# perf_counter is the monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for _ in range(10):
  # ONNX Runtime is fed NumPy arrays, hence return_tensors='np'.
  encoded_input = fast_tokenizer(" ".join(["hello"]*510), return_tensors='np')
  # None -> return every model output.
  temp = session.run(None, input_feed=dict(encoded_input))
end = time.perf_counter()

print("cpu + fast tokenizer + onnx:", end - start)

In [10]:
# Load a second copy of the model and move it to the GPU.
model_gpu = AutoModel.from_pretrained("bert-base-uncased").to("cuda")

# First inference to initialize the GPU, so that one-time start-up cost is not
# charged to the timed runs later on.
encoded_input = fast_tokenizer(" ".join(["hello"]*510), return_tensors='pt').to("cuda")
with torch.no_grad():  # inference only: do not keep an autograd graph on the GPU
  output = model_gpu(**encoded_input)
# CUDA kernels are launched asynchronously; block until the warm-up has
# actually finished executing.
torch.cuda.synchronize()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# ONNX Runtime session on the GPU. This rebinds `session`, which previously
# pointed at the CPU-only session.
session = ort.InferenceSession("onnx/model.onnx", providers=['CUDAExecutionProvider'])
# If the CUDA provider cannot be loaded, ONNX Runtime may fall back to CPU and
# the "gpu" numbers below would silently be CPU numbers. Fail loudly instead.
assert "CUDAExecutionProvider" in session.get_providers(), (
    "CUDAExecutionProvider is not active; active providers: %s" % session.get_providers()
)

# First inference to initialize the GPU, so that one-time start-up cost is not
# charged to the timed runs later on.
encoded_input = fast_tokenizer(" ".join(["hello"]*510), return_tensors='np')
temp = session.run(None, input_feed=dict(encoded_input))

gpu + fast tokenizer: 0.3705315589904785


In [17]:
# GPU benchmarks: 10 runs per configuration, tokenization included in the
# timed region.
#
# Timing notes:
# - CUDA kernels are launched asynchronously, so every PyTorch measurement is
#   bracketed by torch.cuda.synchronize(); otherwise the clock can stop while
#   the GPU is still working and the PyTorch numbers come out too low.
# - session.run() hands back host-side results, so the ONNX Runtime loops do
#   not need an explicit synchronize.
# - time.perf_counter() is the monotonic, high-resolution benchmarking clock.
# - PyTorch runs are wrapped in torch.no_grad(): this is inference only.

### max token length ###

# torch model without fast tokenizer
torch.cuda.synchronize()  # make sure earlier GPU work is not billed to this run
start = time.perf_counter()
with torch.no_grad():
  for _ in range(10):
    encoded_input = tokenizer(" ".join(["hello"]*510), return_tensors='pt').to("cuda")
    output = model_gpu(**encoded_input)
torch.cuda.synchronize()  # wait for queued kernels before stopping the clock
end = time.perf_counter()
print("gpu base:", end-start)

# torch model
torch.cuda.synchronize()
start = time.perf_counter()
with torch.no_grad():
  for _ in range(10):
    encoded_input = fast_tokenizer(" ".join(["hello"]*510), return_tensors='pt').to("cuda")
    output = model_gpu(**encoded_input)
torch.cuda.synchronize()
end = time.perf_counter()
print("gpu + fast tokenizer:", end-start)

# onnx
start = time.perf_counter()
for _ in range(10):
  encoded_input = fast_tokenizer(" ".join(["hello"]*510), return_tensors='np')
  temp = session.run(None, input_feed=dict(encoded_input))
end = time.perf_counter()

print("gpu + fast tokenizer + onnx:", end - start)

### short token length ###

# torch model
torch.cuda.synchronize()
start = time.perf_counter()
with torch.no_grad():
  for _ in range(10):
    encoded_input = fast_tokenizer("hello", return_tensors='pt').to("cuda")
    output = model_gpu(**encoded_input)
torch.cuda.synchronize()
end = time.perf_counter()
print("short token length, gpu + fast tokenizer:", end-start)

# onnx model
start = time.perf_counter()
for _ in range(10):
  encoded_input = fast_tokenizer("hello", return_tensors='np')
  temp = session.run(None, input_feed=dict(encoded_input))
end = time.perf_counter()

print("short token length, gpu + fast tokenizer + onnx:", end - start)

gpu + fast tokenizer + onnx, max token length: 0.38298678398132324
