In [None]:
# Let's convert a popular pre-trained BERT sentence-similarity embedding model


!pip install transformers
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 48.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 31.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85

In [None]:
from sentence_transformers import SentenceTransformer,util

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings.
        attention_mask: Mask with one entry per token; positions set to 0 are
            excluded from the average.

    Returns:
        One pooled embedding per sequence (the token axis, dim 1, is reduced).
    """
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    # Broadcast the mask over the embedding axis so it can weight every feature.
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # The clamp avoids a division by zero for a sequence that is fully masked.
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Fetch the tokenizer and the encoder for the same HuggingFace Hub checkpoint.
checkpoint = 'sentence-transformers/all-MiniLM-L12-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

#

In [None]:
# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'This is sample of the sentence']

import time

# perf_counter is monotonic and high-resolution, which suits a benchmark that
# only lasts a few milliseconds better than the wall clock.
start = time.perf_counter()

encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
cosine_scores = util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1])

end = time.perf_counter()

print(end - start)
# Divide by the actual batch size instead of a hard-coded 2.
print(f"pytorch vanilla cpu: {(end- start)/len(sentences):.2f}s/sequence")

# Kept as the last expression of the cell so the notebook renders the score.
cosine_scores

tensor([[[ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119],
         [-0.1315,  0.6603,  0.1879,  ...,  0.1245,  0.7793,  0.3559],
         [-0.3818,  0.8268, -0.1493,  ...,  0.5605,  0.6444,  0.0144],
         ...,
         [ 0.3050,  0.1931,  0.5416,  ...,  0.2212, -0.1725, -1.3564],
         [ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119],
         [ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119]],

        [[ 0.1090,  0.1189,  0.0785,  ...,  0.1729, -0.0939,  0.1119],
         [-0.0034,  1.0699,  0.3069,  ...,  0.0838,  0.6371,  0.5188],
         [ 0.0979,  0.6687, -0.0734,  ...,  0.3327,  0.6669,  0.1522],
         ...,
         [ 0.2591,  0.7813, -0.0958,  ...,  0.0458,  0.2115,  0.4994],
         [ 0.3489,  0.2220,  0.3398,  ...,  0.0702, -0.4704, -1.0491],
         [ 0.1091,  0.1189,  0.0785,  ...,  0.1729, -0.0939,  0.1119]]])
0.04085946083068848
pytorch vanilla cpu: 0.02s/sequence


In [None]:
# Export the PyTorch encoder to ONNX.
#
# The example inputs are handed to the model positionally, so they are picked
# explicitly in the same order as `input_names`. tuple(encoded_input.values())
# follows the tokenizer's key order instead, and BERT-style tokenizers typically
# emit token_type_ids before attention_mask, which would silently swap the two.
# NOTE(review): this order is assumed to match the model's forward() positional
# parameters (input_ids, attention_mask, token_type_ids) - confirm for other models.
export_input_names = ['input_ids', 'attention_mask', 'token_type_ids']

# Batch size and sequence length vary at inference time for every input and
# for the exported output.
export_dynamic_axes = {
    name: {0: 'batch_size', 1: 'sequence'}
    for name in export_input_names + ['logits']
}

torch.onnx.export(
    model,
    tuple(encoded_input[name] for name in export_input_names),
    f="torch-model.onnx",
    input_names=export_input_names,
    # 'logits' labels the first model output; the name is kept because the
    # inference cells request the output under this name.
    output_names=['logits'],
    dynamic_axes=export_dynamic_axes,
    do_constant_folding=True,
    opset_version=13,
)

In [None]:
!pip install onnxruntime 
!pip install sentencepiece

In [None]:
import onnxruntime
import time
# Open the exported graph with ONNX Runtime, restricted to the CPU provider.
ort_session = onnxruntime.InferenceSession(
    "torch-model.onnx",
    providers=["CPUExecutionProvider"],
)

def to_numpy(tensor):
    """Return the contents of a torch tensor as a NumPy array on the CPU.

    The tensor is always detached first, so tensors that track gradients can be
    converted as well. The original wrote ``tensor.detach.cpu()`` without
    calling ``detach``, which raised AttributeError on that path.
    """
    return tensor.detach().cpu().numpy()

def run_inference(input):
    """Tokenize `input` and run it through the global ONNX Runtime session.

    Args:
        input: Text (or list of texts) passed straight to the global `tokenizer`.

    Returns:
        A pair ``(ort_outs, attention_mask)``: the list returned by
        ``ort_session.run`` for the 'logits' output, and the attention mask as
        a torch tensor so the caller can pool the token embeddings.
    """
    encoded = tokenizer(input, padding=True, truncation=True, return_tensors="pt")
    # Feed each input as one batched NumPy array. The original overwrote the
    # tokenizer output in place with Python lists of per-row arrays.
    ort_inputs = {
        name: to_numpy(encoded[name])
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    ort_outs = ort_session.run(['logits'], ort_inputs)
    return ort_outs, encoded['attention_mask']



# perf_counter is monotonic and high-resolution, which suits short benchmarks.
start = time.perf_counter()

output, attention_mask = run_inference(sentences)

# Wrap the ONNX Runtime arrays as tensors; element 0 is the requested 'logits'
# output, which is what mean_pooling reads.
onnx_output = [torch.from_numpy(array) for array in output]

# Perform pooling on the ONNX result. The original pooled `model_output`, the
# stale PyTorch result from an earlier cell, so the ONNX output was never used.
sentence_embeddings = mean_pooling(onnx_output, attention_mask)

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
cosine_scores = util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1])

end = time.perf_counter()

print(end - start)
# Divide by the actual batch size instead of a hard-coded 2.
print(f"onnx cpu: {(end- start)/len(sentences):.2f}s/sequence")

# Kept as the last expression of the cell so the notebook renders the score.
cosine_scores

In [None]:
# Second timed run of the same ONNX session.
start = time.perf_counter()

output, attention_mask = run_inference(sentences)

# Wrap the ONNX Runtime arrays as tensors; element 0 is the requested 'logits'
# output, which is what mean_pooling reads.
onnx_output = [torch.from_numpy(array) for array in output]

# Perform pooling on the ONNX result. The original pooled `model_output`, the
# stale PyTorch result from an earlier cell, so the ONNX output was never used.
sentence_embeddings = mean_pooling(onnx_output, attention_mask)

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
cosine_scores = util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1])

end = time.perf_counter()

print(end - start)
# Divide by the actual batch size instead of a hard-coded 2.
print(f"onnx cpu: {(end- start)/len(sentences):.2f}s/sequence")

# Kept as the last expression of the cell so the notebook renders the score.
cosine_scores

tensor([[[ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119],
         [-0.1315,  0.6603,  0.1879,  ...,  0.1245,  0.7793,  0.3559],
         [-0.3818,  0.8268, -0.1493,  ...,  0.5605,  0.6444,  0.0144],
         ...,
         [ 0.3050,  0.1931,  0.5416,  ...,  0.2212, -0.1725, -1.3564],
         [ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119],
         [ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119]],

        [[ 0.1090,  0.1189,  0.0785,  ...,  0.1729, -0.0939,  0.1119],
         [-0.0034,  1.0699,  0.3069,  ...,  0.0838,  0.6371,  0.5188],
         [ 0.0979,  0.6687, -0.0734,  ...,  0.3327,  0.6669,  0.1522],
         ...,
         [ 0.2591,  0.7813, -0.0958,  ...,  0.0458,  0.2115,  0.4994],
         [ 0.3489,  0.2220,  0.3398,  ...,  0.0702, -0.4704, -1.0491],
         [ 0.1091,  0.1189,  0.0785,  ...,  0.1729, -0.0939,  0.1119]]])
0.029510021209716797
onnx cpu: 0.01s/sequence


In [None]:
#onnx optimization


!pip3 install onnxoptimizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting onnxoptimizer
  Downloading onnxoptimizer-0.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (602 kB)
[K     |████████████████████████████████| 602 kB 5.1 MB/s 
[?25hCollecting onnx
  Downloading onnx-1.12.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[K     |████████████████████████████████| 13.1 MB 43.5 MB/s 
Installing collected packages: onnx, onnxoptimizer
Successfully installed onnx-1.12.0 onnxoptimizer-0.3.1


ONNX optimization (dynamic quantization)

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Paths of the exported fp32 graph and of the quantized copy to be written.
model_fp32, model_quant = 'torch-model.onnx', 'model.quant.onnx'

# Dynamically quantize the fp32 graph and write the result to `model_quant`.
quantized_model = quantize_dynamic(
    model_fp32,
    model_quant,
)

In [None]:
import onnxruntime
import time
# Point the global session at the quantized graph, restricted to the CPU provider.
ort_session = onnxruntime.InferenceSession(
    "model.quant.onnx",
    providers=["CPUExecutionProvider"],
)

def to_numpy(tensor):
    """Return the contents of a torch tensor as a NumPy array on the CPU.

    The tensor is always detached first, so tensors that track gradients can be
    converted as well. The original wrote ``tensor.detach.cpu()`` without
    calling ``detach``, which raised AttributeError on that path.
    """
    return tensor.detach().cpu().numpy()

def run_inference(input):
    """Tokenize `input` and run it through the global ONNX Runtime session.

    Args:
        input: Text (or list of texts) passed straight to the global `tokenizer`.

    Returns:
        A pair ``(ort_outs, attention_mask)``: the list returned by
        ``ort_session.run`` for the 'logits' output, and the attention mask as
        a torch tensor so the caller can pool the token embeddings.
    """
    encoded = tokenizer(input, padding=True, truncation=True, return_tensors="pt")
    # Feed each input as one batched NumPy array. The original overwrote the
    # tokenizer output in place with Python lists of per-row arrays.
    ort_inputs = {
        name: to_numpy(encoded[name])
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    ort_outs = ort_session.run(['logits'], ort_inputs)
    return ort_outs, encoded['attention_mask']








# Timed run against the session currently bound to `ort_session`.
start = time.perf_counter()

output, attention_mask = run_inference(sentences)

# Wrap the ONNX Runtime arrays as tensors; element 0 is the requested 'logits'
# output, which is what mean_pooling reads.
onnx_output = [torch.from_numpy(array) for array in output]

# Perform pooling on the ONNX result. The original pooled `model_output`, the
# stale PyTorch result from an earlier cell, so this session's output was
# never used.
sentence_embeddings = mean_pooling(onnx_output, attention_mask)

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
cosine_scores = util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1])

end = time.perf_counter()

print(end - start)
# Divide by the actual batch size instead of a hard-coded 2.
print(f"onnx cpu: {(end- start)/len(sentences):.2f}s/sequence")

# Kept as the last expression of the cell so the notebook renders the score.
cosine_scores

tensor([[[ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119],
         [-0.1315,  0.6603,  0.1879,  ...,  0.1245,  0.7793,  0.3559],
         [-0.3818,  0.8268, -0.1493,  ...,  0.5605,  0.6444,  0.0144],
         ...,
         [ 0.3050,  0.1931,  0.5416,  ...,  0.2212, -0.1725, -1.3564],
         [ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119],
         [ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119]],

        [[ 0.1090,  0.1189,  0.0785,  ...,  0.1729, -0.0939,  0.1119],
         [-0.0034,  1.0699,  0.3069,  ...,  0.0838,  0.6371,  0.5188],
         [ 0.0979,  0.6687, -0.0734,  ...,  0.3327,  0.6669,  0.1522],
         ...,
         [ 0.2591,  0.7813, -0.0958,  ...,  0.0458,  0.2115,  0.4994],
         [ 0.3489,  0.2220,  0.3398,  ...,  0.0702, -0.4704, -1.0491],
         [ 0.1091,  0.1189,  0.0785,  ...,  0.1729, -0.0939,  0.1119]]])
0.01838827133178711
onnx cpu: 0.01s/sequence


ONNX graph optimization

In [None]:
# Session configuration used to turn on ONNX Runtime graph optimization.
sess_options = onnxruntime.SessionOptions()

# Select the extended graph optimization level.
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
# Setting this path enables serialization of the model after graph optimization.
sess_options.optimized_model_filepath = "optimizegraphmodel.onnx"

ort_session = onnxruntime.InferenceSession(
    "torch-model.onnx",
    sess_options,
    providers=["CPUExecutionProvider"],
)

def to_numpy(tensor):
    """Return the contents of a torch tensor as a NumPy array on the CPU.

    The tensor is always detached first, so tensors that track gradients can be
    converted as well. The original wrote ``tensor.detach.cpu()`` without
    calling ``detach``, which raised AttributeError on that path.
    """
    return tensor.detach().cpu().numpy()

def run_inference(input):
    """Tokenize `input` and run it through the global ONNX Runtime session.

    Args:
        input: Text (or list of texts) passed straight to the global `tokenizer`.

    Returns:
        A pair ``(ort_outs, attention_mask)``: the list returned by
        ``ort_session.run`` for the 'logits' output, and the attention mask as
        a torch tensor so the caller can pool the token embeddings.
    """
    encoded = tokenizer(input, padding=True, truncation=True, return_tensors="pt")
    # Feed each input as one batched NumPy array. The original overwrote the
    # tokenizer output in place with Python lists of per-row arrays.
    ort_inputs = {
        name: to_numpy(encoded[name])
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    ort_outs = ort_session.run(['logits'], ort_inputs)
    return ort_outs, encoded['attention_mask']








# Timed run against the session currently bound to `ort_session`.
start = time.perf_counter()

output, attention_mask = run_inference(sentences)

# Wrap the ONNX Runtime arrays as tensors; element 0 is the requested 'logits'
# output, which is what mean_pooling reads.
onnx_output = [torch.from_numpy(array) for array in output]

# Perform pooling on the ONNX result. The original pooled `model_output`, the
# stale PyTorch result from an earlier cell, so this session's output was
# never used.
sentence_embeddings = mean_pooling(onnx_output, attention_mask)

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
cosine_scores = util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1])

end = time.perf_counter()

print(end - start)
# Divide by the actual batch size instead of a hard-coded 2.
print(f"onnx cpu: {(end- start)/len(sentences):.2f}s/sequence")

# Kept as the last expression of the cell so the notebook renders the score.
cosine_scores

tensor([[[ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119],
         [-0.1315,  0.6603,  0.1879,  ...,  0.1245,  0.7793,  0.3559],
         [-0.3818,  0.8268, -0.1493,  ...,  0.5605,  0.6444,  0.0144],
         ...,
         [ 0.3050,  0.1931,  0.5416,  ...,  0.2212, -0.1725, -1.3564],
         [ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119],
         [ 0.0519, -0.0301,  0.2138,  ...,  0.1786,  0.0573, -0.0119]],

        [[ 0.1090,  0.1189,  0.0785,  ...,  0.1729, -0.0939,  0.1119],
         [-0.0034,  1.0699,  0.3069,  ...,  0.0838,  0.6371,  0.5188],
         [ 0.0979,  0.6687, -0.0734,  ...,  0.3327,  0.6669,  0.1522],
         ...,
         [ 0.2591,  0.7813, -0.0958,  ...,  0.0458,  0.2115,  0.4994],
         [ 0.3489,  0.2220,  0.3398,  ...,  0.0702, -0.4704, -1.0491],
         [ 0.1091,  0.1189,  0.0785,  ...,  0.1729, -0.0939,  0.1119]]])
0.03601384162902832
onnx cpu: 0.02s/sequence
