In [9]:
# Some standard imports
import numpy as np

from torch import nn
import torch.onnx
from new_transformer import Transformer
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic

In [10]:

device = torch.device("cpu")
model = Transformer(
    vocab_size=1210,
    n_head=6,
    embed_size=600,
    context_length=100,
    dropout=0.1,
    num_layers=6,
    device=device,
)
model.load_state_dict(torch.load("saved_model_tokenizer_3.pth"))
model.eval()

Transformer(
  (encoder): Encoder(
    (embed): Embedding(1210, 600)
    (pos_encoding): SinusoidEncoding()
    (dropout): Dropout(p=0.1, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x Block(
        (qkv): Linear(in_features=600, out_features=1800, bias=False)
        (mha): MultiHeadAttention(
          (fc_out): Linear(in_features=600, out_features=600, bias=False)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (fc_dropout): Dropout(p=0.1, inplace=False)
        )
        (ffwd): FeedForward(
          (0): Linear(in_features=600, out_features=2400, bias=False)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2400, out_features=600, bias=False)
          (3): Dropout(p=0.1, inplace=False)
        )
        (norm1): LayerNorm((600,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((600,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (decoder): Decoder(
    (embed): Embedding(1210, 600)
    (pos_encoding)

In [11]:
batch_size = 1
src = torch.zeros((batch_size, 100),dtype= torch.int64)
src_mask = torch.ones((batch_size, 1, 1, 100), dtype= torch.bool)
tgt = torch.zeros((batch_size, 99),dtype= torch.int64)
tgt_mask = torch.ones((batch_size, 1, 99, 99), dtype= torch.bool)
print(src, src_mask, tgt, tgt_mask)

print(src.shape, src_mask.shape, tgt.shape, tgt_mask.shape)
print(src.dtype, src_mask.dtype, tgt.dtype, tgt_mask.dtype)

torch_out = model(src, tgt, src_mask, tgt_mask)
torch_out.shape
torch_out

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]) tensor([[[[True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, Tr

tensor([[[-4.0089, -2.4957, -7.8957,  ..., -7.9997, -3.8218,  2.9767],
         [-4.0081, -2.4983, -7.8962,  ..., -7.9999, -3.8231,  2.9747],
         [-4.0067, -2.5006, -7.8970,  ..., -8.0007, -3.8239,  2.9733],
         ...,
         [-4.0073, -2.5028, -7.8980,  ..., -8.0025, -3.8190,  2.9798],
         [-4.0079, -2.5008, -7.8993,  ..., -8.0047, -3.8187,  2.9804],
         [-4.0080, -2.4980, -7.8998,  ..., -8.0056, -3.8176,  2.9818]]],
       grad_fn=<ViewBackward0>)

In [12]:
torch.onnx.export(model,               # model being run
                  (src,tgt,src_mask,tgt_mask),                         # model input (or a tuple for multiple inputs)
                  "model_1.onnx",   # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=17,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['source', 'target', 'source_mask', 'target_mask'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  dynamic_axes={'target' : {1: 'context_length'}, 
                                'target_mask' : {2: 'dim_mask',3: 'context_length'},
                                'output' : {1: 'context_length'}})

In [13]:
onnx_model = onnx.load("model_1.onnx")
onnx.checker.check_model(onnx_model)

In [14]:

ort_session = ort.InferenceSession("model_1.onnx", providers=["CPUExecutionProvider"])

def to_numpy(tensor):
    return tensor.detach().cpu().numpy()if tensor.requires_grad else tensor.cpu().numpy()

# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(src).astype('int64') ,
              ort_session.get_inputs()[1].name: to_numpy(tgt).astype('int64'),
              ort_session.get_inputs()[2].name: to_numpy(src_mask).astype('bool'),
              ort_session.get_inputs()[3].name: to_numpy(tgt_mask).astype('bool')}
ort_outs = ort_session.run(None, ort_inputs)
print(ort_outs[0])

# compare ONNX Runtime and PyTorch results
np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)

print("Exported model has been tested with ONNXRuntime, and the result looks good!")

    

[[[-4.0088654 -2.495728  -7.895707  ... -7.9997063 -3.8217719  2.9766543]
  [-4.0081205 -2.4983144 -7.8962245 ... -7.99987   -3.823112   2.9746547]
  [-4.0067267 -2.50055   -7.8969693 ... -8.000668  -3.8238568  2.973314 ]
  ...
  [-4.007272  -2.5028355 -7.897967  ... -8.002458  -3.8189769  2.9797707]
  [-4.007879  -2.5008245 -7.8992763 ... -8.004711  -3.818676   2.9803545]
  [-4.0080423 -2.4979782 -7.899762  ... -8.005637  -3.8176394  2.9818041]]]
Exported model has been tested with ONNXRuntime, and the result looks good!


In [15]:
quantize_dynamic("model_1.onnx", "model_1_quantized.onnx")



Ignore MatMul due to non constant B: /[/encoder/layers.0/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.0/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.1/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.1/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.2/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.2/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.3/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.3/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.4/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.4/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.5/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.5/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/decoder/layers.0/mmha/MatMul]
Ignore MatMul due to non constant B: /[/decoder/layers.0/mmha/MatMul_1]
Ignore MatMul due 

In [17]:

ort_session_2 = ort.InferenceSession("model_1_quantized.onnx")
tgt = torch.randint(0,1210,(batch_size, 1),dtype= torch.int64)
tgt_mask = torch.ones((batch_size, 1, 1, 1), dtype= torch.bool)
print(src, src_mask, tgt, tgt_mask)

def to_numpy(tensor):
    return tensor.detach().cpu().numpy()if tensor.requires_grad else tensor.cpu().numpy()

# compute ONNX Runtime output prediction
ort_inputs_2 = {ort_session_2.get_inputs()[0].name: to_numpy(src).astype('int64') ,
              ort_session_2.get_inputs()[1].name: to_numpy(tgt).astype('int64'),
              ort_session_2.get_inputs()[2].name: to_numpy(src_mask).astype('bool'),
              ort_session_2.get_inputs()[3].name: to_numpy(tgt_mask).astype('bool')}
ort_outs = ort_session_2.run(None, ort_inputs_2)
print(ort_outs[0])

# compare ONNX Runtime and PyTorch results
# np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)

print("Exported model has been tested with ONNXRuntime, and the result looks good!")

NoSuchFile: [ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from model_1_quantized.onnx failed:Load model model_1_quantized.onnx failed. File doesn't exist