In [1]:
# Some standard imports
import numpy as np

from torch import nn
import torch.onnx
from new_transformer import Transformer
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

device = torch.device("cpu")
model = Transformer(
    vocab_size=1210,
    n_head=6,
    embed_size=600,
    context_length=100,
    dropout=0.1,
    num_layers=6,
    device=device,
)
model.load_state_dict(torch.load("saved_model_tokenizer_3.pth"))
model.eval()

Transformer(
  (encoder): Encoder(
    (embed): Embedding(1210, 600)
    (pos_encoding): SinusoidEncoding()
    (dropout): Dropout(p=0.1, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x Block(
        (qkv): Linear(in_features=600, out_features=1800, bias=False)
        (mha): MultiHeadAttention(
          (fc_out): Linear(in_features=600, out_features=600, bias=False)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (fc_dropout): Dropout(p=0.1, inplace=False)
        )
        (ffwd): FeedForward(
          (0): Linear(in_features=600, out_features=2400, bias=False)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2400, out_features=600, bias=False)
          (3): Dropout(p=0.1, inplace=False)
        )
        (norm1): LayerNorm((600,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((600,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (decoder): Decoder(
    (embed): Embedding(1210, 600)
    (pos_encoding)

In [3]:
batch_size = 1
src = torch.zeros((batch_size, 100),dtype= torch.int64)
src_mask = torch.ones((batch_size, 1, 1, 100), dtype= torch.bool)
tgt = torch.zeros((batch_size, 100),dtype= torch.int64)
tgt_mask = torch.ones((batch_size, 1, 100, 100), dtype= torch.bool)
print(src, src_mask, tgt, tgt_mask)

print(src.shape, src_mask.shape, tgt.shape, tgt_mask.shape)
print(src.dtype, src_mask.dtype, tgt.dtype, tgt_mask.dtype)

torch_out = model(src, tgt, src_mask, tgt_mask)
torch_out.shape
torch_out

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]) tensor([[[[True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, Tr

tensor([[[-2.2270, -9.7406, -7.8299,  ..., -2.8887, -3.0761, -3.3137],
         [-2.2272, -9.7407, -7.8299,  ..., -2.8872, -3.0742, -3.3164],
         [-2.2256, -9.7385, -7.8286,  ..., -2.8864, -3.0726, -3.3164],
         ...,
         [-2.2209, -9.7404, -7.8287,  ..., -2.8924, -3.0778, -3.3111],
         [-2.2215, -9.7414, -7.8283,  ..., -2.8932, -3.0797, -3.3067],
         [-2.2239, -9.7431, -7.8289,  ..., -2.8939, -3.0803, -3.3043]]],
       grad_fn=<ViewBackward0>)

In [4]:
torch.onnx.export(model,               # model being run
                  (src,tgt,src_mask,tgt_mask),                         # model input (or a tuple for multiple inputs)
                  "model_1.onnx",   # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=17,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['source', 'target', 'source_mask', 'target_mask'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  dynamic_axes={'target' : {1: 'context_length'}, 
                                'target_mask' : {2: 'dim_mask',3: 'context_length'},
                                'output' : {1: 'context_length'}})

In [5]:
onnx_model = onnx.load("model_1.onnx")
onnx.checker.check_model(onnx_model)

In [6]:

ort_session = ort.InferenceSession("model_1.onnx", providers=["CPUExecutionProvider"])

def to_numpy(tensor):
    return tensor.detach().cpu().numpy()if tensor.requires_grad else tensor.cpu().numpy()

# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(src).astype('int64') ,
              ort_session.get_inputs()[1].name: to_numpy(tgt).astype('int64'),
              ort_session.get_inputs()[2].name: to_numpy(src_mask).astype('bool'),
              ort_session.get_inputs()[3].name: to_numpy(tgt_mask).astype('bool')}
ort_outs = ort_session.run(None, ort_inputs)
print(ort_outs[0])

# compare ONNX Runtime and PyTorch results
np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)

print("Exported model has been tested with ONNXRuntime, and the result looks good!")

    

[[[-2.2269676 -9.740591  -7.829893  ... -2.8886979 -3.0761008 -3.3136947]
  [-2.2272356 -9.740738  -7.8298564 ... -2.88724   -3.0742009 -3.3163526]
  [-2.225567  -9.738461  -7.8286386 ... -2.8863773 -3.0725954 -3.3164423]
  ...
  [-2.2209232 -9.74035   -7.828696  ... -2.892378  -3.0777695 -3.3110888]
  [-2.2214565 -9.741371  -7.82825   ... -2.8931556 -3.0797226 -3.3066583]
  [-2.223932  -9.743134  -7.8289456 ... -2.8939233 -3.0802665 -3.3042738]]]
Exported model has been tested with ONNXRuntime, and the result looks good!


In [7]:
quantize_dynamic("model_1.onnx", "model_1_quantized.onnx")



Ignore MatMul due to non constant B: /[/encoder/layers.0/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.0/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.1/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.1/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.2/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.2/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.3/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.3/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.4/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.4/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/encoder/layers.5/mha/MatMul]
Ignore MatMul due to non constant B: /[/encoder/layers.5/mha/MatMul_1]
Ignore MatMul due to non constant B: /[/decoder/layers.0/mmha/MatMul]
Ignore MatMul due to non constant B: /[/decoder/layers.0/mmha/MatMul_1]
Ignore MatMul due 

In [9]:

ort_session_2 = ort.InferenceSession("model_1_quantized.onnx")
tgt = torch.randint(0,1210,(batch_size, 100),dtype= torch.int64)
tgt_mask = torch.ones((batch_size, 1, 100, 100), dtype= torch.bool)
print(src, src_mask, tgt, tgt_mask)

def to_numpy(tensor):
    return tensor.detach().cpu().numpy()if tensor.requires_grad else tensor.cpu().numpy()

# compute ONNX Runtime output prediction
ort_inputs_2 = {ort_session_2.get_inputs()[0].name: to_numpy(src).astype('int64') ,
              ort_session_2.get_inputs()[1].name: to_numpy(tgt).astype('int64'),
              ort_session_2.get_inputs()[2].name: to_numpy(src_mask).astype('bool'),
              ort_session_2.get_inputs()[3].name: to_numpy(tgt_mask).astype('bool')}
ort_outs = ort_session_2.run(None, ort_inputs_2)
print(ort_outs[0])

# compare ONNX Runtime and PyTorch results
np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)

print("Exported model has been tested with ONNXRuntime, and the result looks good!")

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]) tensor([[[[True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, Tr

AssertionError: 
Not equal to tolerance rtol=0.001, atol=1e-05

Mismatched elements: 120186 / 121000 (99.3%)
Max absolute difference: 3.167286
Max relative difference: 23548.799
 x: array([[[-2.226967, -9.740591, -7.829891, ..., -2.888696, -3.076103,
         -3.313692],
        [-2.227233, -9.740733, -7.829851, ..., -2.887239, -3.074201,...
 y: array([[[ -2.270726, -10.39834 ,  -8.164363, ...,  -2.750236,
          -3.031729,  -2.566596],
        [ -2.350307, -10.018869,  -8.134374, ...,  -2.591075,...