#### ONNX Test Conversion and ONNXRuntime Test Run

In [3]:
# Create and export the model to ONNX format
!pip install onnx
!pip install onnxscript
import torch
import torch.nn as nn
import torch.nn.functional as F


# Squeeze-and-Excite Module
class SqueezeExcite(nn.Module):
    def __init__(self, input_channels, squeeze_factor=4):
        super(SqueezeExcite, self).__init__()
        squeeze_channels = input_channels // squeeze_factor
        self.fc1 = nn.Conv2d(input_channels, squeeze_channels, 1)
        self.fc2 = nn.Conv2d(squeeze_channels, input_channels, 1)

    def forward(self, x):
        scale = F.adaptive_avg_pool2d(x, 1)
        scale = F.relu(self.fc1(scale))
        scale = torch.sigmoid(self.fc2(scale))
        return x * scale

# Inverted Residual Block
class InvertedResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, expansion_factor, stride, use_se):
        super(InvertedResidualBlock, self).__init__()
        hidden_dim = in_channels * expansion_factor
        self.use_residual = stride == 1 and in_channels == out_channels

        self.expand = nn.Conv2d(in_channels, hidden_dim, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(hidden_dim)
        self.depthwise = nn.Conv2d(hidden_dim, hidden_dim, 3, stride=stride, padding=1, groups=hidden_dim, bias=False)
        self.bn2 = nn.BatchNorm2d(hidden_dim)
        self.se = SqueezeExcite(hidden_dim) if use_se else nn.Identity()
        self.project = nn.Conv2d(hidden_dim, out_channels, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        out = F.relu(self.bn1(self.expand(x)))
        out = F.relu(self.bn2(self.depthwise(out)))
        out = self.se(out)
        out = self.bn3(self.project(out))
        if self.use_residual:
            out = out + x
        return out

# MobileNetV3 Backbone
class MyModel(nn.Module):
    def __init__(self, num_classes=10):
        super(MyModel, self).__init__()
        self.stem = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1, bias=False),  # Initial stem conv layer
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True)
        )

        self.layers = nn.Sequential(
            InvertedResidualBlock(16, 16, expansion_factor=1, stride=1, use_se=False),
            InvertedResidualBlock(16, 24, expansion_factor=4, stride=2, use_se=False),
            InvertedResidualBlock(24, 24, expansion_factor=4, stride=1, use_se=False),
            InvertedResidualBlock(24, 40, expansion_factor=4, stride=2, use_se=True),
            InvertedResidualBlock(40, 40, expansion_factor=4, stride=1, use_se=True),
            InvertedResidualBlock(40, 80, expansion_factor=4, stride=2, use_se=False),
            InvertedResidualBlock(80, 80, expansion_factor=4, stride=1, use_se=False),
            InvertedResidualBlock(80, 112, expansion_factor=6, stride=1, use_se=True),
            InvertedResidualBlock(112, 160, expansion_factor=6, stride=2, use_se=True)
        )

        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(160, 1280, 1),
            nn.ReLU(inplace=True),
            nn.Flatten(),
            nn.Linear(1280, num_classes)
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.layers(x)
        x = self.classifier(x)
        return x

# torch_model = MyModel()
# torch_input = torch.randn(1, 1, 32, 32)
# onnx_program = torch.onnx.dynamo_export(torch_model, torch_input) # Export from torch to onnx model format



In [5]:
# Save the ONNX model in a file

# Define model and export the Torch model to onnx format
torch_model = MyModel()
torch_model.eval() # Set the model to evaluation mode
torch_input = torch.randn(1, 1, 32, 32)
onnx_program = torch.onnx.dynamo_export(torch_model, torch_input)
onnx_program.save("my_image_classifier.onnx")

# Import library and load from ONNX side
!pip install onnx
!pip install onnxscript
import onnx
onnx_model = onnx.load("my_image_classifier.onnx")
onnx.checker.check_model(onnx_model)

  new_node = self.module.graph.get_attr(normalized_name)




In [6]:
# Execute the ONNX model with ONNX Runtime
# !pip install onnxruntime
!pip install onnxruntime-gpu
import onnxruntime
onnx_input = onnx_program.adapt_torch_inputs_to_onnx(torch_input)
print(f"Input length: {len(onnx_input)}")
print(f"Sample input: {onnx_input}")

ort_session = onnxruntime.InferenceSession("./my_image_classifier.onnx", providers=['CPUExecutionProvider'])

def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)}
onnxruntime_outputs = ort_session.run(None, onnxruntime_input)


# Compare the PyTorch results with the ones from the ONNX Runtime
torch_outputs = torch_model(torch_input)
torch_outputs = onnx_program.adapt_torch_outputs_to_onnx(torch_outputs)

assert len(torch_outputs) == len(onnxruntime_outputs)
for torch_output, onnxruntime_output in zip(torch_outputs, onnxruntime_outputs):
    torch.testing.assert_close(torch_output, torch.tensor(onnxruntime_output))

print("\n\nPyTorch and ONNX Runtime output matched!")
print(f"\n\nPyTorch Output length: {len(torch_output)}")
print(f"PyTorch Sample Output: {torch_output}")
print(f"\n\nONNXRuntime Output length: {len(onnxruntime_outputs)}")
print(f"ONNXRuntime Sample output: {onnxruntime_outputs}")

Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime-gpu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime-gpu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime_gpu-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (226.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.2/226.2 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hIns

----
#### Model Runtime Test with 1000 Samples Iteration (Non-opt FP32, Opt FP32, Opt FP16 [Half-Floating])

In [7]:
input_shape = (1, 1, 32, 32)
input_data_for_inference = torch.randn(input_shape, dtype=torch.float32).cuda()
input_data_for_inference_fp16 = input_data_for_inference.to(dtype=torch.float16)

In [8]:
# @title Non-Optimized FP32 (Full Tensor)
import time

input_shape = (1, 1, 32, 32)
output_shape = (1, 10)

#>>>> batch runs session
print("Non-optimizing FP32 bench testing...")
#---Non-optimized---
nonopt_model = MyModel().cuda().eval()
num_iterations = 10000
total_time = 0.0
with torch.no_grad():
    for i in range(num_iterations):
        start_time = time.time()
        input_data = torch.randn(input_shape).cuda()
        output_data = nonopt_model(input_data)
        end_time = time.time()
        total_time += end_time - start_time
pytorch_fps = num_iterations / total_time
print(f"PyTorch FPS: {pytorch_fps:.2f}")



#---------Inference with FP32--------------
import onnxruntime as ort
# Load the ONNX model and run inference
# session = ort.InferenceSession('my_image_classifier_fp16.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# onnx_input = onnx_program.adapt_torch_inputs_to_onnx(torch_input)
# onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)}

output_data = nonopt_model(input_data_for_inference)

print(f"\n\nONNXRuntime [FP32] Output length: {len(output_data)}")
print(f"ONNXRuntime [FP32] Sample output: {output_data}")
print(f"ONNXRuntime [FP32] Sample output type: {output_data.dtype}")

Non-optimizing FP32 bench testing...
PyTorch FPS: 216.49


ONNXRuntime [FP32] Output length: 1
ONNXRuntime [FP32] Sample output: tensor([[-0.0054,  0.0065, -0.0290,  0.0145, -0.0296, -0.0340,  0.0021,  0.0024,
          0.0354, -0.0008]], device='cuda:0', grad_fn=<AddmmBackward0>)
ONNXRuntime [FP32] Sample output type: torch.float32


In [9]:
# @title Optimized FP32 (Full Tensor)
import os
os.environ["ALLOW_RELEASED_ONNX_OPSET_ONLY"] = "0"
import onnxruntime.backend as backend

print("Optimized model FP32 bench testing...")
# Create a engine from the ONNX model and measure inference speed
model_onnx = onnx.load('./my_image_classifier.onnx')
onnx_engine = backend.prepare(model_onnx, device='GPU')
num_iterations = 10000
total_time_fp32 = 0.0
print("start inferencing...FP32")
with torch.no_grad():
    for i in range(num_iterations):
        input_data = torch.randn(input_shape).cuda()
        start_time = time.time()
        output_data = onnx_engine.run(input_data.cpu().numpy())[0]
        end_time = time.time()
        total_time_fp32 += end_time - start_time
tensor_fps_fp32 = num_iterations /total_time_fp32
#tensor_fps = num_iterations / total_time
print(f"Tensor FPS [FP32]: {tensor_fps_fp32:.2f}")
print(f"Speedup: {tensor_fps_fp32/pytorch_fps:.2f}x")


#---------Inference with FP16--------------
import onnxruntime as ort
# Load the ONNX model and run inference
# session = ort.InferenceSession('my_image_classifier_fp16.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# onnx_input = onnx_program.adapt_torch_inputs_to_onnx(torch_input)
# onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)}

output_data = onnx_engine.run(input_data_for_inference.cpu().numpy())[0]

print(f"\n\nONNXRuntime [FP32] Output length: {len(output_data)}")
print(f"ONNXRuntime [FP32] Sample output: {output_data}")
print(f"ONNXRuntime [FP32] Sample output type: {output_data.dtype}")

Optimized model FP32 bench testing...
*************** EP Error ***************
EP Error /onnxruntime_src/onnxruntime/python/onnxruntime_pybind_state.cc:490 void onnxruntime::python::RegisterTensorRTPluginsAsCustomOps(PySessionOptions&, const onnxruntime::ProviderOptions&) Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported.
 when using ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
Falling back to ['CUDAExecutionProvider', 'CPUExecutionProvider'] and retrying.
****************************************
start inferencing...FP32
Tensor FPS [FP32]: 373.14
Speedup: 1.72x


ONNXRuntime [FP32] Output length: 1
ONNXRuntime [FP32] Sample output: [[-0.03523889  0.02945375  0.00534764  0.02429228  0.03596256 -0.00773814
   0.05146693  0.00391703 -0.00663549 -0.05681043]]
ONNXRuntime [FP32] Sample output type: float32


In [10]:
# @title Optimized FP16 (Half Tensor)
print("Optimized model FP16 bench testing...")

# Try using CUDAExecutionProvider and check if it's available
# ort_session = onnxruntime.InferenceSession("./my_image_classifier.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])

torch_model_fp16 = torch_model.half()
dummy_input_fp16 = torch.randn(input_shape, dtype=torch.float16)
input_names = ['input']
output_names = ['output']
torch.onnx.export(torch_model_fp16, dummy_input_fp16, './my_image_classifier_fp16.onnx', verbose=False, input_names=input_names, output_names=output_names)

#onnx_model_path_fp16 = f'outputs/{opt["compressed_directory"]}/compressed_student_net_fp16.onnx'
#session_fp16 = ort.InferenceSession(onnx_model_path_fp16, providers=providers)

# Create a engine from the ONNX model and measure inference speed
model_onnx_fp16 = onnx.load('./my_image_classifier_fp16.onnx')
onnx_engine_fp16 = backend.prepare(model_onnx_fp16, device='GPU', provider='CUDAExecutionProvider', float16=True)

num_iterations = 10000
total_time_fp16 = 0.0
print("start inferencing...FP16")
with torch.no_grad():
    for i in range(num_iterations):
        input_data = torch.randn(input_shape, dtype=torch.float16).cuda()
        start_time = time.time()
        output_data = onnx_engine_fp16.run(input_data.cpu().numpy())[0]
        end_time = time.time()
        total_time_fp16 += end_time - start_time
tensor_fps_fp16 = num_iterations /total_time_fp16
#tensor_fps = num_iterations / total_time
print(f"Tensor FPS [FP16]: {tensor_fps_fp16:.2f}")
print(f"Speedup: {tensor_fps_fp16/pytorch_fps:.2f}x")





#---------Inference with FP16--------------
import onnxruntime as ort
# Load the ONNX model and run inference
# session = ort.InferenceSession('my_image_classifier_fp16.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# onnx_input = onnx_program.adapt_torch_inputs_to_onnx(torch_input)
# onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)}
# input_data = torch.randn(input_shape, dtype=torch.float16).cuda()
output_data = onnx_engine_fp16.run(input_data_for_inference_fp16.cpu().numpy())[0]

print(f"\n\nONNXRuntime [FP16] Output length: {len(output_data)}")
print(f"ONNXRuntime [FP16] Sample output: {output_data}")
print(f"ONNXRuntime [FP16] Sample output type: {output_data.dtype}")

Optimized model FP16 bench testing...
*************** EP Error ***************
EP Error /onnxruntime_src/onnxruntime/python/onnxruntime_pybind_state.cc:490 void onnxruntime::python::RegisterTensorRTPluginsAsCustomOps(PySessionOptions&, const onnxruntime::ProviderOptions&) Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported.
 when using ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
Falling back to ['CUDAExecutionProvider', 'CPUExecutionProvider'] and retrying.
****************************************
start inferencing...FP16
Tensor FPS [FP16]: 413.97
Speedup: 1.91x


ONNXRuntime [FP16] Output length: 1
ONNXRuntime [FP16] Sample output: [[-0.03525   0.02946   0.005344  0.02429   0.03595  -0.007744  0.05145
   0.003918 -0.006638 -0.05682 ]]
ONNXRuntime [FP16] Sample output type: float16


In [11]:
# @title Optimized and Compressed (Quantization) to support INT8
import torch
import torch.quantization

#------------Quantize only in torch package---------------
"""
# Load your model
torch_model = MyModel()  # Your model loading code here

# Switch the model to evaluation mode
torch_model.eval()

# Fuse layers if necessary
model_int8 = torch.quantization.fuse_modules(torch_model, [['conv1', 'relu1'],
                                                           ['conv2', 'relu2']])

# Apply quantization transformations
model_int8.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model_int8, inplace=True)


# Convert the model to a quantized version
torch.quantization.convert(model_int8, inplace=True)

# Save the quantized model
torch.save(model_int8.state_dict(), 'quantized_model_int8.pth')
"""
#---------Quantize and convert to ONNX format--------------
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

# Load the full precision ONNX model
# model_fp32 = onnx.load('full_precision_model.onnx')

# Apply dynamic quantization to INT8
model_int8 = quantize_dynamic(
    'my_image_classifier.onnx',
    'my_quantized_classifier_int8.onnx',
    weight_type=QuantType.QInt8  # You can use QuantType.QUInt8 for unsigned INT8
)
#---------------------------------------------------------


















# Create a engine from the ONNX model and measure inference speed
model_onnx_int8 = onnx.load('./my_quantized_classifier_int8.onnx')
onnx_engine_int8 = backend.prepare(model_onnx_int8, device='GPU', provider='CUDAExecutionProvider', float16=True)

# Run a calibration step to collect statistics
num_iterations = 10000
total_time_int8 = 0.0
with torch.no_grad():
    for i in range(num_iterations):
        input_data = torch.randn(input_shape, dtype=torch.float32).cuda()
        start_time = time.time()
        output_data = onnx_engine_int8.run(input_data.cpu().numpy())[0]
        end_time = time.time()
        total_time_int8 += end_time - start_time
tensor_fps_int8 = num_iterations /total_time_int8
#tensor_fps = num_iterations / total_time
print(f"Tensor FPS [INT8]: {tensor_fps_int8:.2f}")
print(f"Speedup: {tensor_fps_int8/pytorch_fps:.2f}x")

#---------Inference with INT8--------------
import onnxruntime as ort
# Load the ONNX model

# Load the ONNX model and run inference
# session = ort.InferenceSession('my_quantized_classifier_int8.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# input_data = torch.randn(input_shape, dtype=torch.float32).cpu().numpy()
# output_quan_oxrun = session.run(None, {'l_x_': onnxruntime_input})[0]
# onnx_input = onnx_program.adapt_torch_inputs_to_onnx(torch_input)
# onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)}
# output_quan_oxrun = session.run(None, onnxruntime_input)

# input_data = torch.randn(input_shape, dtype=torch.float32).cuda()
output_quan_oxrun = onnx_engine_int8.run(input_data_for_inference.cpu().numpy())[0]

print(f"\n\nONNXRuntime [INT8] Output length: {len(output_quan_oxrun)}")
print(f"ONNXRuntime [INT8] Sample output: {output_quan_oxrun}")
print(f"ONNXRuntime [INT8] Sample output type: {output_quan_oxrun.dtype}")



*************** EP Error ***************
EP Error /onnxruntime_src/onnxruntime/python/onnxruntime_pybind_state.cc:490 void onnxruntime::python::RegisterTensorRTPluginsAsCustomOps(PySessionOptions&, const onnxruntime::ProviderOptions&) Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported.
 when using ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
Falling back to ['CUDAExecutionProvider', 'CPUExecutionProvider'] and retrying.
****************************************
Tensor FPS [INT8]: 406.70
Speedup: 1.88x


ONNXRuntime [INT8] Output length: 1
ONNXRuntime [INT8] Sample output: [[-0.03523889  0.02945375  0.00534764  0.02429228  0.03596256 -0.00773814
   0.05146693  0.00391703 -0.00663549 -0.05681043]]
ONNXRuntime [INT8] Sample output type: float32


# Homework
ใช้ MobileNetV3 backbone แปลงโมเดลจาก Torch file to ONNX format และเร่งการทำงานโดยใช้ ONNXruntime สำหรับการอนุมานด้วยข้อมูลความละเอียดแบบ FP32, FP16, และ INT8\
เปรียบเทียบ FPS และ Speedup ก่อนและหลังใช้ ONNXruntime อนุมาน