# Welcome To the ONNX Runtime Tutorial!

This notebook is designed to demonstrate the features of the ONNXRT passes integrated into MASE as part of the MASERT framework.

## Section 1. INT8 Quantization
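First, we will show you how to apply INT8 quantization to a simple model, `jsc-toy`, and compare the quantized model with the original model using the `Machop API`. The process is split into the following stages, each of which uses its own pass and is demonstrated in the cells below:

1. Load the TOML configuration, build the data module and model, and create the `MaseGraph`.
2. Initialise and populate the graph metadata with the metadata analysis passes.
3. Export the model to ONNX and quantize it with `onnx_runtime_transform_pass`.
4. Measure accuracy, latency and power for the original and quantized models with `runtime_analysis_pass`.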



In [1]:
import sys
import os
from pathlib import Path
import toml
from copy import copy, deepcopy

# Locate the machop directory three levels above the current working directory
# and fail early with a clear message if it is not there.
machop_path = Path(".").resolve().parent.parent.parent / "machop"
assert machop_path.exists(), "Failed to find machop at: {}".format(machop_path)

# Make `chop` importable. Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if str(machop_path) not in sys.path:
    sys.path.append(str(machop_path))

# Add directory to the PATH so that chop can be called.
# Guarded against duplicates on re-run, and tolerant of PATH being unset.
new_path = "../../../machop"
full_path = os.path.abspath(new_path)
current_path = os.environ.get('PATH', '')
if full_path not in current_path.split(os.pathsep):
    os.environ['PATH'] = current_path + os.pathsep + full_path if current_path else full_path

from chop.tools.utils import to_numpy_if_tensor
from chop.tools.logger import set_logging_verbosity
from chop.tools import get_cf_args, get_dummy_input
from chop.passes.graph.utils import deepcopy_mase_graph
from chop.tools.get_input import InputGenerator
from chop.tools.checkpoint_load import load_model
from chop.ir import MaseGraph
from chop.models import get_model_info, get_model, get_tokenizer
from chop.dataset import MaseDataModule, get_dataset_info
from chop.passes.graph.transforms import metadata_value_type_cast_transform_pass
from chop.passes.graph import (
    summarize_quantization_analysis_pass,
    add_common_metadata_analysis_pass,
    init_metadata_analysis_pass,
    add_software_metadata_analysis_pass,
    onnx_runtime_transform_pass,
    runtime_analysis_pass,
    )

# Set chop's logging verbosity to "info".
set_logging_verbosity("info")

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-20 11:53:05,861] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[32mINFO    [0m [34mSet logging level to info[0m
I0320 11:53:07.422378 139711941265216 logger.py:44] Set logging level to info


In [2]:
# Path to the ONNX Runtime TOML configuration used by this tutorial
toml_file_path = "../../../machop/configs/onnx/jsc_pl2ort.toml"

# Read the TOML file into a Python dictionary. TOML documents are UTF-8 by
# specification, so decode explicitly instead of relying on the locale default.
with open(toml_file_path, 'r', encoding='utf-8') as toml_file:
    pass_args = toml.load(toml_file)

# Extract the 'passes.onnxruntime' section and its children
# (falls back to an empty dict when the section is absent)
onnx_config = pass_args.get('passes', {}).get('onnxruntime', {})
# Extract the 'passes.runtime_analysis' section and its children
runtime_analysis_config = pass_args.get('passes', {}).get('runtime_analysis', {})

# Pull the top-level run settings out of the parsed TOML in one step.
# A missing key raises KeyError.
(
    model_name,
    dataset_name,
    max_epochs,
    batch_size,
    learning_rate,
    accelerator,
) = [
    pass_args[setting]
    for setting in ('model', 'dataset', 'max_epochs', 'batch_size', 'learning_rate', 'accelerator')
]

# Build the data module for the configured dataset/model pair, then run its
# prepare and setup steps so it is ready to be handed to the passes.
data_module_kwargs = dict(
    name=dataset_name,
    model_name=model_name,
    batch_size=batch_size,
    num_workers=0,
)
data_module = MaseDataModule(**data_module_kwargs)
data_module.prepare_data()
data_module.setup()

# The TOML uses 'gpu' while the pass configs are given 'cuda'; translate once.
pass_accelerator = 'cuda' if pass_args['accelerator'] == 'gpu' else pass_args['accelerator']

# Check the translated value: 'gpu' has already been mapped to 'cuda', so a
# comparison against 'gpu' could never match and the variable would never be set.
# NOTE(review): CUDA_MODULE_LOADING is expected to take effect only if set
# before CUDA is initialised in this process — confirm nothing initialises it earlier.
if pass_accelerator == 'cuda':
    os.environ['CUDA_MODULE_LOADING'] = 'LAZY'

# Add the data_module and other necessary information to the configs
configs = [onnx_config, runtime_analysis_config]
for config in configs:
    config['task'] = pass_args['task']
    config['batch_size'] = pass_args['batch_size']
    config['model'] = pass_args['model']
    config['data_module'] = data_module
    config['accelerator'] = pass_accelerator

# Look up the model's description and instantiate it without pretrained weights.
# NOTE(review): task is hard-coded to "cls" here while the pass configs take
# pass_args['task'] — confirm the two are meant to agree.
model_info = get_model_info(model_name)
model = get_model(model_name, pretrained=False, task="cls", dataset_info=data_module.dataset_info)

# Batches are drawn from the training dataloader.
input_generator = InputGenerator(
    which_dataloader="train",
    task="cls",
    model_info=model_info,
    data_module=data_module,
)

# Wrap the model in a MaseGraph; node metadata is initialised by later passes.
mg = MaseGraph(model=model)

In [3]:
# Optional: load a trained checkpoint (change the path accordingly).
# NOTE(review): while this stays commented out the model is created with
# pretrained=False and never trained, so the accuracy figures reported by the
# analysis cells below are presumably those of an untrained model.
# CHECKPOINT_PATH = "../../../mase_output/jsc-toy_classification_jsc_2024-03-17/software/training_ckpts/best.ckpt"

# model = load_model(load_name=CHECKPOINT_PATH, load_type="pl", model=model)

# Take one batch from the input generator and run a single forward pass on it.
# The output is discarded (presumably a sanity check — TODO confirm).
dummy_in = next(iter(input_generator))
_ = model(**dummy_in)
# Initiate metadata on the graph.
mg, _ = init_metadata_analysis_pass(mg, None)

# Snapshot of the graph taken after metadata initialisation but before the
# common/software metadata passes run.
# NOTE(review): mg_original is not referenced again in the visible cells.
mg_original = deepcopy_mase_graph(mg)

# Populate metadata using the sample batch, add software metadata, then apply
# to_numpy_if_tensor to the metadata values via the type-cast pass.
mg, _ = add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in})
mg, _ = add_software_metadata_analysis_pass(mg, None)
mg, _ = metadata_value_type_cast_transform_pass(mg, pass_args={"fn": to_numpy_if_tensor})

In [4]:
# Run the ONNX Runtime transform pass with the 'passes.onnxruntime' config.
# The returned onnx_meta dict is read by later cells for the
# 'onnx_dynamic_quantized_path' and 'onnx_static_quantized_path' entries.
mg, onnx_meta = onnx_runtime_transform_pass(mg, pass_args=onnx_config)

[32mINFO    [0m [34mConverting PyTorch model to ONNX...[0m
I0320 11:53:11.919002 139711941265216 onnx_runtime.py:51] Converting PyTorch model to ONNX...
[32mINFO    [0m [34mONNX Conversion Complete. Stored ONNX model to /root/mase/mase_output/onnx_runtime/unquantized/2024_03_20/version_13/model.onnx[0m
I0320 11:53:12.055048 139711941265216 onnx_runtime.py:69] ONNX Conversion Complete. Stored ONNX model to /root/mase/mase_output/onnx_runtime/unquantized/2024_03_20/version_13/model.onnx
[32mINFO    [0m [34mONNX Model Summary: 
+-------+----------------------------------+--------------------+--------------------------------------------------------------------------------------------------------------------------+-------------------------------------------+---------------------+
| Index |               Name               |        Type        |                                                          Inputs                                                          |               

W0320 11:53:38.699918 139711941265216 quantize.py:540] Please consider pre-processing before quantization. See https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification/cpu/ReadMe.md 
[32mINFO    [0m [34mQuantization complete. Model is now dynamically quantized.[0m
I0320 11:53:38.700979 139711941265216 quantize.py:59] Quantization complete. Model is now dynamically quantized.


In [None]:
# Baseline: run the runtime analysis on the MaseGraph itself.
# NOTE(review): this analyses `mg` as returned by the ONNX transform pass, not
# the earlier `mg_original` snapshot — confirm which one is the intended baseline.
mg, _ = runtime_analysis_pass(mg, pass_args=runtime_analysis_config)

[32mINFO    [0m [34mStarting transformation analysis on jsc-toy[0m
I0320 09:13:54.422588 139697046427456 analysis.py:233] Starting transformation analysis on jsc-toy
[32mINFO    [0m [34m
Results jsc-toy:
+------------------------------+---------------+
|            Metric            |     Value     |
+------------------------------+---------------+
|    Average Test Accuracy     |    0.15471    |
|      Average Precision       |    0.14226    |
|        Average Recall        |    0.16118    |
|       Average F1 Score       |    0.14833    |
|         Average Loss         |    1.8961     |
|       Average Latency        |   1.4385 ms   |
|   Average GPU Power Usage    |   16.857 W    |
| Inference Energy Consumption | 0.0067358 mWh |
+------------------------------+---------------+[0m
I0320 09:13:55.835249 139697046427456 analysis.py:347] 
Results jsc-toy:
+------------------------------+---------------+
|            Metric            |     Value     |
+-------------------------

In [6]:
# Run the runtime analysis on the dynamically quantized ONNX model.
# The pass is given a model path here rather than a MaseGraph, so its first
# return value is discarded instead of being bound to `mg`. This keeps `mg`
# referring to the MaseGraph if earlier cells are re-run.
_, _ = runtime_analysis_pass(onnx_meta['onnx_dynamic_quantized_path'], pass_args=runtime_analysis_config)

[32mINFO    [0m [34mStarting transformation analysis on jsc-toy-onnx[0m
I0320 09:13:56.434021 139697046427456 analysis.py:233] Starting transformation analysis on jsc-toy-onnx


[32mINFO    [0m [34m
Results jsc-toy-onnx:
+------------------------------+---------------+
|            Metric            |     Value     |
+------------------------------+---------------+
|    Average Test Accuracy     |    0.20404    |
|      Average Precision       |   0.042026    |
|        Average Recall        |    0.20461    |
|       Average F1 Score       |   0.069729    |
|         Average Loss         |    1.6203     |
|       Average Latency        |  0.36442 ms   |
|   Average GPU Power Usage    |   21.985 W    |
| Inference Energy Consumption | 0.0022255 mWh |
+------------------------------+---------------+[0m
I0320 09:13:57.393121 139697046427456 analysis.py:347] 
Results jsc-toy-onnx:
+------------------------------+---------------+
|            Metric            |     Value     |
+------------------------------+---------------+
|    Average Test Accuracy     |    0.20404    |
|      Average Precision       |   0.042026    |
|        Average Recall        |    0.2

In [5]:
# Run the runtime analysis on the statically quantized ONNX model.
# The pass is given a model path here rather than a MaseGraph, so its first
# return value is discarded instead of being bound to `mg`. This keeps `mg`
# referring to the MaseGraph if earlier cells are re-run.
_, _ = runtime_analysis_pass(onnx_meta['onnx_static_quantized_path'], pass_args=runtime_analysis_config)

[32mINFO    [0m [34mStarting transformation analysis on jsc-toy-onnx[0m
I0320 11:54:09.398242 139711941265216 analysis.py:224] Starting transformation analysis on jsc-toy-onnx
[32mINFO    [0m [34m
Results jsc-toy-onnx:
+------------------------------+----------------+
|            Metric            |     Value      |
+------------------------------+----------------+
|    Average Test Accuracy     |    0.16166     |
|      Average Precision       |    0.064334    |
|        Average Recall        |    0.15987     |
|       Average F1 Score       |    0.090376    |
|         Average Loss         |     1.6116     |
|       Average Latency        |   0.18648 ms   |
|   Average GPU Power Usage    |    12.917 W    |
| Inference Energy Consumption | 0.00066911 mWh |
+------------------------------+----------------+[0m
I0320 11:54:10.146393 139711941265216 analysis.py:340] 
Results jsc-toy-onnx:
+------------------------------+----------------+
|            Metric            |     Value