### Initialize the model, dataset, and MASE graph


In [2]:
import sys
import logging
import os
from pathlib import Path
from pprint import pprint as pp

# Locate the machop package two directory levels above the current working
# directory and add it to sys.path so the `chop` imports below can resolve.
working_dir = Path(".").resolve()
machop_path = working_dir.parent.parent / "machop"
assert machop_path.exists(), "Failed to find machop at: {}".format(machop_path)
sys.path.append(str(machop_path))

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch_tensorrt

from torch.utils.tensorboard import SummaryWriter

import pytorch_quantization
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import quant_modules
from pytorch_quantization.tensor_quant import QuantDescriptor
from pytorch_quantization import calib
from tqdm import tqdm

print(pytorch_quantization.__version__)

from chop.dataset import MaseDataModule, get_dataset_info
from chop.tools.logger import set_logging_verbosity

from chop.tools import get_cf_args, get_dummy_input, load_config
from chop.passes.graph import (
    save_node_meta_param_interface_pass,
    report_node_meta_param_analysis_pass,
    profile_statistics_analysis_pass,
    add_common_metadata_analysis_pass,
    init_metadata_analysis_pass,
    add_software_metadata_analysis_pass,
    quantize_tensorrt_transform_pass,
    test_quantize_tensorrt_transform_pass,
    quantization_aware_training_pass,
    graph_calibration_pass,
    evaluate_pytorch_model_pass,
    graph_to_trt_pass,
    mixed_precision_transform_pass,
    test_trt_engine
)
from chop.tools.get_input import InputGenerator
from chop.tools.checkpoint_load import load_model
from chop.ir import MaseGraph

from chop.models import get_model_info, get_model, get_tokenizer

set_logging_verbosity("info")


2.2.1


[32mINFO    [0m [34mSet logging level to info[0m
I0329 01:21:51.331937 140654131185472 logger.py:44] Set logging level to info


In [3]:
# --- Experiment configuration ------------------------------------------------
batch_size = 16
model_name = "vgg7"
dataset_name = "cifar10"

# Alternative configuration (uncomment to use instead of the values above):
# batch_size = 1
# model_name = "facebook/opt-125m:patched"
# dataset_name = "cola"


# Build the data module and run its prepare/setup steps.
data_module = MaseDataModule(
    name=dataset_name,
    batch_size=batch_size,
    model_name=model_name,
    num_workers=0,
)
data_module.prepare_data()
data_module.setup()

# 📝️ change this CHECKPOINT_PATH to the one you trained in Lab1
# The path can also be supplied through the MASE_CHECKPOINT_PATH environment
# variable, so the notebook runs on other machines without being edited.
# An unset or empty variable falls back to the default below.
DEFAULT_CHECKPOINT_PATH = "/home/qizhu/Desktop/Work/mase/mase_output/test-accu-0.9332.ckpt"
# DEFAULT_CHECKPOINT_PATH = "/home/qizhu/Desktop/Work/mase/mase_output/opt125.ckpt"
CHECKPOINT_PATH = os.environ.get("MASE_CHECKPOINT_PATH") or DEFAULT_CHECKPOINT_PATH

# Fail early with a clear message rather than somewhere inside the loader.
assert Path(CHECKPOINT_PATH).exists(), "Failed to find checkpoint at: {}".format(
    CHECKPOINT_PATH
)

model_info = get_model_info(model_name)
# quant_modules.initialize()
model = get_model(
    model_name,
    task="cls",
    dataset_info=data_module.dataset_info,
    pretrained=False,
)

# load_type="pl" loads a PyTorch Lightning checkpoint into the freshly built model.
model = load_model(load_name=CHECKPOINT_PATH, load_type="pl", model=model)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


[32mINFO    [0m [34mLoaded pytorch lightning checkpoint from /home/qizhu/Desktop/Work/mase/mase_output/test-accu-0.9332.ckpt[0m
I0329 01:21:55.566599 140654131185472 checkpoint_load.py:85] Loaded pytorch lightning checkpoint from /home/qizhu/Desktop/Work/mase/mase_output/test-accu-0.9332.ckpt


In [5]:
# Wrap the loaded model in MASE graphs. `mg` is the graph that the metadata
# passes below operate on; `ori_mg` is a second graph built from the same
# model (presumably kept as an untransformed reference — it is not used in
# the cells visible here).
mg = MaseGraph(model=model)
ori_mg = MaseGraph(model=model)

# Input generator that draws batches from the training dataloader.
input_generator = InputGenerator(
    data_module=data_module,
    model_info=model_info,
    task="cls",
    which_dataloader="train",
)

# Pull one batch to use as the dummy input for the metadata analysis.
dummy_in = next(iter(input_generator))
# _ = model(**dummy_in)

# Run the metadata analysis passes in order; each one returns the updated
# graph plus a second value that is discarded here.
for _metadata_pass, _metadata_args in (
    (init_metadata_analysis_pass, None),
    (add_common_metadata_analysis_pass, {"dummy_in": dummy_in}),
    (add_software_metadata_analysis_pass, None),
):
    mg, _ = _metadata_pass(mg, _metadata_args)

### 

### Quantization pass based on pytorch-quantization

To perform quantization-aware training (QAT), we use fake quantization to simulate the effect of quantization during training. Set the `FakeQuantize` argument to `True` in a layer's config to enable it.

Here is an example of `pass_args`. Layers are selected by name, which gives layer-wise PTQ, and for each layer we can choose the calibration method and the precision.

In [6]:
## fake quantize the graph
def _int8_fake_quant_config(input_calibrator="max", weight_calibrator="max"):
    """Build the per-layer fake-quantization config used for every quantized layer.

    A new nested dict is created on each call, so layer entries never share
    state with one another.

    Args:
        input_calibrator: value stored under config["input"]["calibrator"].
        weight_calibrator: value stored under config["weight"]["calibrator"].

    Returns:
        A dict of the form {"config": {...}} suitable as one named entry of
        ``pass_args_mixed_precision``.
    """
    return {
        "config": {
            "FakeQuantize": True,
            "name": "int",
            "input": {
                # NOTE(review): the key is spelled "precesion" (sic). It is kept
                # unchanged because the pass that reads it is not visible here —
                # confirm the expected spelling before renaming it.
                "precesion": 8,
                "calibrator": input_calibrator,
                "quantize_axis": None,
            },
            # No precision key is given for weights; the recorded output of
            # this cell reports 8bit weight quantizers.
            "weight": {
                "calibrator": weight_calibrator,
                "quantize_axis": None,
            },
        }
    }


# Layers (selected by name) that receive the fake-quantization config.
# classifier_1, classifier_2 and classifier_3 are deliberately left out; add
# their names here to quantize them too (classifier_1 was previously tried
# with input_calibrator="histogram").
QUANTIZED_LAYER_NAMES = [
    "feature_layers_0",
    "feature_layers_3",
    "feature_layers_7",
    "feature_layers_10",
    "feature_layers_14",
    "feature_layers_17",
    "classifier_0",
]

pass_args_mixed_precision = {
    "by": "name",
    # Layers not listed by name fall back to this entry.
    "default": {"config": {"name": None}},
    **{
        layer_name: _int8_fake_quant_config()
        for layer_name in QUANTIZED_LAYER_NAMES
    },
}

# Calibration settings passed to the mixed-precision pass together with the
# per-layer quantization config.
# NOTE(review): "calibrator" is an empty string here, while each layer config
# names its own calibrator — confirm how the pass treats an empty value.
pass_args_calibrate = dict(
    calibrator="",
    percentiles=[99],
    data_module=data_module,
    num_batches=100,
)


# Rebinds `mg` to whatever the pass returns.
mg = mixed_precision_transform_pass(
    mg,
    pass_args_mixed_precision,
    pass_args_calibrate,
)

W0329 01:22:13.052164 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.052608 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.053060 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.053664 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.054681 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.055213 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.055827 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.056388 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.056849 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.057899 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.058561 140654131185472 tensor_quantizer.py:184] Disable MaxCalibrator
W0329 01:22:13.059053 140654131185472 tensor_quantizer.py:184] Di

feature_layers.0._input_quantizer       : TensorQuantizer(8bit fake per-tensor amax=4.4919 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.0._weight_quantizer      : TensorQuantizer(8bit fake per-tensor amax=0.5699 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.3._input_quantizer       : TensorQuantizer(8bit fake per-tensor amax=21.7810 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.3._weight_quantizer      : TensorQuantizer(8bit fake per-tensor amax=0.5960 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.7._input_quantizer       : TensorQuantizer(8bit fake per-tensor amax=9.4764 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.7._weight_quantizer      : TensorQuantizer(8bit fake per-tensor amax=0.7710 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.10._input_quantizer      : TensorQuantizer(8bit fake per-tensor amax=8.5900 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.10._weight_quantizer     : TensorQuantizer(8bit fake 

In [7]:
# quantization aware training (QAT)
# NOTE(review): the recorded run of this cell ended with
# "RuntimeError: Event device type CUDA does not match blocking stream's
# device type CPU." — verify that the model and the data are on the same
# device before relying on this step.
pass_args = dict(
    dataset=data_module,
    learning_rate=0.001,
    max_iter=100,
)

quantization_aware_training_pass(mg, pass_args)

RuntimeError: Event device type CUDA does not match blocking stream's device type CPU.

### Calibration

After quantizing the model, we need to calibrate it to obtain the amax value for each layer. We can use the following pass_args to control the calibration process, including the method, the number of samples, and the calibration dataset.

### Evaluate


In [15]:
# Evaluate the current graph using the data module; the recorded output
# reports the average per-batch execution time and the total accuracy.
pass_args_eval = {"data_module": data_module}

mg = evaluate_pytorch_model_pass(mg, pass_args_eval)

Average execute time for one batch: 2.96ms
Total accuracy: 91.99%


### Model export

We first need to export the model to ONNX format, and then build a TensorRT engine from it.

In [16]:
# Using tensor quantization: export the graph to ONNX, build a TensorRT
# engine from it, then test the saved engine on the test dataloader.
trt_engine_file = "engine_a_3_1.plan"
pass_args = {
    "onnxFile": "onnx_a_3_1.onnx",
    "engineFile": trt_engine_file,
    # The dataloader is passed as the bound method; it is not called here.
    "dataloader": data_module.test_dataloader,
}
mg = graph_to_trt_pass(mg, pass_args)
test_trt_engine(trt_engine_file, data_module.test_dataloader)


verbose: False, log level: Level.ERROR

Succeeded finding ONNX file!
Succeeded parsing .onnx file!
Succeeded building engine!
engine.__len__() = 2
engine.__sizeof__() = 56
engine.__str__() = <tensorrt_bindings.tensorrt.ICudaEngine object at 0x7f2e5a3caef0>

inspector.execution_context= None
inspector.error_recorder= None
Engine information:
Layers:
/feature_layers.0/_input_quantizer/QuantizeLinear
feature_layers.0.weight + /feature_layers.0/_weight_quantizer/QuantizeLinear + /feature_layers.0/Conv
feature_layers.3.weight + /feature_layers.3/_weight_quantizer/QuantizeLinear + /feature_layers.3/Conv
/feature_layers.6/MaxPool
feature_layers.7.weight + /feature_layers.7/_weight_quantizer/QuantizeLinear + /feature_layers.7/Conv
feature_layers.10.weight + /feature_layers.10/_weight_quantizer/QuantizeLinear + /feature_layers.10/Conv
/feature_layers.13/MaxPool
feature_layers.14.weight + /feature_layers.14/_weight_quantizer/QuantizeLinear + /feature_layers.14/Conv
feature_layers.17.weight + /fe

### Quantization pass based on TensorRT

In [17]:
# Arguments for the TensorRT quantization pass.
#   precision       - target precision of the engine ("int8" here)
#   nCalibration    - presumably the number of calibration iterations — TODO confirm
#   dummy_in        - the sample batch taken from the input generator earlier
#   input_generator - the input generator for feeding data to the model
#   onnxFile / cacheFile / engineFile - file names for the ONNX model, the
#       INT8 calibration cache and the TensorRT engine
pass_args = dict(
    precision="int8",
    nCalibration=10,
    dummy_in=dummy_in,
    input_generator=input_generator,
    onnxFile="model_int8.onnx",
    cacheFile="model_int8.INT8Cache",
    engineFile="model_int8.plan",
)
engine = quantize_tensorrt_transform_pass(mg, pass_args)
test_quantize_tensorrt_transform_pass(
    data_module.test_dataloader,
    pass_args["engineFile"],
)

  if amax.numel() == 1:


verbose: False, log level: Level.ERROR

Succeeded finding ONNX file!
Succeeded parsing .onnx file!
Succeeded building engine!
engine.__len__() = 2
engine.__sizeof__() = 56
engine.__str__() = <tensorrt_bindings.tensorrt.ICudaEngine object at 0x7f2e62651270>

inspector.execution_context= None
inspector.error_recorder= None
Engine information:
Layers:
/feature_layers.0/_input_quantizer/QuantizeLinear
feature_layers.0.weight + /feature_layers.0/_weight_quantizer/QuantizeLinear + /feature_layers.0/Conv
feature_layers.3.weight + /feature_layers.3/_weight_quantizer/QuantizeLinear + /feature_layers.3/Conv
/feature_layers.6/MaxPool
feature_layers.7.weight + /feature_layers.7/_weight_quantizer/QuantizeLinear + /feature_layers.7/Conv
feature_layers.10.weight + /feature_layers.10/_weight_quantizer/QuantizeLinear + /feature_layers.10/Conv
/feature_layers.13/MaxPool
feature_layers.14.weight + /feature_layers.14/_weight_quantizer/QuantizeLinear + /feature_layers.14/Conv
feature_layers.17.weight + /fe