### Initialize the model, the dataset, and the MASE graph


In [1]:
import sys
import logging
import os
from pathlib import Path
from pprint import pprint as pp

# Locate the "machop" directory two levels above the current working
# directory and append it to sys.path so the `chop` imports below resolve.
machop_path = Path(".").resolve().parent.parent / "machop"
assert machop_path.exists(), f"Failed to find machop at: {machop_path}"
sys.path.append(os.fspath(machop_path))

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch_tensorrt

from torch.utils.tensorboard import SummaryWriter

import pytorch_quantization
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import quant_modules
from pytorch_quantization.tensor_quant import QuantDescriptor
from pytorch_quantization import calib
from tqdm import tqdm

# Record the installed pytorch-quantization version in the notebook output.
print(pytorch_quantization.__version__)

from chop.dataset import MaseDataModule, get_dataset_info
from chop.tools.logger import set_logging_verbosity

from chop.tools import get_cf_args, get_dummy_input, load_config
from chop.passes.graph import (
    save_node_meta_param_interface_pass,
    report_node_meta_param_analysis_pass,
    profile_statistics_analysis_pass,
    add_common_metadata_analysis_pass,
    init_metadata_analysis_pass,
    add_software_metadata_analysis_pass,
    quantize_tensorrt_transform_pass,
    test_quantize_tensorrt_transform_pass,
    fake_quantize_transform_pass,
    graph_calibration_pass,
    evaluate_pytorch_model_pass,
    fake_quantize_to_trt_pass,
    mixed_precision_transform_pass
)
from chop.tools.get_input import InputGenerator
from chop.tools.checkpoint_load import load_model
from chop.ir import MaseGraph

from chop.models import get_model_info, get_model, get_tokenizer

# Set the chop logger's level to "info" for the rest of the notebook.
set_logging_verbosity("info")


2.2.1


[32mINFO    [0m [34mSet logging level to info[0m
I0328 23:18:10.889825 140711983724352 logger.py:44] Set logging level to info


In [2]:
# --- Run configuration -------------------------------------------------------
batch_size = 16
model_name = "vgg7"
dataset_name = "cifar10"

# Alternative configuration (language model), kept for reference:
# batch_size = 1
# model_name = "facebook/opt-125m:patched"
# dataset_name = "cola"

# 📝️ change this CHECKPOINT_PATH to the one you trained in Lab1
# The path can be supplied through the MASE_CHECKPOINT_PATH environment
# variable so the notebook does not have to be edited on every machine; the
# original hardcoded location is kept as the fallback default.
CHECKPOINT_PATH = os.environ.get(
    "MASE_CHECKPOINT_PATH",
    "/home/qizhu/Desktop/Work/mase/mase_output/test-accu-0.9332.ckpt",
)
# CHECKPOINT_PATH = "/home/qizhu/Desktop/Work/mase/mase_output/opt125.ckpt"

# Fail early, with a readable message, before any dataset preparation starts.
assert Path(CHECKPOINT_PATH).exists(), "Failed to find checkpoint at: {}".format(
    CHECKPOINT_PATH
)

# --- Dataset -----------------------------------------------------------------
data_module = MaseDataModule(
    name=dataset_name,
    batch_size=batch_size,
    model_name=model_name,
    num_workers=0,
)
data_module.prepare_data()
data_module.setup()

# --- Model -------------------------------------------------------------------
model_info = get_model_info(model_name)
# quant_modules.initialize()
model = get_model(
    model_name,
    task="cls",
    dataset_info=data_module.dataset_info,
    pretrained=False)

# Load the trained weights from the PyTorch Lightning checkpoint into the model.
model = load_model(load_name=CHECKPOINT_PATH, load_type="pl", model=model)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


[32mINFO    [0m [34mLoaded pytorch lightning checkpoint from /home/qizhu/Desktop/Work/mase/mase_output/test-accu-0.9332.ckpt[0m
I0328 23:18:15.460795 140711983724352 checkpoint_load.py:85] Loaded pytorch lightning checkpoint from /home/qizhu/Desktop/Work/mase/mase_output/test-accu-0.9332.ckpt


In [3]:
# Wrap the loaded model in a MaseGraph; `mg` is the graph that the passes
# below annotate and that later cells transform.
mg = MaseGraph(model=model)
# A second MaseGraph built from the same `model` object. It is not referenced
# again in the visible part of this notebook — presumably kept as an
# untransformed reference (TODO confirm).
ori_mg = MaseGraph(model=model)

# get the input generator (draws batches from the "train" dataloader)
input_generator = InputGenerator(
    data_module=data_module,
    model_info=model_info,
    task="cls",
    which_dataloader="train",
)

# a demonstration of how to feed an input value to the model:
# take one batch from the generator and use it as the dummy input below.
dummy_in = next(iter(input_generator))
# _ = model(**dummy_in)

# add all the parameters to masegraph: run the three metadata analysis passes
# in sequence. Each returns a (graph, info) pair; the info part is discarded.
mg, _ = init_metadata_analysis_pass(mg, None)
mg, _ = add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in})
mg, _ = add_software_metadata_analysis_pass(mg, None)

tensor([[[[ 0.0056,  0.0056,  0.0056,  ..., -0.9192, -0.8849, -0.9534],
          [ 0.0056,  0.0056,  0.0056,  ..., -1.0390, -1.0048, -0.9534],
          [ 0.0056,  0.0056,  0.0056,  ..., -0.8849, -1.0733, -1.0733],
          ...,
          [ 0.0056,  0.0056,  0.0056,  ..., -1.1418, -1.1247, -1.0562],
          [ 0.0056,  0.0056,  0.0056,  ..., -2.1179, -2.1179, -2.1179],
          [ 0.0056,  0.0056,  0.0056,  ..., -2.1179, -2.1179, -2.1179]],

         [[-0.0049, -0.0049, -0.0049,  ..., -0.2500, -0.1975, -0.2675],
          [-0.0049, -0.0049, -0.0049,  ..., -0.3725, -0.3375, -0.2850],
          [-0.0049, -0.0049, -0.0049,  ..., -0.2150, -0.4076, -0.3901],
          ...,
          [-0.0049, -0.0049, -0.0049,  ..., -0.7402, -0.7227, -0.6527],
          [-0.0049, -0.0049, -0.0049,  ..., -2.0357, -2.0357, -2.0357],
          [-0.0049, -0.0049, -0.0049,  ..., -2.0357, -2.0357, -2.0357]],

         [[ 0.0082,  0.0082,  0.0082,  ..., -0.2184, -0.1661, -0.2184],
          [ 0.0082,  0.0082,  

### 

### Quantization pass based on pytorch-quantization

To achieve QAT, we can use fake quantization to simulate the quantization process. Setting the `FakeQuantize` argument to `True` in a layer's config selects it.

Here is an example of `pass_args`: we use the layer name to specify which layers to quantize, achieving layer-wise PTQ, and for each layer we can select the calibration method and the precision.

In [6]:
## fake quantize the graph
def _fake_quant_layer_config():
    """Build the per-layer config shared by every quantized layer below.

    A new set of nested dicts is created on each call, so no two layers share
    a mutable config object (the original literal also used separate dicts).

    Returns:
        dict: ``{"config": {...}}`` with fake quantization enabled, name
        ``"int"``, and the ``"max"`` calibrator for both input and weight.
    """
    return {
        "config": {
            "FakeQuantize": True,
            "name": "int",
            "input": {
                # NOTE(review): the key is spelled "precesion" (sic). It is a
                # runtime key and is kept unchanged here — confirm which
                # spelling the transform pass actually reads.
                "precesion": 8,
                "calibrator": "max",
                "quantize_axis": None,
            },
            "weight": {
                "calibrator": "max",
                "quantize_axis": None,
            },
        }
    }


# Layers that receive the fake-quantization config, in the original order.
# classifier_1, classifier_2 and classifier_3 have no entry (their configs
# were commented out in the original cell, classifier_1 with a "histogram"
# input calibrator); presumably they fall back to "default" — TODO confirm.
_QUANTIZED_LAYER_NAMES = (
    "feature_layers_0",
    "feature_layers_3",
    "feature_layers_7",
    "feature_layers_10",
    "feature_layers_14",
    "feature_layers_17",
    "classifier_0",
)

pass_args_mixed_precision = {
    "by": "name",
    "default": {"config": {"name": None}},
    **{
        layer_name: _fake_quant_layer_config()
        for layer_name in _QUANTIZED_LAYER_NAMES
    },
}

# Calibration settings handed to the pass alongside the per-layer config:
# an empty calibrator name, a single percentile (99), the data module to draw
# batches from, and a batch count of 100.
pass_args_calibrate = dict(
    calibrator="",
    percentiles=[99],
    data_module=data_module,
    num_batches=100,
)

# Apply the mixed-precision transform; `mg` is rebound to the pass's result.
mg = mixed_precision_transform_pass(
    mg,
    pass_args_mixed_precision,
    pass_args_calibrate,
)

W0328 23:18:34.769930 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.770513 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.770795 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.771353 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.771770 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.772243 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.772717 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.773186 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.773706 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.774156 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.774955 140711983724352 tensor_quantizer.py:184] Disable MaxCalibrator
W0328 23:18:34.775382 140711983724352 tensor_quantizer.py:184] Di

feature_layers.0._input_quantizer       : TensorQuantizer(8bit fake per-tensor amax=4.8753 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.0._weight_quantizer      : TensorQuantizer(8bit fake per-tensor amax=0.5699 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.3._input_quantizer       : TensorQuantizer(8bit fake per-tensor amax=24.0892 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.3._weight_quantizer      : TensorQuantizer(8bit fake per-tensor amax=0.5960 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.7._input_quantizer       : TensorQuantizer(8bit fake per-tensor amax=12.1911 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.7._weight_quantizer      : TensorQuantizer(8bit fake per-tensor amax=0.7710 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.10._input_quantizer      : TensorQuantizer(8bit fake per-tensor amax=7.8324 calibrator=MaxCalibrator scale=1.0 quant)
feature_layers.10._weight_quantizer     : TensorQuantizer(8bit fake

### Calibration

After quantizing the model, we need to calibrate it to obtain the amax value for each layer. We can use the following pass_args to control the calibration process, including the calibration method, the number of samples, and the calibration dataset.

### Evaluate


In [10]:
# Evaluate the transformed graph; the data module is the only argument given.
pass_args_eval = dict(data_module=data_module)

mg = evaluate_pytorch_model_pass(mg, pass_args_eval)

Average execute time for one batch: 2.83ms
Total accuracy: 92.52%


### Model export

We first need to export the model to ONNX format and then build a TensorRT engine from it.

In [6]:
# Export the fake-quantized graph, naming the ONNX file and the TensorRT
# engine file to write. Note that the bound method
# `data_module.test_dataloader` is passed itself, not the result of calling it.
# NOTE(review): the recorded output of this cell ends in
# `RuntimeError: "slow_conv2d_cpu" not implemented for 'Half'`, which looks
# like a half-precision convolution running on the CPU — confirm the device
# and dtype the pass expects.
pass_args = dict(
    onnxFile="onnx_a_3_1.onnx",
    engineFile="engine_a_3_1.plan",
    dataloader=data_module.test_dataloader,
)
mg = fake_quantize_to_trt_pass(mg, pass_args)


verbose: False, log level: Level.ERROR



RuntimeError: "slow_conv2d_cpu" not implemented for 'Half'

### Quantization pass based on TensorRT

In [4]:
# Arguments for the TensorRT quantization pass: target precision, calibration
# settings, inputs, and the file names for the ONNX model, the INT8
# calibration cache and the serialized engine.
# NOTE(review): the recorded output of this cell ends in
# `ZeroDivisionError: division by zero`, and its layer names (seq_blocks.*)
# do not look like the vgg7 model configured above — the output is presumably
# stale; re-run on a fresh kernel to confirm.
pass_args = {
    "precision": 'int8',                                                     # target precision requested from the pass
    "nCalibration": 10,                                                # presumably the number of calibration batches — TODO confirm
    "dummy_in": dummy_in,
    "input_generator": input_generator,                                      # the input generator for feeding data to the model
    "onnxFile": 'model_int8.onnx',
    "cacheFile": 'model_int8.INT8Cache',  
    "engineFile": 'model_int8.plan'
}
# Build the engine, then test the engine file that was just written.
# `data_module.test_dataloader` is passed as the bound method, not called here.
engine = quantize_tensorrt_transform_pass(mg, pass_args)
test_quantize_tensorrt_transform_pass(data_module.test_dataloader, pass_args['engineFile'])

verbose: False, log level: Level.ERROR

Succeeded finding ONNX file!
Succeeded parsing .onnx file!
Succeed finding cahce file: model_int8.INT8Cache
Succeed finding cahce file: model_int8.INT8Cache
Succeeded building engine!
Succeed finding cahce file: model_int8.INT8Cache
Succeed finding cahce file: model_int8.INT8Cache
engine.__len__() = 2
engine.__sizeof__() = 56
engine.__str__() = <tensorrt_bindings.tensorrt.ICudaEngine object at 0x7f9a1d0fb530>

inspector.execution_context= None
inspector.error_recorder= None
Engine information:
Layers:
(Unnamed Layer* 5) [Shuffle]
/seq_blocks.0/BatchNormalization + /seq_blocks.1/Relu
/seq_blocks.2/Gemm
reshape_after_/seq_blocks.2/Gemm
PWN(/seq_blocks.3/Relu)

Bindings:
input
output

Layer information:
(Unnamed Layer* 5) [Shuffle]

/seq_blocks.0/BatchNormalization + /seq_blocks.1/Relu

/seq_blocks.2/Gemm

reshape_after_/seq_blocks.2/Gemm

PWN(/seq_blocks.3/Relu)

[ 0]Input -> DataType.FLOAT (-1, 16) (16, 16) input
[ 1]Output-> DataType.FLOAT (-1, 5

ZeroDivisionError: division by zero