In [2]:
import sys
import logging
import os
from pathlib import Path
from pprint import pprint as pp

# Locate the in-repo `machop` package, expected two directories above the
# notebook's working directory, so the `chop.*` imports below can resolve.
machop_path = Path(".").resolve().parent.parent / "machop"
assert machop_path.exists(), "Failed to find machop at: {}".format(machop_path)
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if str(machop_path) not in sys.path:
    sys.path.append(str(machop_path))

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch_tensorrt

from torch.utils.tensorboard import SummaryWriter

import pytorch_quantization
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import quant_modules
from pytorch_quantization.tensor_quant import QuantDescriptor
from pytorch_quantization import calib
from tqdm import tqdm

print(pytorch_quantization.__version__)

from chop.dataset import MaseDataModule, get_dataset_info
from chop.tools.logger import set_logging_verbosity

from chop.tools import get_cf_args, get_dummy_input, load_config
from chop.passes.graph import (
    save_node_meta_param_interface_pass,
    report_node_meta_param_analysis_pass,
    profile_statistics_analysis_pass,
    add_common_metadata_analysis_pass,
    init_metadata_analysis_pass,
    add_software_metadata_analysis_pass,
    quantize_tensorrt_transform_pass,
    test_quantize_tensorrt_transform_pass,
    fake_quantize_transform_pass,
    graph_calibration_pass,
    evaluate_fake_quantize_pass,
    fake_quantize_to_trt_pass
)
from chop.tools.get_input import InputGenerator
from chop.tools.checkpoint_load import load_model
from chop.ir import MaseGraph

from chop.models import get_model_info, get_model, get_tokenizer

set_logging_verbosity("info")


2.1.3


[32mINFO    [0m [34mSet logging level to info[0m
I0318 17:45:06.131911 140620945725248 logger.py:44] Set logging level to info


In [4]:
# Run configuration: which model/dataset pair to load and the batch size used
# by the data module.
batch_size = 8
model_name = "vgg7"
dataset_name = "cifar10"

# Alternative configuration; swap it in together with the matching checkpoint
# path further down.
# batch_size = 1
# model_name = "facebook/opt-125m:patched"
# dataset_name = "cola"


# Build the data module and run its prepare/setup steps so its dataloaders and
# `dataset_info` are available to the cells below.
data_module = MaseDataModule(
    name=dataset_name,
    batch_size=batch_size,
    model_name=model_name,
    num_workers=0,
)
data_module.prepare_data()
data_module.setup()

# 📝️ change this CHECKPOINT_PATH to the one you trained in Lab1
# The path can be overridden without editing the notebook by exporting
# MASE_CHECKPOINT_PATH before starting the kernel; the original location is
# kept as the fallback.
CHECKPOINT_PATH = os.environ.get(
    "MASE_CHECKPOINT_PATH",
    "/home/qizhu/Desktop/Work/mase/mase_output/test-accu-0.9332.ckpt",
)
# CHECKPOINT_PATH = "/home/qizhu/Desktop/Work/mase/mase_output/opt125.ckpt"
# Fail early with a readable message instead of deep inside the loader.
assert Path(CHECKPOINT_PATH).exists(), "Failed to find checkpoint at: {}".format(
    CHECKPOINT_PATH
)

model_info = get_model_info(model_name)
# quant_modules.initialize()
# Instantiate the architecture without pretrained weights; the weights come
# from the checkpoint loaded below.
model = get_model(
    model_name,
    task="cls",
    dataset_info=data_module.dataset_info,
    pretrained=False,
)

# load_type="pl" selects the PyTorch Lightning checkpoint loader (see the
# "Loaded pytorch lightning checkpoint" log line this cell emits).
model = load_model(load_name=CHECKPOINT_PATH, load_type="pl", model=model)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


[32mINFO    [0m [34mLoaded pytorch lightning checkpoint from /home/qizhu/Desktop/Work/mase/mase_output/test-accu-0.9332.ckpt[0m
I0318 17:45:35.741717 140620945725248 checkpoint_load.py:85] Loaded pytorch lightning checkpoint from /home/qizhu/Desktop/Work/mase/mase_output/test-accu-0.9332.ckpt


In [5]:
# Wrap the loaded model in a MaseGraph so graph passes can be applied to it.
mg = MaseGraph(model=model)
# Second graph built from the same `model` object.
# NOTE(review): presumably intended as an untouched baseline for comparison;
# it is not referenced by any other cell shown here — confirm it is still needed.
ori_mg = MaseGraph(model=model)

# Input generator configured with which_dataloader="train" for the "cls" task.
input_generator = InputGenerator(
    data_module=data_module,
    model_info=model_info,
    task="cls",
    which_dataloader="train",
)

# Take the first item from the generator; it is reused below as the example
# input for metadata analysis and later by the TensorRT pass.
dummy_in = next(iter(input_generator))
# _ = model(**dummy_in)

# Run the metadata passes in order. Each call is unpacked into two values and
# only the first (the graph) is kept.
mg, _ = init_metadata_analysis_pass(mg, None)
mg, _ = add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in})
mg, _ = add_software_metadata_analysis_pass(mg, None)

tensor([[[[-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          ...,
          [ 0.5707, -1.0904, -1.1932,  ..., -2.1179, -2.1179, -2.1179],
          [ 1.3242,  0.0912, -0.9877,  ..., -2.1179, -2.1179, -2.1179],
          [ 1.5125,  1.2728,  0.3481,  ..., -2.1179, -2.1179, -2.1179]],

         [[-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          ...,
          [ 0.7304, -0.9153, -1.0028,  ..., -2.0357, -2.0357, -2.0357],
          [ 1.4832,  0.2577, -0.8102,  ..., -2.0357, -2.0357, -2.0357],
          [ 1.6583,  1.4307,  0.5028,  ..., -2.0357, -2.0357, -2.0357]],

         [[-1.8044, -1.8044, -1.8044,  ..., -1.8044, -1.8044, -1.8044],
          [-1.8044, -1.8044, -

### 

### Quantization pass based on pytorch-quantization

To achieve QAT, we could use fake quantization to simulate the quantization process. We could use the `FakeQuantize` argument to select it.

Here is an example of `pass_args`: we use the layer name to select which layers to quantize, achieving layer-wise PTQ, and for each selected layer we can choose the calibration method and the precision.

In [None]:
## fake quantize the graph
def _int_layer_config(input_calibrator, weight_calibrator):
    """Build one per-layer "int" quantization entry for `pass_args`.

    Args:
        input_calibrator: value stored under config -> input -> "calibrator".
        weight_calibrator: value stored under config -> weight -> "calibrator".

    Returns:
        A fresh nested dict on every call, so no two layers share a config
        object.
    """
    # NOTE(review): "precesion" looks like a misspelling of "precision". The key
    # is kept exactly as written because the pass that reads it is not visible
    # here — confirm which spelling the pass expects.
    input_config = {
        "precesion": 8,
        "calibrator": input_calibrator,
        "quantize_axis": None,
    }
    weight_config = {
        "calibrator": weight_calibrator,
        "quantize_axis": None,
    }
    return {"config": {"name": "int", "input": input_config, "weight": weight_config}}


# "by": "name" selects layers by name; layers not listed fall under "default".
# The first feature layer uses max/histogram calibrators for input/weight,
# while every listed classifier layer uses histogram/max.
pass_args = {
    "by": "name",
    "default": {"config": {"name": None}},
    "feature_layers_0": _int_layer_config("max", "histogram"),
    **{
        classifier_name: _int_layer_config("histogram", "max")
        for classifier_name in (
            "classifier_0",
            "classifier_1",
            "classifier_2",
            "classifier_3",
        )
    },
}

# NOTE(review): the metadata analysis passes earlier in this notebook are
# unpacked as `mg, _ = ...`; confirm this pass returns the graph on its own.
mg = fake_quantize_transform_pass(mg, pass_args)


### Calibration

After quantizing the model, we need to calibrate it to obtain the `amax` value for each layer. We can use the following `pass_args` to control the calibration process, including the method, the number of samples, and the calibration dataset.

In [None]:
# Arguments for the calibration pass applied to the fake-quantized graph.
pass_args_calibrate = {
    # Calibration method requested from the pass.
    "calibrator": "percentile",
    # NOTE(review): presumably the percentile value(s) used by the "percentile"
    # method — confirm against the pass implementation.
    "percentiles": [99],
    # Data module supplied to the pass as the source of calibration data.
    "data_module": data_module,
    # NOTE(review): presumably the number of batches fed through the model
    # during calibration — confirm.
    "num_batches": 100,
}

# The return value is not captured. NOTE(review): presumably the pass updates
# `mg` in place — confirm, otherwise the calibration result is lost.
graph_calibration_pass(mg,  pass_args_calibrate)

### Evaluate


In [None]:
# Arguments for the evaluation pass: only the data module is supplied.
pass_args_eval = {
    "data_module": data_module,
}

# Evaluate the fake-quantized graph; the pass result is rebound to `mg`.
# NOTE(review): confirm the pass returns the graph itself rather than a
# (graph, info) pair, since later cells keep using `mg` as a graph.
mg = evaluate_fake_quantize_pass(mg, pass_args_eval)

### Model export

We first need to export the model to ONNX format, and then build a TensorRT engine from it.

In [None]:
# Export the fake-quantized graph to ONNX and then to a TensorRT engine.
# Note: this rebinds `pass_args`, replacing the quantization config defined in
# the fake-quantization cell.
pass_args = {
    # Output path for the exported ONNX model.
    "onnxFile": "onnx_test.onnx",
    # Output path for the TensorRT engine.
    "engineFile": "engine_test.plan",
    # The bound method is passed without being called.
    # NOTE(review): presumably the pass invokes it to obtain the dataloader — confirm.
    "dataloader": data_module.test_dataloader,
}
mg = fake_quantize_to_trt_pass(mg, pass_args)


### Quantization pass based on TensorRT

In [6]:
# Arguments for the TensorRT quantization pass. This rebinds `pass_args` again.
pass_args = {
    "precision": 'int8',                # target precision requested from the TensorRT pass
    "nCalibration": 10,                 # NOTE(review): presumably the number of calibration batches — confirm
    "dummy_in": dummy_in,               # example input taken from the input generator earlier
    "input_generator": input_generator, # the input generator for feeding data to the model
    "onnxFile": 'model.onnx',           # ONNX file path (the pass logs whether it found/parsed it)
    "cacheFile": 'model.INT8Cache',     # int8 cache file path (the pass logs "Failed finding int8 cache!" when absent)
    "engineFile": 'model.plan'          # engine file path, reused by the test pass below
}
# Build the engine from the graph, then run the test pass against the engine
# file. `data_module.test_dataloader` is passed as a bound method, not called.
# NOTE(review): presumably the test pass calls it to obtain the dataloader — confirm.
engine = quantize_tensorrt_transform_pass(mg, pass_args)
test_quantize_tensorrt_transform_pass(data_module.test_dataloader, pass_args['engineFile'])

verbose: False, log level: Level.ERROR

Succeeded finding ONNX file!
Succeeded parsing .onnx file!
Failed finding int8 cache!
