In [1]:
import os
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms

import time
import copy
import numpy as np


In [2]:
def set_random_seeds(random_seed=0):

    torch.manual_seed(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

In [3]:
def prepare_dataloader(num_workers=8,
                       train_batch_size=128,
                       eval_batch_size=256):

    # train_transform = transforms.Compose([
    #     transforms.RandomCrop(32, padding=4),
    #     transforms.RandomHorizontalFlip(),
    #     transforms.ToTensor(),
    #     # transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    #     transforms.Normalize(mean=(0.485, 0.456, 0.406),
    #                          std=(0.229, 0.224, 0.225))
    # ])

    # test_transform = transforms.Compose([
    #     transforms.ToTensor(),
    #     # transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    #     transforms.Normalize(mean=(0.485, 0.456, 0.406),
    #                          std=(0.229, 0.224, 0.225))
    # ])

    transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5), (0.5))])

    batch_size = 16

    # train_set = torchvision.datasets.CIFAR10(root="data",
    #                                          train=True,
    #                                          download=True,
    #                                          transform=train_transform)
    train_set = torchvision.datasets.MNIST(root='./data', train=True,
                                        download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

    # We will use test set for validation and test in this project.
    # Do not use test set for validation in practice!
    # test_set = torchvision.datasets.CIFAR10(root="data",
    #                                         train=False,
    #                                         download=True,
    #                                         transform=test_transform)
    test_set = torchvision.datasets.MNIST(root='./data', train=False,
                                       download=True, transform=transform)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

    train_sampler = torch.utils.data.RandomSampler(train_set)
    test_sampler = torch.utils.data.SequentialSampler(test_set)

    # train_loader = torch.utils.data.DataLoader(dataset=train_set,
    #                                            batch_size=train_batch_size,
    #                                            sampler=train_sampler,
    #                                            num_workers=num_workers)

    # test_loader = torch.utils.data.DataLoader(dataset=test_set,
    #                                           batch_size=eval_batch_size,
    #                                           sampler=test_sampler,
    #                                           num_workers=num_workers)

    classes = ('0', '1', '2', '3',
           '4', '5', '6', '7', '8', '9')

    return train_loader, test_loader

In [4]:
def evaluate_model(model, test_loader, device, criterion=None):

    model.eval()
    model.to(device)

    running_loss = 0
    running_corrects = 0

    for inputs, labels in test_loader:

        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)

        if criterion is not None:
            loss = criterion(outputs, labels).item()
        else:
            loss = 0

        # statistics
        running_loss += loss * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)

    eval_loss = running_loss / len(test_loader.dataset)
    eval_accuracy = running_corrects / len(test_loader.dataset)

    return eval_loss, eval_accuracy

In [5]:
def train_model(model,
                train_loader,
                test_loader,
                device,
                learning_rate=1e-1,
                num_epochs=200):

    # The training configurations were not carefully selected.

    criterion = nn.CrossEntropyLoss()

    model.to(device)

    # It seems that SGD optimizer is better than Adam optimizer for ResNet18 training on CIFAR10.
    optimizer = optim.SGD(model.parameters(),
                          lr=learning_rate,
                          momentum=0.9,
                          weight_decay=1e-4)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=500)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[100, 150],
                                                     gamma=0.1,
                                                     last_epoch=-1)
    # optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

    # Evaluation
    model.eval()
    eval_loss, eval_accuracy = evaluate_model(model=model,
                                              test_loader=test_loader,
                                              device=device,
                                              criterion=criterion)
    print("Epoch: {:03d} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(
        0, eval_loss, eval_accuracy))

    for epoch in range(num_epochs):

        # Training
        model.train()

        running_loss = 0
        running_corrects = 0

        for inputs, labels in train_loader:

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # statistics
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

        train_loss = running_loss / len(train_loader.dataset)
        train_accuracy = running_corrects / len(train_loader.dataset)

        # Evaluation
        model.eval()
        eval_loss, eval_accuracy = evaluate_model(model=model,
                                                  test_loader=test_loader,
                                                  device=device,
                                                  criterion=criterion)

        # Set learning rate scheduler
        scheduler.step()

        print(
            "Epoch: {:03d} Train Loss: {:.3f} Train Acc: {:.3f} Eval Loss: {:.3f} Eval Acc: {:.3f}"
            .format(epoch + 1, train_loss, train_accuracy, eval_loss,
                    eval_accuracy))
        
    print("End Training")
    return model

In [6]:
def calibrate_model(model, loader, device=torch.device("cpu:0")):

    model.to(device)
    model.eval()

    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        _ = model(inputs)

In [7]:
def measure_inference_latency(model,
                              device,
                              input_size=(1, 1, 28, 28),
                              num_samples=100,
                              num_warmups=10):

    model.to(device)
    model.eval()

    x = torch.rand(size=input_size).to(device)

    with torch.no_grad():
        for _ in range(num_warmups):
            _ = model(x)
    torch.cuda.synchronize()

    with torch.no_grad():
        start_time = time.time()
        for _ in range(num_samples):
            _ = model(x)
            torch.cuda.synchronize()
        end_time = time.time()
    elapsed_time = end_time - start_time
    elapsed_time_ave = elapsed_time / num_samples

    return elapsed_time_ave

In [8]:
def save_model(model, model_dir, model_filename):

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, model_filename)
    torch.save(model.state_dict(), model_filepath)


def load_model(model, model_filepath, device):

    model.load_state_dict(torch.load(model_filepath, map_location=device))

    return model


def save_torchscript_model(model, model_dir, model_filename):

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, model_filename)
    torch.jit.save(torch.jit.script(model), model_filepath)


def load_torchscript_model(model_filepath, device):

    model = torch.jit.load(model_filepath, map_location=device)

    return model

In [9]:
class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # # QuantStub converts tensors from floating point to quantized
        # self.quant = torch.quantization.QuantStub()
        # self.conv = torch.nn.Conv2d(3, 16, 1)
        # self.bn = torch.nn.BatchNorm2d(1)
        # self.relu = torch.nn.ReLU()
        self.conv1 = torch.nn.Conv2d(1, 32, 3)
        self.bn1 = torch.nn.BatchNorm2d(32)
        self.relu1 = torch.nn.ReLU()
        self.conv2 = torch.nn.Conv2d(32, 32, 3, padding=(2,2))
        self.bn2 = torch.nn.BatchNorm2d(32)
        self.relu2 = torch.nn.ReLU()
        self.conv3 = torch.nn.Conv2d(32, 10, 1)
        self.bn3 = torch.nn.BatchNorm2d(10)
        self.relu3 = torch.nn.ReLU()
        self.avgpool = torch.nn.AdaptiveAvgPool2d((1,1))
        # DeQuantStub converts tensors from quantized to floating point
        # self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        # x = self.quant(x)
        #x = torch.flatten(x, 1)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        # x = self.dequant(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return x

In [10]:
def create_model(num_classes=10):

    # The number of channels in ResNet18 is divisible by 8.
    # This is required for fast GEMM integer matrix multiplication.
    # model = torchvision.models.resnet18(pretrained=False)
    # model = resnet18(num_classes=num_classes, pretrained=False)
    model = M()

    # We would use the pretrained ResNet18 as a feature extractor.
    # for param in model.parameters():
    #     param.requires_grad = False

    # Modify the last FC layer
    # num_features = model.fc.in_features
    # model.fc = nn.Linear(num_features, 10)

    return model

In [11]:
class QuantizedM(nn.Module):
    def __init__(self, model_fp32):

        super(QuantizedM, self).__init__()
        # QuantStub converts tensors from floating point to quantized.
        # This will only be used for inputs.
        self.quant = torch.quantization.QuantStub()
        # DeQuantStub converts tensors from quantized to floating point.
        # This will only be used for outputs.
        self.dequant = torch.quantization.DeQuantStub()
        # FP32 model
        self.model_fp32 = model_fp32

    def forward(self, x):
        # manually specify where tensors will be converted from floating
        # point to quantized in the quantized model
        x = self.quant(x)
        x = self.model_fp32(x)
        # manually specify where tensors will be converted from quantized
        # to floating point in the quantized model
        x = self.dequant(x)
        return x


In [12]:
def model_equivalence(model_1,
                      model_2,
                      device,
                      rtol=1e-05,
                      atol=1e-08,
                      num_tests=100,
                      input_size=(1, 1, 28, 28)):

    model_1.to(device)
    model_2.to(device)

    for _ in range(num_tests):
        x = torch.rand(size=input_size).to(device)
        y1 = model_1(x).detach().cpu().numpy()
        y2 = model_2(x).detach().cpu().numpy()
        if np.allclose(a=y1, b=y2, rtol=rtol, atol=atol,
                       equal_nan=False) == False:
            print("Model equivalence test sample failed: ")
            print(y1)
            print(y2)
            return False

    return True

In [13]:
def clip(input, qbit):
  max_value = 2. ** (qbit-1) -1.
  min_value = -2. ** (qbit-1)
  output = np.clip(input, min_value, max_value)
  return output

def uniform_quantize(input, qbit):
  min_value = min(input)
  max_value = max(input)
  abs_min_value = abs(min_value)
  abs_max_value = abs(max_value)
  if (abs_max_value >= abs_min_value):
    min_cond = -abs_max_value
    max_cond = abs_max_value
  else:
    min_cond = -abs_min_value
    max_cond = abs_min_value
  qmin = 0
  qmax = 2. ** qbit - 1.
  scale = (max_cond - min_cond) / (qmax - qmin)
  output = input / scale
  output = np.floor(output)
  output = clip(input=output, qbit=qbit)
  return output, scale

In [14]:
def main():

    random_seed = 0
    num_classes = 10
    cuda_device = torch.device("cuda:0")
    cpu_device = torch.device("cpu:0")

    model_dir = "saved_models"
    model_filename = "mnist.pt"
    quantized_model_filename = "mnist_quantized.pt"
    model_filepath = os.path.join(model_dir, model_filename)
    quantized_model_filepath = os.path.join(model_dir,
                                            quantized_model_filename)

    set_random_seeds(random_seed=random_seed)

    # Create an untrained model.
    model = create_model(num_classes=num_classes)

    train_loader, test_loader = prepare_dataloader(num_workers=8,
                                                   train_batch_size=128,
                                                   eval_batch_size=256)

    # Train model.
    print("Training Model...")
    
    model = train_model(model=model,
                        train_loader=train_loader,
                        test_loader=test_loader,
                        device=cuda_device,
                        learning_rate=1e-1,
                        num_epochs=2)
    # Save model.
    save_model(model=model, model_dir=model_dir, model_filename=model_filename)
    
    # Load a pretrained model.
    model = load_model(model=model,
                       model_filepath=model_filepath,
                       device=cuda_device) 
    pt = torch.load(model_filepath)
    print(type(pt))
    '''
    cnt_extract = 0
    for i, v in enumerate(pt.keys()):
        fp = 'w' + str(cnt_extract) + '.' + v + '.csv'
        fp_scale = 'w' + str(cnt_extract) + '.' + v + '_scale.csv'
        print(pt[v].cpu().shape)
        if v.find("num_batches_tracked") == -1:
            _w = pt[v].cpu()
            if len(_w.shape) == 2:
              _reshape = np.reshape(_w, (_w.shape[0] * _w.shape[1]))
            elif len(_w.shape) == 1:
              _reshape = np.reshape(_w, (_w.shape))
            elif len(_w.shape) == 4:
              _reshape = np.reshape(_w, (_w.shape[0] * _w.shape[1] * _w.shape[2] * _w.shape[3]))
            else:
              print('unknown arch')
            _reshape_q, _scale_w = uniform_quantize(input=_reshape, qbit=8)
        np.savetxt(fp, _reshape_q, delimiter=',')
        np.savetxt(fp_scale, [_scale_w], delimiter=',')
        cnt_extract +=1
    '''
    # Move the model to CPU since static quantization does not support CUDA currently.
    model.to(cpu_device)
        
    # Make a copy of the model for layer fusion
    fused_model = copy.deepcopy(model)
    
    #model.train()
    
    fused_model.eval()
    # The model has to be switched to training mode before any layer fusion.
    # Otherwise the quantization aware training will not work correctly.
    # fused_model.train()
    print(fused_model)

    # Fuse the model in place rather manually.
    fused_model = torch.quantization.fuse_modules(fused_model,
                                                  [["conv1", "bn1", "relu1"], ["conv2","bn2","relu2"], ["conv3","bn3","relu3"]],
                                                  inplace=True)
    # fused_model = torch.quantization.fuse_modules(fused_model,
    #                                               [["conv1", "bn1", "relu"]],
    #                                               inplace=True)
    # for module_name, module in fused_model.named_children():
    #     if "layer" in module_name:
    #         for basic_block_name, basic_block in module.named_children():
    #             torch.quantization.fuse_modules(
    #                 basic_block, [["conv1", "bn1", "relu1"], ["conv2", "bn2"]],
    #                 inplace=True)
    #             for sub_block_name, sub_block in basic_block.named_children():
    #                 if sub_block_name == "downsample":
    #                     torch.quantization.fuse_modules(sub_block,
    #                                                     [["0", "1"]],
    #                                                     inplace=True)

    # Print FP32 model.
    print(model)
    # Print fused model.
    print(fused_model)

    # Model and fused model should be equivalent.
    model.eval()
    fused_model.eval()
    assert model_equivalence(
        model_1=model,
        model_2=fused_model,
        device=cpu_device,
        rtol=1e-03,
        atol=1e-06,
        num_tests=100,
        input_size=(
            1, 1, 28,
            28)), "Fused model is not equivalent to the original model!"

    # Prepare the model for quantization aware training. This inserts observers in
    # the model that will observe activation tensors during calibration.
    quantized_model = QuantizedM(model_fp32=fused_model)
    # Using un-fused model will fail.
    # Because there is no quantized layer implementation for a single batch normalization layer.
    # quantized_model = QuantizedResNet18(model_fp32=model)
    # Select quantization schemes from
    # https://pytorch.org/docs/stable/quantization-support.html
    quantization_config = torch.quantization.get_default_qconfig("fbgemm")
    # Custom quantization configurations
    # quantization_config = torch.quantization.default_qconfig
    # quantization_config = torch.quantization.QConfig(activation=torch.quantization.MinMaxObserver.with_args(dtype=torch.quint8), weight=torch.quantization.MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric))

    quantized_model.qconfig = quantization_config

    # Print quantization configurations
    print(quantized_model.qconfig)

    # https://pytorch.org/docs/stable/_modules/torch/quantization/quantize.html#prepare_qat
    torch.quantization.prepare_qat(quantized_model, inplace=True)

    # # Use training data for calibration.
    print("Training QAT Model...")
    quantized_model.train()
    train_model(model=quantized_model,
                train_loader=train_loader,
                test_loader=test_loader,
                device=cuda_device,
                learning_rate=1e-3,
                num_epochs=2)
    quantized_model.to(cpu_device)

    # Using high-level static quantization wrapper
    # The above steps, including torch.quantization.prepare, calibrate_model, and torch.quantization.convert, are also equivalent to
    # quantized_model = torch.quantization.quantize_qat(model=quantized_model, run_fn=train_model, run_args=[train_loader, test_loader, cuda_device], mapping=None, inplace=False)

    quantized_model = torch.quantization.convert(quantized_model, inplace=True)

    quantized_model.eval()

    # Print quantized model.
    print(quantized_model)

    # Save quantized model.
    save_torchscript_model(model=quantized_model,
                           model_dir=model_dir,
                           model_filename=quantized_model_filename)

    # Load quantized model.
    quantized_jit_model = load_torchscript_model(
        model_filepath=quantized_model_filepath, device=cpu_device)
    
    #Extract Weights, bias
    pt_qat = torch.load(quantized_model_filepath)
    print(type(pt_qat))
    extract_cnt = 0
    wdir = "/home/aipslab1/beomsu1000/qat_example/saved_weights/"
    
    for index, weight in enumerate(quantized_model.state_dict()):
        np.set_printoptions(suppress=True)
        _w = quantized_model.state_dict()[weight].detach().cpu()
        print(weight, len(_w.shape))
        if len(_w.shape) == 1:
            _reshape = torch.reshape(_w, (_w.shape))
        elif len(_w.shape) == 4:
            _reshpae = torch.reshape(_w, (-1,))
        elif len(_w.shape) == 0:
            _reshpae = torch.reshape(_w, (-1,))
        else:
            print("unknown type")
        _reshape_q, _scale_w = uniform_quantize(input= _reshape, qbit=8)
        if(len(weight) > 15):
            fp = wdir + "w" + str(extract_cnt) + weight[10:] + ".csv"
            fp_scale = wdir + "w" + str(extract_cnt) + weight[10:] + "_scale.csv"
        else:
            fp = wdir + "w" + str(extract_cnt) + "." + weight + ".csv"
            fp_scale = wdir + "w" + str(extract_cnt) + "." + weight + "_scale.csv"
        np.savetxt(fp, _reshape_q, delimiter = ",")
        np.savetxt(fp_scale, [_scale_w], delimiter=",")   
        extract_cnt += 1
        
    #converting to csv
    
    # dataset-ubyte to csv
    def convert(imgf, labelf, outf, n):
        f = open(imgf, "rb")
        o = open(outf, "w")
        l = open(labelf, "rb")

        f.read(16)
        l.read(8)
        images = []

        for i in range(n):
            image = [ord(l.read(1))]
            for j in range(28*28):
                image.append(ord(f.read(1)))
            images.append(image)

        for image in images:
            o.write(",".join(str(pix) for pix in image)+"\n")
        f.close()
        o.close()
        l.close()

    rawdir = "/home/aipslab1/beomsu1000/data/MNIST/raw/"
    convertdir = wdir + "mnist_test.csv"
    if os.path.exists(convertdir) == False:
        convert(rawdir + "t10k-images-idx3-ubyte", rawdir + "t10k-labels-idx1-ubyte",
            wdir + "mnist_test.csv", 10000)
        print("Convert Finished!")
    else:
        print(convertdir + ": exists")
        
        
    
    
    #print each model accuarcy
    
    _, fp32_eval_accuracy = evaluate_model(model=model,
                                           test_loader=test_loader,
                                           device=cpu_device,
                                           criterion=None)
    _, int8_eval_accuracy = evaluate_model(model=quantized_jit_model,
                                           test_loader=test_loader,
                                           device=cpu_device,
                                           criterion=None)

    # Skip this assertion since the values might deviate a lot.
    # assert model_equivalence(model_1=model, model_2=quantized_jit_model, device=cpu_device, rtol=1e-01, atol=1e-02, num_tests=100, input_size=(1,3,32,32)), "Quantized model deviates from the original model too much!"

    print("FP32 evaluation accuracy: {:.3f}".format(fp32_eval_accuracy))
    print("INT8 evaluation accuracy: {:.3f}".format(int8_eval_accuracy))
    

    fp32_cpu_inference_latency = measure_inference_latency(
        model=model,                                                   
        device=cpu_device,                                                   
        input_size=(32, 1,3, 3),
        num_samples=100)
    
    int8_cpu_inference_latency = measure_inference_latency(
        model=quantized_model,
        device=cpu_device,
        input_size=(32,1,3,3),
        num_samples=100)
    
    int8_jit_cpu_inference_latency = measure_inference_latency(
        model=quantized_jit_model,
        device=cpu_device,
        input_size=(32,1,3,3),
        num_samples=100)
    
    fp32_gpu_inference_latency = measure_inference_latency(
        model=model,
        device=cuda_device,
        input_size=(32, 1,3, 3),
        num_samples=100)

    print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(
        fp32_cpu_inference_latency * 1000))
    print("FP32 CUDA Inference Latency: {:.2f} ms / sample".format(
        fp32_gpu_inference_latency * 1000))
    print("INT8 CPU Inference Latency: {:.2f} ms / sample".format(
        int8_cpu_inference_latency * 1000))
    print("INT8 JIT CPU Inference Latency: {:.2f} ms / sample".format(
        int8_jit_cpu_inference_latency * 1000))


if __name__ == "__main__":

    main()

Training Model...
Epoch: 000 Eval Loss: 2.301 Eval Acc: 0.103
Epoch: 001 Train Loss: 0.816 Train Acc: 0.797 Eval Loss: 1.062 Eval Acc: 0.620
Epoch: 002 Train Loss: 0.378 Train Acc: 0.916 Eval Loss: 0.354 Eval Acc: 0.904
End Training
<class 'collections.OrderedDict'>
M(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU()
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU()
  (conv3): Conv2d(32, 10, kernel_size=(1, 1), stride=(1, 1))
  (bn3): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU()
  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
)
M(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (rel



Epoch: 000 Eval Loss: 0.355 Eval Acc: 0.904
Epoch: 001 Train Loss: 0.181 Train Acc: 0.953 Eval Loss: 0.159 Eval Acc: 0.956
Epoch: 002 Train Loss: 0.158 Train Acc: 0.956 Eval Loss: 0.129 Eval Acc: 0.964
End Training
QuantizedM(
  (quant): Quantize(scale=tensor([0.0157]), zero_point=tensor([64]), dtype=torch.quint8)
  (dequant): DeQuantize()
  (model_fp32): M(
    (conv1): QuantizedConvReLU2d(1, 32, kernel_size=(3, 3), stride=(1, 1), scale=0.05050439015030861, zero_point=0)
    (bn1): Identity()
    (relu1): Identity()
    (conv2): QuantizedConvReLU2d(32, 32, kernel_size=(3, 3), stride=(1, 1), scale=0.08372984081506729, zero_point=0, padding=(2, 2))
    (bn2): Identity()
    (relu2): Identity()
    (conv3): QuantizedConvReLU2d(32, 10, kernel_size=(1, 1), stride=(1, 1), scale=2.495201349258423, zero_point=0)
    (bn3): Identity()
    (relu3): Identity()
    (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
  )
)
<class 'torch.jit._script.RecursiveScriptModule'>
quant.scale 1
quant.zero_poi



Convert Finished!
FP32 evaluation accuracy: 0.904
INT8 evaluation accuracy: 0.949
FP32 CPU Inference Latency: 2.81 ms / sample
FP32 CUDA Inference Latency: 0.54 ms / sample
INT8 CPU Inference Latency: 0.48 ms / sample
INT8 JIT CPU Inference Latency: 0.18 ms / sample
