In [1]:
import os
import sys
import time
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision
from torchvision import datasets
import torchvision.transforms as transforms

# # warnings 설정
import warnings
warnings.filterwarnings(
    action='ignore',
    category=DeprecationWarning,
    module=r'.*'
)
warnings.filterwarnings(
    action='default',
    module=r'torch.ao.quantization'
)

# 반복 가능한 결과를 위한 랜덤 시드 지정하기
torch.manual_seed(191009)

<torch._C.Generator at 0x1770a2894b0>

In [2]:
from torch.ao.quantization import QuantStub, DeQuantStub



In [3]:
class AverageMeter(object):
    """평균과 현재 값 계산 및 저장"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


def accuracy(output, target, topk=(1,)):
    """특정 k값을 위해 top k 예측의 정확도 계산"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


def evaluate(model, criterion, data_loader, neval_batches):
    model.eval()
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    cnt = 0
    with torch.no_grad():
        for image, target in data_loader:
            output = model(image)
            loss = criterion(output, target)
            cnt += 1
            acc1, acc5 = accuracy(output, target, topk=(1, 2))
            print('.', end = '')
            top1.update(acc1[0], image.size(0))
            top5.update(acc5[0], image.size(0))
            if cnt >= neval_batches:
                 return top1, top5

    return top1, top5
def load_weights(model:nn.Module, model_path):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    checkpoint = torch.load(model_path, map_location=device)
    if checkpoint.get('state_dict'):
        model.load_state_dict(checkpoint['state_dict'])
    else:
        model.load_state_dict(checkpoint)
    return model

def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

In [4]:
from utils.dataloaders import get_dataloader
import torch
from torch import nn
class MCU_VGGRepC1(nn.Module):
    def __init__(self):
        super(MCU_VGGRepC1, self).__init__()
        self.STAGE0_CONV = nn.Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.STAGE0_RELU = nn.ReLU()
        self.STAGE1_0_CONV = nn.Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.STAGE1_0_RELU = nn.ReLU()
        self.STAGE2_0_CONV = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.STAGE2_0_RELU = nn.ReLU()
        self.STAGE3_0_CONV = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.STAGE3_0_RELU = nn.ReLU()
        self.STAGE4_0_CONV = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.STAGE4_0_RELU = nn.ReLU()
        self.GAP21 = nn.AdaptiveAvgPool2d(output_size=1)
        self.FLATTEN22 = nn.Flatten(start_dim=1, end_dim=-1)
        self.LINEAR = nn.Linear(in_features=128, out_features=2, bias=True)
        
    def forward(self, x):
        x = self.STAGE0_CONV(x)
        x = self.STAGE0_RELU(x)
        x = self.STAGE1_0_CONV(x)
        x = self.STAGE1_0_RELU(x)
        x = self.STAGE2_0_CONV(x)
        x = self.STAGE2_0_RELU(x)
        x = self.STAGE3_0_CONV(x)
        x = self.STAGE3_0_RELU(x)
        x = self.STAGE4_0_CONV(x)
        x = self.STAGE4_0_RELU(x)
        x = self.GAP21(x)
        x = self.FLATTEN22(x)
        x = self.LINEAR(x)
        
        return x
checkpoint = "E:/2_Quantization/deployment-with-CMSIS-NN/weights/mcu_vggrepc1_vww.pth"
model = MCU_VGGRepC1()
model = load_weights(model, checkpoint)
model = model.cpu()
recover_model = lambda : load_weights(model, checkpoint)
model

MCU_VGGRepC1(
  (STAGE0_CONV): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE0_RELU): ReLU()
  (STAGE1_0_CONV): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE1_0_RELU): ReLU()
  (STAGE2_0_CONV): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE2_0_RELU): ReLU()
  (STAGE3_0_CONV): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE3_0_RELU): ReLU()
  (STAGE4_0_CONV): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE4_0_RELU): ReLU()
  (GAP21): AdaptiveAvgPool2d(output_size=1)
  (FLATTEN22): Flatten(start_dim=1, end_dim=-1)
  (LINEAR): Linear(in_features=128, out_features=2, bias=True)
)

In [5]:
data_path = 'E:/1_TinyML/tiny/benchmark/training/visual_wake_words/vw_coco2014_96/'

train_batch_size = 50
eval_batch_size = 50

data_loader, data_loader_test = get_dataloader(dataset_dir=data_path, 
                                batch_size=train_batch_size, 
                                image_size=96, 
                                num_workers=4, 
                                shuffle=True)

criterion = nn.CrossEntropyLoss()
float_model = load_weights(model, checkpoint)

# 다음으로 "모듈 결합"을 합니다. 모듈 결합은 메모리 접근을 줄여 모델을 빠르게 만들면서
# 정확도 수치를 향상시킵니다. 모듈 결합은 어떠한 모델에라도 사용할 수 있지만,
# 양자화된 모델에 사용하는 것이 특히나 더 일반적입니다.



In [6]:
num_eval_batches = 1000

print("Size of baseline model")
print_size_of_model(float_model)

top1, top5 = evaluate(float_model, criterion, data_loader_test, neval_batches=num_eval_batches)
print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg))
# torch.jit.save(torch.jit.script(float_model), './weights/mcu_vggrepc1_jit.pth')

Size of baseline model
Size (MB): 0.403415
............................................................................................................................................................................................................................Evaluation accuracy on 50000 images, 86.31


In [14]:
num_calibration_batches = 32

from utils.dataloaders import get_dataloader
import torch
from torch import nn
checkpoint = "E:/2_Quantization/deployment-with-CMSIS-NN/weights/mcu_vggrepc1_vww.pth"
myModel = MCU_VGGRepC1()
# myModel = load_weights(model.cpu(), checkpoint)
myModel = myModel.cpu()
print(myModel)
# 양자화 설정 명시
# 간단한 min/max 범위 추정 및 텐서별 가중치 양자화로 시작
myModel.qconfig = torch.ao.quantization.default_qconfig
print(myModel.qconfig)
torch.ao.quantization.prepare(myModel, inplace=True)

# 첫 번째 보정
print('Post Training Quantization Prepare: Inserting Observers')

# 학습 세트로 보정
# evaluate(myModel, criterion, data_loader, neval_batches=num_calibration_batches)
print('Post Training Quantization: Calibration done')

# 양자화된 모델로 변환
torch.ao.quantization.convert(myModel.cpu(), inplace=True)
print('Post Training Quantization: Convert done')
print('\n Inverted Residual Block: After fusion and quantization, note fused modules: \n\n',myModel)

print("Size of model after quantization")
print_size_of_model(myModel)

# top1, top5 = evaluate(myModel.cpu(), criterion, data_loader_test, neval_batches=num_eval_batches)
print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg))

MCU_VGGRepC1(
  (STAGE0_CONV): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE0_RELU): ReLU()
  (STAGE1_0_CONV): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE1_0_RELU): ReLU()
  (STAGE2_0_CONV): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE2_0_RELU): ReLU()
  (STAGE3_0_CONV): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE3_0_RELU): ReLU()
  (STAGE4_0_CONV): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (STAGE4_0_RELU): ReLU()
  (GAP21): AdaptiveAvgPool2d(output_size=1)
  (FLATTEN22): Flatten(start_dim=1, end_dim=-1)
  (LINEAR): Linear(in_features=128, out_features=2, bias=True)
)
QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, quant_min=0, quant_max=127){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
Post Training



In [None]:
per_channel_quantized_model = load_model(saved_model_dir + float_model_file)
per_channel_quantized_model.eval()
per_channel_quantized_model.fuse_model()
# 이전의 'fbgemm' 또한 여전히 사용 가능하지만, 'x86'을 기본으로 사용하는 것을 권장합니다.
per_channel_quantized_model.qconfig = torch.ao.quantization.get_default_qconfig('x86')
print(per_channel_quantized_model.qconfig)

torch.ao.quantization.prepare(per_channel_quantized_model, inplace=True)
evaluate(per_channel_quantized_model,criterion, data_loader, num_calibration_batches)
torch.ao.quantization.convert(per_channel_quantized_model, inplace=True)
top1, top5 = evaluate(per_channel_quantized_model, criterion, data_loader_test, neval_batches=num_eval_batches)
print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg))
torch.jit.save(torch.jit.script(per_channel_quantized_model), saved_model_dir + scripted_quantized_model_file)

In [None]:
def train_one_epoch(model, criterion, optimizer, data_loader, device, ntrain_batches):
    model.train()
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    avgloss = AverageMeter('Loss', '1.5f')

    cnt = 0
    for image, target in data_loader:
        start_time = time.time()
        print('.', end = '')
        cnt += 1
        image, target = image.to(device), target.to(device)
        output = model(image)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        top1.update(acc1[0], image.size(0))
        top5.update(acc5[0], image.size(0))
        avgloss.update(loss, image.size(0))
        if cnt >= ntrain_batches:
            print('Loss', avgloss.avg)

            print('Training: * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
                  .format(top1=top1, top5=top5))
            return

    print('Full imagenet train set:  * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}'
          .format(top1=top1, top5=top5))
    return

In [None]:
qat_model = load_model(saved_model_dir + float_model_file)
qat_model.eval()
qat_model.fuse_model()

optimizer = torch.optim.SGD(qat_model.parameters(), lr = 0.0001)
# 이전의 'fbgemm' 또한 여전히 사용 가능하지만, 'x86'을 기본으로 사용하는 것을 권장합니다.
qat_model.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')

In [None]:
qat_model.train()
torch.ao.quantization.prepare_qat(qat_model, inplace=True)
print('Inverted Residual Block: After preparation for QAT, note fake-quantization modules \n',qat_model.features[1].conv)

In [None]:
num_train_batches = 20

# QAT는 시간이 걸리는 작업이며 몇 에폭에 걸쳐 훈련이 필요합니다.
# 학습 및 각 에폭 이후 정확도 확인
for nepoch in range(8):
    train_one_epoch(qat_model, criterion, optimizer, data_loader, torch.device('cpu'), num_train_batches)
    if nepoch > 3:
        # 양자화 파라미터 고정
        qat_model.apply(torch.ao.quantization.disable_observer)
    if nepoch > 2:
        # 배치 정규화 평균 및 분산 추정값 고정
        qat_model.apply(torch.nn.intrinsic.qat.freeze_bn_stats)

    # 각 에폭 이후 정확도 확인
    quantized_model = torch.ao.quantization.convert(qat_model.eval(), inplace=False)
    quantized_model.eval()
    top1, top5 = evaluate(quantized_model,criterion, data_loader_test, neval_batches=num_eval_batches)
    print('Epoch %d :Evaluation accuracy on %d images, %2.2f'%(nepoch, num_eval_batches * eval_batch_size, top1.avg))

In [None]:
def run_benchmark(model_file, img_loader):
    elapsed = 0
    model = torch.jit.load(model_file)
    model.eval()
    num_batches = 5
    # 이미지 배치들 이용하여 스크립트된 모델 실행
    for i, (images, target) in enumerate(img_loader):
        if i < num_batches:
            start = time.time()
            output = model(images)
            end = time.time()
            elapsed = elapsed + (end-start)
        else:
            break
    num_images = images.size()[0] * num_batches

    print('Elapsed time: %3.0f ms' % (elapsed/num_images*1000))
    return elapsed

run_benchmark(saved_model_dir + scripted_float_model_file, data_loader_test)

run_benchmark(saved_model_dir + scripted_quantized_model_file, data_loader_test)