참고


> https://github.com/ruihangdu/PyTorch-Deep-Compression



라이브러리 임포트

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import numpy as np
import os, math, time
import matplotlib.pyplot as plt
%matplotlib inline

3개의 컨볼루션 층, 1개의 완전 연결 층을 가진 ToyNet 정의 

In [None]:
class ToyNet(nn.Module):
  """Small CNN: three conv-BatchNorm-ReLU stages followed by one linear classifier.

  The classifier takes 8*8*64 features, i.e. the two stride-2 stages are
  expected to reduce a 32x32 input to 8x8 with 64 channels.
  """

  def __init__(self):
    super().__init__()
    self.layer1 = self._conv_bn_relu(3, 16, stride=1)
    self.layer2 = self._conv_bn_relu(16, 32, stride=2)
    self.layer3 = self._conv_bn_relu(32, 64, stride=2)
    self.fc = nn.Sequential(nn.Linear(8 * 8 * 64, 10))
    self._init_weights()

  @staticmethod
  def _conv_bn_relu(in_channels, out_channels, stride):
    """Build one 3x3 bias-free convolution followed by BatchNorm and ReLU."""
    conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                     kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Sequential(conv, nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True))

  def _init_weights(self):
    """Draw conv weights from N(0, sqrt(2 / (kh * kw * out_channels))); reset BatchNorm to identity."""
    for module in self.modules():
      if isinstance(module, nn.Conv2d):
        kernel_h, kernel_w = module.kernel_size
        fan_out = kernel_h * kernel_w * module.out_channels
        module.weight.data.normal_(0, math.sqrt(2. / fan_out))
      elif isinstance(module, nn.BatchNorm2d):
        module.weight.data.fill_(1)
        module.bias.data.zero_()

  def forward(self, x):
    """Return the 10 class scores for a batch ``x``."""
    features = self.layer1(x)         # 32x32
    features = self.layer2(features)  # 16x16
    features = self.layer3(features)  # 8x8

    flattened = features.view(x.size(0), -1)
    return self.fc(flattened)

컨볼루션층에 프루닝을 하기 위해 MaskConv2d를 정의. 

전방계산을 할 때 기존의 weight에다가 mask를 씌우는 형태로 프루닝을 진행

이후 위에서 정의한 ToyNet의 convolution층을 MaskConv2d로 바꿔줌

In [None]:
class MaskConv2d(nn.Conv2d):
  """Conv2d whose weights are pruned by a fixed binary magnitude mask.

  At construction a 0/1 mask is derived from the freshly initialised weights:
  an entry is kept (1) when ``|w| >= alpha * std(w)`` and pruned (0) otherwise.
  Every forward pass multiplies the weights by that mask in place, so pruned
  weights are really stored as zeros (the original values are not preserved).

  The mask is registered as a buffer rather than a parameter: it is never
  returned by ``named_parameters()`` (so optimizers do not see it), it follows
  the module through ``.to()`` / ``.cuda()`` / ``.double()``, and it is saved in
  ``state_dict()``. No CUDA device is required to construct the layer.

  Args:
    Same as ``torch.nn.Conv2d``.
  """

  def __init__(self, in_channels, out_channels, kernel_size, stride=1,
               padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros'):
    super(MaskConv2d, self).__init__(in_channels, out_channels, kernel_size, stride,
                                     padding, dilation, groups, bias, padding_mode)
    # Threshold scale: weights whose magnitude is below alpha * std(weight) are pruned.
    self.alpha = 0.2
    weight = self.weight.detach()
    keep = torch.ge(weight.abs(), self.alpha * weight.std())
    # Buffer => same device/dtype handling as the weights, no gradient, not optimized.
    self.register_buffer('mask', keep.to(weight.dtype))

  def forward(self, x):
    """Zero the pruned weights in place, then run the regular convolution on ``x``."""
    # Static pruning: write the masked values back into the parameter storage.
    # Going through ``.data`` keeps this outside autograd, as in the original code.
    self.weight.data.mul_(self.mask)
    return super(MaskConv2d, self)._conv_forward(x, self.weight, self.bias)


class MaskToyNet(nn.Module):
  """ToyNet-shaped CNN whose three convolutions are MaskConv2d (pruned) layers.

  Three MaskConv2d-BatchNorm-ReLU stages are followed by one linear classifier
  over 8*8*64 features. Conv weights are re-drawn from
  N(0, sqrt(2 / (kh * kw * out_channels))) and BatchNorm is reset to identity.
  """

  def __init__(self):
    super(MaskToyNet, self).__init__()
    self.layer1 = nn.Sequential(
        MaskConv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1, bias=False),
        nn.BatchNorm2d(16),
        nn.ReLU(inplace=True)
    )
    self.layer2 = nn.Sequential(
        MaskConv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, bias=False),
        nn.BatchNorm2d(32),
        nn.ReLU(inplace=True)
    )
    self.layer3 = nn.Sequential(
        MaskConv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1, bias=False),
        nn.BatchNorm2d(64),
        nn.ReLU(inplace=True)
    )
    self.fc = nn.Sequential(
        nn.Linear(8*8*64, 10)
    )
    for m in self.modules():
      if isinstance(m, nn.Conv2d):
        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
        m.weight.data.normal_(0, math.sqrt(2./n))
        if isinstance(m, MaskConv2d):
          # MaskConv2d builds its mask from the weights it had at construction.
          # Those weights were just overwritten, so rebuild the mask with the same
          # rule (|w| >= alpha * std(w)); otherwise it would prune entries
          # unrelated to the actual weight magnitudes.
          w = m.weight.data
          m.mask.copy_(torch.ge(w.abs(), m.alpha * w.std()))
      elif isinstance(m, nn.BatchNorm2d):
        m.weight.data.fill_(1)
        m.bias.data.zero_()

  def forward(self, x):
    """Return the 10 class scores for a batch ``x``."""
    out = self.layer1(x) #32x32
    out = self.layer2(out) #16x16
    out = self.layer3(out) #8x8

    out = out.view(x.size(0), -1)
    out = self.fc(out)

    return out

CIFAR-10 데이터를 받아오기 위해 드라이브 마운트.
이후 입력을 normalize하기 위해 CIFAR-10 이미지의 RGB mean과 std를 넣고 데이터 증강(DA) 진행

In [None]:
# Mount Google Drive at /content/drive (Colab only) so later cells can read
# and write the dataset directory stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls '/content/drive/MyDrive/Colab Notebooks/data'

cifar-10-batches-py  cifar-10-python.tar.gz


In [None]:
# Per-channel RGB mean / std of the CIFAR-10 training images (commonly cited
# values). The previous constants (0.485, 0.456, 0.406) / (0.229, 0.224, 0.225)
# are the ImageNet statistics, which do not match the dataset used here.
CIFAR10_MEAN = [0.4914, 0.4822, 0.4465]
CIFAR10_STD = [0.2470, 0.2435, 0.2616]

normalize = transforms.Normalize(mean=CIFAR10_MEAN, std=CIFAR10_STD)
# Training: random horizontal flip + random 32x32 crop from a 4-pixel padded image.
transform_train = transforms.Compose([transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), transforms.ToTensor(), normalize])
# Validation: no augmentation, only tensor conversion and normalization.
transform_valid = transforms.Compose([transforms.ToTensor(), normalize])

In [None]:
# Both CIFAR-10 splits live under the same directory on the mounted Drive;
# download=True fetches the archive only when it is not already there.
DATA_ROOT = '/content/drive/MyDrive/Colab Notebooks/data'
trainset = datasets.CIFAR10(root=DATA_ROOT, train=True, transform=transform_train, download=True)
validset = datasets.CIFAR10(root=DATA_ROOT, train=False, transform=transform_valid, download=True)

Files already downloaded and verified
Files already downloaded and verified


PyTorch의 DataLoader를 이용해 받아온 CIFAR-10 이미지를 미니배치 크기 256 단위로 나눠줌

In [None]:
# Mini-batches of 256 images. Only the training data is shuffled; the
# validation set is read in a fixed order so per-batch log lines are
# reproducible (epoch averages do not depend on the order).
trainloader = torch.utils.data.DataLoader(trainset, batch_size=256, shuffle=True)
validloader = torch.utils.data.DataLoader(validset, batch_size=256, shuffle=False)

In [None]:
class AverageMeter:
  """Keeps the most recent value of a metric together with its running weighted mean.

  Attributes:
    name: label printed in front of the values.
    fmt: format spec (including the leading ':') applied to ``val`` and ``avg``.
    val: last value passed to ``update``.
    sum: sum of ``val * n`` over all updates.
    count: sum of ``n`` over all updates.
    avg: ``sum / count``.
  """

  def __init__(self, name, fmt=':f'):
    self.name, self.fmt = name, fmt
    self.reset()

  def reset(self):
    """Clear all statistics back to zero."""
    self.val = self.avg = self.sum = self.count = 0

  def update(self, val, n=1):
    """Record ``val`` as the latest value, weighted as ``n`` samples."""
    self.val = val
    self.sum += val * n
    self.count += n
    self.avg = self.sum / self.count

  def __str__(self):
    # Produces e.g. "Loss 1.2345e+00 (1.3000e+00)": latest value, then running mean.
    template = ''.join(['{name} {val', self.fmt, '} ({avg', self.fmt, '})'])
    return template.format(**vars(self))

In [None]:
class ProgressMeter:
  """Prints one tab-separated progress line: prefix, "[batch/total]", then every meter."""

  def __init__(self, num_batches, *meters, prefix=""):
    self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
    self.meters = meters
    self.prefix = prefix

  def print(self, batch):
    """Write the progress line for batch index ``batch`` to stdout."""
    header = self.prefix + self.batch_fmtstr.format(batch)
    print('\t'.join([header, *map(str, self.meters)]))

  def _get_batch_fmtstr(self, num_batches):
    """Return a template such as "[{:3d}/196]" whose slot is as wide as the total."""
    width = len(str(num_batches // 1))
    slot = '{:%dd}' % width
    return '[{}/{}]'.format(slot, slot.format(num_batches))
    

In [None]:
def adjust_learning_rate(optimi, epoch, lr):
  """Set the learning rate of every parameter group of ``optimi`` to ``lr``.

  Args:
    optimi: object exposing ``param_groups``, an iterable of dicts (e.g. a
      torch optimizer).
    epoch: current epoch index. Accepted for call-site compatibility but not
      used: the former ``if epoch>=100: 0.02`` branch was a bare expression
      with no effect and has been removed.
      NOTE(review): if a schedule after epoch 100 was intended, it still has
      to be implemented here.
    lr: learning rate written into each group's ``'lr'`` entry.
  """
  for param_group in optimi.param_groups:
    param_group['lr'] = lr

In [None]:
def accuracy(output, target, topk=(1,)):
  """Compute top-k accuracy (in percent) for each k in ``topk``.

  Args:
    output: score tensor; the k highest entries along dim 1 are taken as the
      predictions of each row.
    target: tensor of true class indices, one per row of ``output``.
    topk: iterable of k values.

  Returns:
    A list with one 1-element float tensor per k, in the order of ``topk``.
  """
  with torch.no_grad():
    largest_k = max(topk)
    num_samples = target.size(0)

    # Indices of the best `largest_k` classes per sample, transposed so that
    # row r holds every sample's rank-r prediction.
    ranked = output.topk(largest_k, dim=1, largest=True, sorted=True)[1].t()
    hits = ranked.eq(target.view(1, -1).expand_as(ranked))

    return [
        hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(100.0 / num_samples)
        for k in topk
    ]

In [None]:
def train(train_loader, **kwargs):
  """Run one training epoch and return the epoch-average top-1 / top-5 accuracy.

  Args:
    train_loader: iterable with a length, yielding ``(input, target)`` batches.
    **kwargs: expects ``epoch`` (only used in the log prefix), ``model``,
      ``criterion`` and ``optimizer``.

  Returns:
    ``(top1.avg, top5.avg)``: the sample-weighted mean accuracies over the
    epoch, in whatever type ``accuracy()`` yields for a single batch.

  Batches are moved to the device of the model's first parameter, so the
  function works for a model on CUDA as well as on CPU.
  """
  epoch = kwargs.get('epoch')
  model = kwargs.get('model')
  criterion = kwargs.get('criterion')
  optimizer = kwargs.get('optimizer')

  batch_time = AverageMeter('Time', ':6.3f')
  data_time = AverageMeter('Data', ':6.3f')
  losses = AverageMeter('Loss', ':.4e')
  top1 = AverageMeter('Acc@1', ':6.2f')
  top5 = AverageMeter('Acc@5', ':6.2f')
  progress = ProgressMeter(len(train_loader), batch_time, data_time, losses, top1, top5, prefix="Epoch:[{}]".format(epoch))

  model.train()
  device = next(model.parameters()).device

  end = time.time()
  for i, (images, target) in enumerate(train_loader):
    # Time spent waiting for the loader since the previous batch finished.
    data_time.update(time.time() - end)
    images = images.to(device)
    target = target.to(device)

    output = model(images)
    loss = criterion(output, target)

    acc1, acc5 = accuracy(output, target, topk=(1,5))
    losses.update(loss.item(), images.size(0))
    top1.update(acc1[0], images.size(0))
    top5.update(acc5[0], images.size(0))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    batch_time.update(time.time() - end)

    if i % 100==0:
      progress.print(i)

    # Restart the stopwatch for the next batch; without this the Time/Data
    # meters would report time elapsed since the start of the epoch.
    end = time.time()

  print('====> Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))

  return top1.avg, top5.avg

In [None]:
def validate(val_loader, model, criterion):
  """Evaluate ``model`` on ``val_loader`` and return the average top-1 / top-5 accuracy.

  Args:
    val_loader: iterable with a length, yielding ``(input, target)`` batches.
    model: network to evaluate; it is switched to eval mode.
    criterion: loss function called as ``criterion(output, target)``.

  Returns:
    ``(top1.avg, top5.avg)``: the sample-weighted mean accuracies over the
    whole loader, in whatever type ``accuracy()`` yields for a single batch.

  No gradients are tracked. Batches are moved to the device of the model's
  first parameter instead of unconditionally to CUDA, and the deprecated
  ``Variable`` wrapper is no longer used.
  """
  batch_time = AverageMeter('Time', ':6.3f')
  losses = AverageMeter('Loss', ':.4e')
  top1 = AverageMeter('Acc@1', ':6.2f')
  top5 = AverageMeter('Acc@5', ':6.2f')
  progress = ProgressMeter(len(val_loader), batch_time, losses, top1, top5, prefix='Test: ')

  model.eval()
  device = next(model.parameters()).device

  with torch.no_grad():
    end = time.time()
    for i, (images, target) in enumerate(val_loader):
      images = images.to(device)
      target = target.to(device, non_blocking=True)

      output = model(images)
      loss = criterion(output, target)

      acc1, acc5 = accuracy(output, target, topk=(1,5))
      losses.update(loss.item(), images.size(0))
      top1.update(acc1[0], images.size(0))
      top5.update(acc5[0], images.size(0))

      batch_time.update(time.time() - end)

      if i % 100 == 0:
        progress.print(i)

      # Restart the per-batch stopwatch.
      end = time.time()

    print('====> Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))

  return top1.avg, top5.avg

마스킹을 진행한 parameter의 경우 optimizer로 weight를 update할 필요가 없으므로 
weight에 mask가 없는 parameter에 대해서만 update

In [None]:
# Pruned network on the GPU. The optimizer receives every named parameter
# whose name does not contain 'mask'.
maskedmodel = MaskToyNet().cuda()
trainable_params = [p for n, p in maskedmodel.named_parameters() if 'mask' not in n]
optimizer = optim.Adam(trainable_params)
criterion = nn.CrossEntropyLoss().cuda()

In [None]:
# Train the masked model for 5 epochs at a constant learning rate and keep
# the best validation accuracies seen so far.
lr = 1e-4
best_acc1 = best_acc5 = 0

for epoch in range(5):
  adjust_learning_rate(optimizer, epoch, lr)
  current_lr = optimizer.param_groups[0]['lr']
  print("Epoch : {}, lr : {}".format(epoch, current_lr))

  print('===> [ Training ]')
  acc1_train, acc5_train = train(
      trainloader,
      epoch=epoch,
      model=maskedmodel,
      criterion=criterion,
      optimizer=optimizer,
  )

  print('===> [ Validation ]')
  acc1_valid, acc5_valid = validate(validloader, maskedmodel, criterion)

  best_acc1 = max(acc1_valid, best_acc1)
  best_acc5 = max(acc5_valid, best_acc5)

Epoch : 0, lr : 0.0001
===> [ Training ]
Epoch:[0][  0/196]	Time  0.095 ( 0.095)	Data  0.087 ( 0.087)	Loss 2.3546e+00 (2.3546e+00)	Acc@1  12.11 ( 12.11)	Acc@5  50.39 ( 50.39)
Epoch:[0][100/196]	Time  7.786 ( 3.964)	Data  7.779 ( 3.957)	Loss 1.9269e+00 (2.0486e+00)	Acc@1  34.38 ( 26.12)	Acc@5  80.47 ( 75.05)
====> Acc@1 30.594 Acc@5 79.786
===> [ Validation ]
Test: [ 0/40]	Time  0.043 ( 0.043)	Loss 1.6702e+00 (1.6702e+00)	Acc@1  42.19 ( 42.19)	Acc@5  90.23 ( 90.23)
====> Acc@1 40.350 Acc@5 87.960
Epoch : 1, lr : 0.0001
===> [ Training ]
Epoch:[1][  0/196]	Time  0.091 ( 0.091)	Data  0.084 ( 0.084)	Loss 1.8217e+00 (1.8217e+00)	Acc@1  35.55 ( 35.55)	Acc@5  85.55 ( 85.55)
Epoch:[1][100/196]	Time  7.783 ( 3.935)	Data  7.776 ( 3.928)	Loss 1.7018e+00 (1.6919e+00)	Acc@1  39.06 ( 39.38)	Acc@5  88.28 ( 87.75)
====> Acc@1 40.294 Acc@5 88.490
===> [ Validation ]
Test: [ 0/40]	Time  0.046 ( 0.046)	Loss 1.5824e+00 (1.5824e+00)	Acc@1  44.92 ( 44.92)	Acc@5  91.80 ( 91.80)
====> Acc@1 45.750 Acc@5 91.35

In [None]:
### Find Threshold
# The threshold is the magnitude below which `pruning_ratio` of all conv
# weights fall (the value at that rank of the sorted absolute weights).
pruning_ratio = 0.2 # We can modify

# Start from an empty float64 array so the concatenated result is float64.
magnitude_chunks = [np.array([])]
for name, param in maskedmodel.named_parameters():
  is_conv_weight = param.dim() == 4 and 'mask' not in name  # only 4-D conv kernels
  if is_conv_weight:
    # flatten -> absolute value -> CPU -> numpy
    magnitude_chunks.append(param.data.view(-1).abs().cpu().numpy())
weights = np.concatenate(magnitude_chunks)

cut_index = int(weights.shape[0] * pruning_ratio)
threshold = np.sort(weights)[cut_index]
print(f'Threshold : {threshold} with pruning ratio : {pruning_ratio}')

Threshold : 0.01622624695301056 with pruning ratio : 0.2


fine-tuning

In [None]:
### Additional Training
# Two more epochs on the masked model with the same optimizer and learning
# rate; the best-accuracy trackers restart from zero for this phase.
lr = 1e-4
best_acc1 = best_acc5 = 0

for epoch in range(2):
  adjust_learning_rate(optimizer, epoch, lr)
  current_lr = optimizer.param_groups[0]['lr']
  print("Epoch : {}, lr : {}".format(epoch, current_lr))

  print('===> [ Training ]')
  acc1_train, acc5_train = train(
      trainloader,
      epoch=epoch,
      model=maskedmodel,
      criterion=criterion,
      optimizer=optimizer,
  )

  print('===> [ Validation ]')
  acc1_valid, acc5_valid = validate(validloader, maskedmodel, criterion)

  best_acc1 = max(acc1_valid, best_acc1)
  best_acc5 = max(acc5_valid, best_acc5)

Epoch : 0, lr : 0.0001
===> [ Training ]
Epoch:[0][  0/196]	Time  0.082 ( 0.082)	Data  0.075 ( 0.075)	Loss 1.4124e+00 (1.4124e+00)	Acc@1  48.44 ( 48.44)	Acc@5  94.14 ( 94.14)
Epoch:[0][100/196]	Time  7.795 ( 3.954)	Data  7.788 ( 3.947)	Loss 1.4283e+00 (1.4033e+00)	Acc@1  48.05 ( 49.97)	Acc@5  90.23 ( 92.62)
====> Acc@1 49.782 Acc@5 92.532
===> [ Validation ]
Test: [ 0/40]	Time  0.046 ( 0.046)	Loss 1.3688e+00 (1.3688e+00)	Acc@1  51.17 ( 51.17)	Acc@5  93.75 ( 93.75)
====> Acc@1 53.810 Acc@5 94.500
Epoch : 1, lr : 0.0001
===> [ Training ]
Epoch:[1][  0/196]	Time  0.077 ( 0.077)	Data  0.070 ( 0.070)	Loss 1.2666e+00 (1.2666e+00)	Acc@1  55.47 ( 55.47)	Acc@5  94.92 ( 94.92)
Epoch:[1][100/196]	Time  7.795 ( 3.957)	Data  7.788 ( 3.950)	Loss 1.3822e+00 (1.3825e+00)	Acc@1  48.05 ( 50.63)	Acc@5  91.80 ( 93.04)
====> Acc@1 50.976 Acc@5 93.200
===> [ Validation ]
Test: [ 0/40]	Time  0.045 ( 0.045)	Loss 1.2629e+00 (1.2629e+00)	Acc@1  56.25 ( 56.25)	Acc@5  94.53 ( 94.53)
====> Acc@1 54.430 Acc@5 94.70

In [None]:
# Baseline for comparison: the plain (unmasked) ToyNet on the GPU.
# Note: this rebinds the global `optimizer` and `criterion` names.
model = ToyNet().cuda()
optimizer = optim.Adam(model.parameters()) # every parameter of the baseline model is optimized
criterion = nn.CrossEntropyLoss().cuda()

기존 모델과 성능비교

In [None]:
# Train the baseline model for 7 epochs at a constant learning rate and keep
# the best validation accuracies seen so far.
lr = 1e-4
best_acc1 = best_acc5 = 0

for epoch in range(7):
  adjust_learning_rate(optimizer, epoch, lr)
  current_lr = optimizer.param_groups[0]['lr']
  print("Epoch : {}, lr : {}".format(epoch, current_lr))

  print('===> [ Training ]')
  acc1_train, acc5_train = train(
      trainloader,
      epoch=epoch,
      model=model,
      criterion=criterion,
      optimizer=optimizer,
  )

  print('===> [ Validation ]')
  acc1_valid, acc5_valid = validate(validloader, model, criterion)

  best_acc1 = max(acc1_valid, best_acc1)
  best_acc5 = max(acc5_valid, best_acc5)

Epoch : 0, lr : 0.0001
===> [ Training ]
Epoch:[0][  0/196]	Time  0.101 ( 0.101)	Data  0.094 ( 0.094)	Loss 2.3312e+00 (2.3312e+00)	Acc@1  16.41 ( 16.41)	Acc@5  51.95 ( 51.95)
Epoch:[0][100/196]	Time  8.935 ( 4.104)	Data  8.928 ( 4.096)	Loss 1.7952e+00 (1.9998e+00)	Acc@1  34.38 ( 27.39)	Acc@5  88.67 ( 77.74)
====> Acc@1 32.100 Acc@5 81.758
===> [ Validation ]
Test: [ 0/40]	Time  0.047 ( 0.047)	Loss 1.5969e+00 (1.5969e+00)	Acc@1  41.02 ( 41.02)	Acc@5  89.45 ( 89.45)
====> Acc@1 42.460 Acc@5 88.710
Epoch : 1, lr : 0.0001
===> [ Training ]
Epoch:[1][  0/196]	Time  0.079 ( 0.079)	Data  0.072 ( 0.072)	Loss 1.6972e+00 (1.6972e+00)	Acc@1  39.06 ( 39.06)	Acc@5  85.16 ( 85.16)
Epoch:[1][100/196]	Time  7.921 ( 4.010)	Data  7.914 ( 4.003)	Loss 1.6133e+00 (1.6621e+00)	Acc@1  46.48 ( 40.59)	Acc@5  88.28 ( 88.24)
====> Acc@1 41.668 Acc@5 88.946
===> [ Validation ]
Test: [ 0/40]	Time  0.044 ( 0.044)	Loss 1.5534e+00 (1.5534e+00)	Acc@1  41.02 ( 41.02)	Acc@5  89.45 ( 89.45)
====> Acc@1 45.150 Acc@5 91.32

In [None]:
# 파라미터 개수 : torch.count_nonzero