In [1]:
import argparse
import os
import time
import shutil

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms

from models.quant_layer import *
from models.VGG16_custom import *

In [2]:
# Pick the compute device once; later cells move data with `.to(device)`.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
# get_device_name() raises when no CUDA device exists, so only query it on GPU.
use_gpu, (torch.cuda.get_device_name() if use_gpu else "cpu")

(True, 'NVIDIA GeForce GTX 1080 Ti')

In [3]:
# Mini-batch size passed to both DataLoaders created later in the notebook.
batch_size = 256
# Name of the result/ sub-directory that holds this model's checkpoint.
model_name = "VGG16_custom1"
# VGG16_custom is presumably provided by the wildcard import of
# models.VGG16_custom above — confirm.
model = VGG16_custom()

In [4]:
# Path of the best checkpoint stored for `model_name`.
fdir = 'result/'+str(model_name)+'/model_best.pth.tar'
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load(fdir, map_location=device)
model.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [5]:
# Per-channel (R, G, B) statistics; each image becomes (image - mean) / std.
normalize = transforms.Normalize(mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])

# Training images: random crop + horizontal flip, then tensor conversion and normalisation.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Test images: tensor conversion and normalisation only.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    normalize,
])

train_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform)

test_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transform)

# Only the training loader shuffles.
trainloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


In [6]:
# Log about four times per pass over the test set.
# Integer division keeps `i % print_freq == 0` meaningful when the batch count
# is not a multiple of 4 (a fractional frequency would match almost no batch
# index), and max(..., 1) avoids a modulo-by-zero for loaders with < 4 batches.
print_freq = max(len(testloader) // 4, 1)
print(print_freq)

10.0


In [7]:
def train(trainloader, model, criterion, optimizer, epoch):
    """Train ``model`` for one pass over ``trainloader``, printing progress.

    Args:
        trainloader: iterable yielding ``(inputs, targets)`` batches.
        model: network to optimise; put into training mode here.
        criterion: loss callable applied to ``(output, target)``.
        optimizer: optimiser whose ``zero_grad``/``step`` run once per batch.
        epoch: epoch index, used only in the progress line.

    Notes:
        Batches are moved with ``.cuda()``, so a CUDA device is required.
        The logging cadence is the notebook-level global ``print_freq``.
        Nothing is returned.
    """
    progress_fmt = ('Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec {top1.val:.3f}% ({top1.avg:.3f}%)')

    batch_time, data_time = AverageMeter(), AverageMeter()
    losses, top1 = AverageMeter(), AverageMeter()

    model.train()

    tick = time.time()
    for step, (images, labels) in enumerate(trainloader):
        # Time spent waiting for the loader to hand over this batch.
        data_time.update(time.time() - tick)

        images = images.cuda()
        labels = labels.cuda()
        n_samples = images.size(0)

        # Forward pass and loss.
        output = model(images)
        loss = criterion(output, labels)

        # Running loss / top-1 precision, weighted by batch size.
        prec = accuracy(output, labels)[0]
        losses.update(loss.item(), n_samples)
        top1.update(prec.item(), n_samples)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Wall-clock time for the whole iteration.
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % print_freq == 0:
            print(progress_fmt.format(
                epoch, step, len(trainloader), batch_time=batch_time,
                data_time=data_time, loss=losses, top1=top1))
            
def validate(val_loader, model, criterion ):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 precision.

    Args:
        val_loader: iterable yielding ``(inputs, targets)`` batches.
        model: network to evaluate; put into eval mode here.
        criterion: loss callable applied to ``(output, target)``.

    Returns:
        The sample-weighted mean top-1 precision (percent) over all batches.

    Notes:
        Batches are moved with ``.cuda()``; gradients are disabled.
        The logging cadence is the notebook-level global ``print_freq``.
    """
    progress_fmt = ('Test: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec {top1.val:.3f}% ({top1.avg:.3f}%)')

    batch_time = AverageMeter()
    losses, top1 = AverageMeter(), AverageMeter()

    # Evaluation mode.
    model.eval()

    tick = time.time()
    with torch.no_grad():
        for step, (images, labels) in enumerate(val_loader):
            images = images.cuda()
            labels = labels.cuda()
            n_samples = images.size(0)

            # Forward pass and loss.
            output = model(images)
            loss = criterion(output, labels)

            # Running loss / top-1 precision, weighted by batch size.
            prec = accuracy(output, labels)[0]
            losses.update(loss.item(), n_samples)
            top1.update(prec.item(), n_samples)

            # Wall-clock time for this iteration.
            batch_time.update(time.time() - tick)
            tick = time.time()

            # Print a status line every `print_freq` batches.
            if step % print_freq == 0:
                print(progress_fmt.format(
                    step, len(val_loader), batch_time=batch_time, loss=losses,
                    top1=top1))

    print(' * Prec {top1.avg:.3f}% '.format(top1=top1))
    return top1.avg

def save_checkpoint(state, is_best, fdir):
    """Serialise ``state`` to ``<fdir>/checkpoint.pth``.

    Args:
        state: object handed to ``torch.save`` (e.g. a dict with a state_dict).
        is_best: when true, the checkpoint is also copied to
            ``<fdir>/model_best.pth.tar``.
        fdir: target directory; created (with parents) if it does not exist.
    """
    # torch.save does not create missing directories. An empty fdir means the
    # current directory, which os.makedirs rejects, so skip it in that case.
    if fdir:
        os.makedirs(fdir, exist_ok=True)
    filepath = os.path.join(fdir, 'checkpoint.pth')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(fdir, 'model_best.pth.tar'))

In [8]:
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    Args:
        output: score tensor indexed as (sample, class); the top-k search runs
            along dimension 1.
        target: tensor holding one true class index per sample.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one 0-dim tensor per k: the percentage of samples whose
        true class is among the k highest-scoring classes.
    """
    maxk = max(topk)
    num_samples = target.size(0)

    # topk(k, dim, largest, sorted) returns (values, indices); keep the indices.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()           # -> [maxk, num_samples]
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # reshape instead of view: the first k rows of the transposed result
        # are not guaranteed to be contiguous, and view(-1) raises for k > 1.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / num_samples))
    return res


In [9]:
class AverageMeter(object):
    """Keeps the latest value of a metric together with its weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, mean, weighted sum and sample count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        contribution = val * n    # n weights the observation
        self.count += n
        self.sum += contribution
        self.val = val
        self.avg = self.sum / self.count

In [10]:
model

VGG_quant(
  (features): Sequential(
    (0): QuantConv2d(
      3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
      (weight_quant): weight_quantize_fn()
    )
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): QuantConv2d(
      64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
      (weight_quant): weight_quantize_fn()
    )
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): QuantConv2d(
      64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
      (weight_quant): weight_quantize_fn()
    )
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): QuantConv2d(
      128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 

In [11]:
# Baseline: top-1 accuracy of the loaded model on the test set, before pruning.
criterion = nn.CrossEntropyLoss().to(device)

# Use `device` for the model as well, matching how the batches are moved below.
model.to(device)
model.eval()

correct = 0

with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()

# No loss is accumulated in this cell; only the accuracy is reported.
print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(testloader.dataset),
        100. * correct / len(testloader.dataset)))



Test set: Accuracy: 9157/10000 (92%)



In [12]:
#### Structured pruning: remove 80% of the filters of every QuantConv2d layer.
import torch.nn.utils.prune as prune

for name, layer in model.named_modules():
    if not isinstance(layer, QuantConv2d):
        continue

    # Pruning an already-pruned parameter combines the new mask with the old
    # one, so re-running this cell would prune 80% of what is left. Skip layers
    # that already carry a pruning hook to keep the cell idempotent.
    if prune.is_pruned(layer):
        print(f"Already pruned, skipping: {name}")
        continue

    # dim=0 drops whole slices along the first weight dimension;
    # n=1 ranks them by L1 norm (n=2 would use L2).
    # Other ratios to try: amount=0.5, amount=0.3.
    prune.ln_structured(layer, name='weight', amount=0.8, dim=0, n=1)

    print(f"Pruned QuantConv2d layer: {name}")


Pruned QuantConv2d layer: features.0
Pruned QuantConv2d layer: features.3
Pruned QuantConv2d layer: features.7
Pruned QuantConv2d layer: features.10
Pruned QuantConv2d layer: features.14
Pruned QuantConv2d layer: features.17
Pruned QuantConv2d layer: features.20
Pruned QuantConv2d layer: features.24
Pruned QuantConv2d layer: features.27
Pruned QuantConv2d layer: features.30
Pruned QuantConv2d layer: features.34
Pruned QuantConv2d layer: features.37
Pruned QuantConv2d layer: features.39


In [None]:
# Unstructured (per-weight, L1-magnitude) pruning.
# Use EITHER this cell or the structured-pruning cell, never both.

import torch.nn.utils.prune as prune

# Fraction of weights to zero in each QuantConv2d layer (e.g. 0.8, 0.5 or 0.3).
# Leave as None to keep this cell disabled; the cell then prunes nothing and
# prints nothing, instead of reporting layers as "Pruned" when they were not.
unstructured_amount = None

if unstructured_amount is not None:
    for name, layer in model.named_modules():
        if isinstance(layer, QuantConv2d):
            prune.l1_unstructured(layer, name='weight', amount=unstructured_amount)
            print(f"Pruned QuantConv2d layer: {name}")

In [13]:
### Check sparsity ###
# Fraction of zero entries in the pruning mask attached to features[30].
mask = model.features[30].weight_mask
sparsity_mask = mask.eq(0).sum() / mask.numel()

# Alternative: count zeros in the weight tensor itself instead of the mask.
# weight = model.features[30].weight
# sparsity_mask = (weight == 0).sum() / weight.nelement()

print("Sparsity level: ", sparsity_mask)

Sparsity level:  tensor(0.8008, device='cuda:0')


In [None]:
## check accuracy after pruning, but before finetuning

# Use `device` for the model as well, matching how the batches are moved below.
model.to(device)
model.eval()

correct = 0

with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()

# No loss is accumulated in this cell; only the accuracy is reported.
print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(testloader.dataset),
        100. * correct / len(testloader.dataset)))


In [14]:
# 1. Version of Training, does not save the checkpoint
# Fine-tunes the model with plain SGD and reports the mean training loss per epoch.

# Model and loss follow `device`, the same target the batches are moved to below.
model.to(device)
n_epochs = 50
lr = 0.05
weight_decay = 1e-4
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
model.train() # prep model for training

# Each epoch consumes the whole training set once.
for epoch in range(n_epochs):
    # Sum of per-sample losses for this epoch.
    train_loss = 0.0

    for data, target in trainloader:
        data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # criterion returns a batch mean; scale by batch size before summing
        train_loss += loss.item()*data.size(0)

    # average loss per sample over the epoch
    train_loss = train_loss/len(trainloader.dataset)

    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))

Epoch: 1 	Training Loss: 2.140607
Epoch: 2 	Training Loss: 1.688480
Epoch: 3 	Training Loss: 1.371289
Epoch: 4 	Training Loss: 1.154713
Epoch: 5 	Training Loss: 1.023836
Epoch: 6 	Training Loss: 0.947824
Epoch: 7 	Training Loss: 0.891423
Epoch: 8 	Training Loss: 0.857185
Epoch: 9 	Training Loss: 0.837516
Epoch: 10 	Training Loss: 0.810638
Epoch: 11 	Training Loss: 0.790641
Epoch: 12 	Training Loss: 0.772068
Epoch: 13 	Training Loss: 0.755202
Epoch: 14 	Training Loss: 0.745211
Epoch: 15 	Training Loss: 0.728888
Epoch: 16 	Training Loss: 0.726871
Epoch: 17 	Training Loss: 0.715367
Epoch: 18 	Training Loss: 0.706288
Epoch: 19 	Training Loss: 0.703068
Epoch: 20 	Training Loss: 0.690103
Epoch: 21 	Training Loss: 0.685171
Epoch: 22 	Training Loss: 0.685071
Epoch: 23 	Training Loss: 0.680264
Epoch: 24 	Training Loss: 0.677284
Epoch: 25 	Training Loss: 0.675615
Epoch: 26 	Training Loss: 0.667237
Epoch: 27 	Training Loss: 0.665243
Epoch: 28 	Training Loss: 0.662890
Epoch: 29 	Training Loss: 0.6

In [35]:
## check your accuracy again after finetuning

model.eval()

correct = 0

with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()

# No loss is accumulated in this cell; only the accuracy is reported.
print('\nTest set: Accuracy: {}/{} ({:.2f}%)\n'.format(
        correct, len(testloader.dataset),
        100. * correct / len(testloader.dataset)))

Test set: Accuracy: 8231/10000 (82.31%)


In [16]:
class SaveOutput:
    """Callable that records what a module receives.

    Its call signature ``(module, module_in)`` matches a forward pre-hook;
    every call appends ``module_in`` to ``self.outputs``.
    """

    def __init__(self):
        # Recorded inputs, in call order.
        self.outputs = list()

    def __call__(self, module, module_in):
        # `module` is ignored; only the incoming arguments are kept.
        self.outputs.append(module_in)

    def clear(self):
        """Drop everything recorded so far (rebinds to a fresh list)."""
        self.outputs = list()

save_output = SaveOutput()
device = torch.device("cuda" if use_gpu else "cpu")

# Attach a forward pre-hook to every QuantConv2d so that a single forward pass
# records the input of each such layer, in execution order.
# The handles are kept so the hooks can be removed afterwards; otherwise each
# re-run of this cell would stack another set of hooks on the model.
hook_handles = []
count = 0
for i, layer in enumerate(model.modules(), start=1):
    if isinstance(layer, QuantConv2d):
        print(i,"-th layer prehooked")
        hook_handles.append(layer.register_forward_pre_hook(save_output))
        count = count + 1

# One training batch is enough to capture the layer inputs.
dataiter = iter(trainloader)
images, labels = next(dataiter)
images = images.to(device)
try:
    out = model(images)
finally:
    # Recorded tensors stay in save_output.outputs; only the hooks are detached.
    for handle in hook_handles:
        handle.remove()

print(count)

3 -th layer prehooked
7 -th layer prehooked
12 -th layer prehooked
16 -th layer prehooked
21 -th layer prehooked
25 -th layer prehooked
29 -th layer prehooked
34 -th layer prehooked
38 -th layer prehooked
42 -th layer prehooked
47 -th layer prehooked
51 -th layer prehooked
54 -th layer prehooked
13


In [17]:
# save_output.outputs holds one entry per hooked call, in call order; each entry
# is the argument tuple of that layer, so [k][0] is the input tensor.
# The input recorded for entry 12 is used here as the "output" of entry 11 —
# presumably after whatever sits between the two layers; verify against the model.
layer_input = save_output.outputs[11][0]
layer_output = save_output.outputs[12][0]
layer_input.size(), layer_output.size()

(torch.Size([256, 8, 2, 2]), torch.Size([256, 8, 2, 2]))

In [18]:
# Keep only the first sample of the batch.
# Index straight from the recorded hook data rather than from `layer_input`
# itself, so re-running this cell does not slice the tensors a second time.
layer_input = save_output.outputs[11][0][0]
layer_output = save_output.outputs[12][0][0]
layer_input.size(), layer_output.size()

(torch.Size([8, 2, 2]), torch.Size([8, 2, 2]))

In [19]:
# Inspect features[37], the layer whose data is exported by the cells below.

layer = model.features[37]
print(layer)

# Names of the parameters registered directly on the layer.
print(layer._parameters.keys())

# Parameters of the weight quantizer sub-module.
print(layer.weight_quant._parameters)

QuantConv2d(
  8, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
  (weight_quant): weight_quantize_fn()
)
odict_keys(['bias', 'act_alpha', 'weight_q', 'weight_orig'])
OrderedDict([('wgt_alpha', Parameter containing:
tensor(0.4924, device='cuda:0', requires_grad=True))])


In [20]:
# Bit-width used for the integer conversion below.
bw = 4
weight_q = layer.weight_q
w_alpha = layer.weight_quant.wgt_alpha
# Step size: alpha spread over 2**(bw-1) - 1 levels
# (presumably a signed weight grid — confirm against quant_layer).
w_delta = w_alpha / (2**(bw-1)-1)
# Dividing by the step size expresses each weight as a multiple of that step.
w_int = weight_q / w_delta

print(w_int.shape)
print(w_int)

torch.Size([8, 8, 3, 3])
tensor([[[[-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000]],

         [[-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000]],

         [[-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000]],

         [[-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000]],

         [[-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000]],

         [[-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000]],

         [[-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000]],

         [[-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000],
          [-2.0000, -2.0000, -2.0000]]],


        [[[ 7.0000,  

In [21]:
# Express the input of features[37] as multiples of the activation step size.
x = layer_input
x_alpha = model.features[37].act_alpha
# Step size: alpha spread over 2**bw - 1 levels.
x_delta = x_alpha / (2**(bw)-1)

# act_quantization presumably comes from the wildcard import of
# models.quant_layer and returns a callable quantizer — confirm.
act_quant_fn = act_quantization(bw)
x_q = act_quant_fn(x, x_alpha)

x_int = x_q / x_delta

print(x_int.shape)
print(x_int)

torch.Size([8, 2, 2])
tensor([[[ 0.0000,  0.0000],
         [ 0.0000,  0.0000]],

        [[ 3.0000, 13.0000],
         [ 0.0000,  0.0000]],

        [[ 0.0000,  0.0000],
         [ 0.0000,  0.0000]],

        [[ 0.0000,  0.0000],
         [ 0.0000,  0.0000]],

        [[ 4.0000,  4.0000],
         [ 4.0000,  4.0000]],

        [[ 1.0000,  1.0000],
         [ 1.0000,  1.0000]],

        [[15.0000,  0.0000],
         [ 4.0000,  2.0000]],

        [[ 0.0000,  0.0000],
         [ 0.0000,  0.0000]]], device='cuda:0', grad_fn=<DivBackward0>)


In [22]:
# Re-run the layer as a plain convolution on the integer-valued tensors.
conv_int = torch.nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3, padding=1, bias=False)
# Replace the randomly initialised weight with the integer-valued weights.
conv_int.weight = torch.nn.parameter.Parameter(w_int)
output_int = F.relu(conv_int(x_int))
output_recovered = output_int * w_delta * x_delta  # recover with x_delta and w_delta

print(output_recovered.shape) 
print(layer_output.shape)
print(output_int)

torch.Size([8, 2, 2])
torch.Size([8, 2, 2])
tensor([[[  0.0000,   0.0000],
         [  0.0000,   0.0000]],

        [[  7.0000, 259.0000],
         [147.0000, 287.0000]],

        [[  0.0000,   0.0000],
         [  0.0000,   0.0000]],

        [[ 91.0000,   0.0000],
         [286.9999, 273.0000]],

        [[  0.0000,   0.0000],
         [  0.0000,   0.0000]],

        [[  0.0000,   0.0000],
         [  0.0000,   0.0000]],

        [[  0.0000,   0.0000],
         [  0.0000,   0.0000]],

        [[  0.0000,   0.0000],
         [  0.0000,   0.0000]]], device='cuda:0', grad_fn=<ReluBackward0>)


In [23]:
# Mean absolute difference between the recorded layer output and the output
# recovered from the integer convolution; it should be less than 1e-03.
diff = abs(layer_output - output_recovered)
print(diff.mean())

tensor(1.7229e-07, device='cuda:0', grad_fn=<MeanBackward0>)


In [24]:
print(x_int.size())

torch.Size([8, 2, 2])


In [25]:
# Zero-pad the activation map by one element on each side of its last two
# dimensions. F.pad takes size and device from x_int, so nothing is hard-coded
# to an (8, 4, 4) CUDA tensor.
x_pad = F.pad(x_int, (1, 1, 1, 1))

# Flatten the padded map: one row per channel, one column per padded position.
X = torch.reshape(x_pad, (x_pad.size(0), -1))

print(X.size())

torch.Size([8, 16])


In [26]:
from pathlib import Path
# Define the folder path
folder_path = Path('./prun_vgg_output/')

# Create the folder if it doesn't exist
folder_path.mkdir(parents=True, exist_ok=True)


In [27]:
### store activations ###

bit_precision = 4
n_act_rows = X.size(0)

# `with` closes the file even if a write fails part-way through.
with open('./prun_vgg_output/activation.txt', 'w') as file:
    file.write('#time0row7[msb-lsb],time0row6[msb-lst],....,time0row0[msb-lst]#\n')
    file.write('#time1row7[msb-lsb],time1row6[msb-lst],....,time1row0[msb-lst]#\n')
    file.write('#................#\n')

    for i in range(X.size(1)):  # time step
        for j in range(n_act_rows):  # row, highest index first
            X_bin = '{0:04b}'.format(round(X[n_act_rows-1-j,i].item()))
            # Emit the first bit_precision characters of the binary string.
            file.write(X_bin[:bit_precision])
            #file.write(' ')  # use this line for visibility with blank between words
        file.write('\n')


In [28]:
print(w_int.size())
# Merge the trailing kernel dimensions into one axis, keeping the first two.
W = torch.reshape(w_int, (w_int.size(0), w_int.size(1), -1))
W.size()

torch.Size([8, 8, 3, 3])


torch.Size([8, 8, 9])

In [29]:
### storing weight data ###

bit_precision = 4
# Masking a Python int with 2**bits - 1 yields its two's-complement bit pattern.
w_twos_mask = (1 << bit_precision) - 1
n_w_rows = W.size(1)

# `with` closes the file even if a write fails part-way through.
with open('./prun_vgg_output/weight.txt', 'w') as file:
    file.write('#col0row7[msb-lsb],col0row6[msb-lsb],....,col0row0[msb-lsb]#\n')
    file.write('#col1row7[msb-lsb],col1row6[msb-lsb],....,col1row0[msb-lsb]#\n')
    file.write('#................#\n')

    for kij in range(W.size(2)):
        for i in range(W.size(0)):   #col
            for j in range(n_w_rows):    # row, highest index first
                # Round BEFORE testing the sign: a tiny negative float such as
                # -1e-6 would otherwise become round(2**bits - 1e-6) = 2**bits,
                # one bit too wide, and be written as the most negative code.
                w_val = round(W[i, n_w_rows-1-j, kij].item())
                W_bin = '{0:0{1}b}'.format(w_val & w_twos_mask, bit_precision)
                file.write(W_bin)
                #file.write(' ')  # for visibility with blank between words, you can use
            file.write('\n')

In [30]:
print(output_int.size())
# Flatten everything after the first dimension: one row per channel.
O = torch.reshape(output_int, (output_int.size(0), -1))
print(O.size())

torch.Size([8, 2, 2])
torch.Size([8, 4])


In [31]:
### Store output data ###

bit_precision = 16
# Masking a Python int with 2**bits - 1 yields its two's-complement bit pattern.
o_twos_mask = (1 << bit_precision) - 1
n_o_cols = O.size(0)

# `with` closes the file even if a write fails part-way through.
with open('./prun_vgg_output/output.txt', 'w') as file:
    file.write('#time0col7[msb-lsb],time0col6[msb-lsb],....,time0col0[msb-lsb]#\n')
    file.write('#time1col7[msb-lsb],time1col6[msb-lsb],....,time1col0[msb-lsb]#\n')
    file.write('#................#\n')

    for i in range(O.size(1)):
        for j in range(n_o_cols):  # highest index first
            # Round BEFORE testing the sign: a tiny negative float would
            # otherwise round up to 2**bits, one bit too wide, and be written
            # as the most negative code.
            o_val = round(O[n_o_cols-1-j,i].item())
            O_bin = '{0:0{1}b}'.format(o_val & o_twos_mask, bit_precision)
            file.write(O_bin)
            #file.write(' ')  # for visibility with blank between words, you can use
        file.write('\n')

In [32]:
print(X.size())

torch.Size([8, 16])


In [33]:
# Partial sums indexed as [output column, padded input position p_nij, kernel tap kij].
# Sizes and device come from W and X instead of hard-coded 8 / 16 / 9 and .cuda().
psum = torch.zeros(W.size(0), X.size(1), W.size(2), device=X.device)
print(psum.size())

# calculate psum value
for kij in range(W.size(2)):
    for p_nij in range(X.size(1)):
        # F.linear(x, w) computes w @ x — the same result as a bias-free
        # nn.Linear carrying this weight, without building and randomly
        # initialising a throwaway module on every iteration.
        psum[:, p_nij, kij] = F.linear(X[:, p_nij], W[:, :, kij])

torch.Size([8, 16, 9])


In [34]:
### Store psum data ###

bit_precision = 16
# Masking a Python int with 2**bits - 1 yields its two's-complement bit pattern.
psum_twos_mask = (1 << bit_precision) - 1
n_psum_cols = psum.size(0)

# `with` closes the file even if a write fails part-way through.
with open('./prun_vgg_output/psum.txt', 'w') as file:
    file.write('#time0col7[msb-lsb],time0col6[msb-lsb],....,time0col0[msb-lsb]#\n')
    file.write('#time1col7[msb-lsb],time1col6[msb-lsb],....,time1col0[msb-lsb]#\n')
    file.write('#................#\n')

    for kij in range(psum.size(2)):
        for i in range(psum.size(1)):  # time step
            for j in range(n_psum_cols): # array size, highest index first
                # Round BEFORE testing the sign: partial sums are float
                # accumulations, and a value like -1e-5 would otherwise round
                # up to 2**bits, one bit too wide, and be written as the most
                # negative code.
                p_val = round(psum[n_psum_cols-1-j,i, kij].item())
                psum_bin = '{0:0{1}b}'.format(p_val & psum_twos_mask, bit_precision)
                file.write(psum_bin)
                #file.write(' ')  # for visibility with blank between words, you can use
            file.write('\n')