In [3]:
### Import Libraries
import numpy as np
import torch.nn as nn
import torch
from torch.autograd.variable import Variable
import torchvision
from torchvision import datasets as dset
from torchvision import models
from torchvision import transforms
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
import timeit
import os
from PIL import Image
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [4]:
import copy
import math
gpu_dtype = torch.cuda.FloatTensor

In [5]:
### Retrieve dataset
class ChunkSampler(sampler.Sampler):
    """Samples elements sequentially from some offset. 
    Arguments:
        num_samples: # of desired datapoints
        start: offset where we should start selecting from
    """
    def __init__(self, num_samples, start = 0):
        self.num_samples = num_samples
        self.start = start

    def __iter__(self):
        return iter(range(self.start, self.start + self.num_samples))

    def __len__(self):
        return self.num_samples
    
NUM_TRAIN = 10000
NUM_VAL = 1000
batch_size = 256

# transform = transforms.Compose(
#     [transforms.ToTensor(),
#      transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
# trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
#                                         download=True, transform=transform)
# testset = torchvision.datasets.CIFAR10(root='./data', train=False,
#                                        download=True, transform=transform)

transform = transforms.Compose([transforms.ToTensor(),
  transforms.Normalize((0.5,), (0.5,))
])
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)


# trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
#                                           sampler=ChunkSampler(NUM_TRAIN, 0))
# testloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
#                                           sampler=ChunkSampler(NUM_VAL, 0))

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size)

classes = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')

In [7]:
def evaluation(model, dataloader):
    ### Compute Accuracy
    correct = 0
    total = 0
    for data in dataloader:
        images, labels = data
        images = images.type(gpu_dtype)
        labels = labels.type(gpu_dtype).long()
        outputs = model(Variable(images))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    acc = 100 * correct / total 
    return acc, total

In [8]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square conv kernel
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        self.fc1 = nn.Linear(256, 120)  # 5x5 image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, int(x.nelement() / x.shape[0]))
        # print(x.shape)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

best_model = LeNet().to(device=device)

In [14]:
print(best_model)

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=256, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [15]:
num_param = count_parameters(best_model)
print(num_param)

44426


In [12]:
### Training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(org_model.parameters(), lr=0.001)
cur_acc = 0.0
running_loss = 0.0
epochs = 50
best_model = org_model
for epoch in range(epochs):
    for i, data in enumerate(trainloader, 0): #i is a counter, start from 0, the tuple (i,data) 
                                          #is produced
        # get the inputs
        inputs, labels = data
        inputs_gpu = inputs.type(gpu_dtype)
        labels_gpu = labels.type(gpu_dtype).long()

        # wrap them in Variable
        inputs, labels = Variable(inputs_gpu), Variable(labels_gpu)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = org_model(inputs) # Forward -> score
        loss = criterion(outputs, labels) # Forward -> loss
        loss.backward() # Backward generate gradients
        optimizer.step() # Update Parameters

        # print statistics
        running_loss += loss.item()
        # break
    if (epoch+1) % 10 == 0:
        acc, total_num = evaluation(org_model, trainloader)
        print('epoch: {}, train accuracy: {:.2f}%, loss: {:.5f}'.format(epoch+1, acc, running_loss/total_num))
        if acc > cur_acc:
            best_model = copy.deepcopy(org_model).to(device)
    running_loss = 0

print("Finish Training")

In [1]:
# best_model = torch.load('original_lenet_model-0.3')
# best_model.eval()

In [31]:
acc, _ = evaluation(best_model, testloader)
print(acc)

tensor(98.6800, device='cuda:0')


In [18]:
# torch.save(best_model, 'original_lenet_model-0.3')

In [None]:
# for child in model.children():
#     print(child)

In [None]:
conv_counter = 0
linear_counter = 0
for child in model.children():
    # print(child)
    if "Conv2d" in str(child): #check if it is a conv layer
        conv_counter += 1
    else:
        linear_counter += 1
print('num of conv layer: {}'.format(conv_counter))
print('num of linear layer: {}'.format(linear_counter))

In [32]:
model = copy.deepcopy(best_model).to(device)

In [33]:
acc, _ = evaluation(model, testloader)
print(acc)

tensor(98.6800, device='cuda:0')


In [46]:
cnn_index_dict4d = dict([("conv1", []),("conv2", [])])
cnn_index_dict1d = dict([("conv1", []),("conv2", [])])
linear_index_dict2d = dict([("linear1", []),("linear2", []),("linear3", [])])
linear_index_dict1d = dict([("linear1", []),("linear2", []),("linear3", [])])
conv_counter = 0
linear_counter = 0
magnitude = 0.1
# strategy = 

In [47]:
# Freeze the parameters that absulute value is less than a threshold ( defined magnitude)
# Freeze mean set the weight to 0
pruned_counter = 0
for child in model.children():
        if "Conv2d" in str(child): #check if it is a conv layer
            conv_counter += 1
            #print("total parameters:",len(list(children_of_child.parameters())))
            for param in child.parameters():
                #print(type(param.data[0,0,0,0]))
                # convolutional matrix
                if len(param.data.size()) == 4:
                    #Loop through all the entries
                    for i in range(param.data.size()[0]):
                        for j in range(param.data.size()[1]):
                            for k in range(param.data.size()[2]):
                                for l in range(param.data.size()[3]):
                                    if abs(param.data[i,j,k,l]) < magnitude:
                                        param.data[i,j,k,l] = 0
                                        pruned_counter += 1
                                        index_name = "conv" + str(conv_counter)
                                        cnn_index_dict4d[index_name].append([i,j,k,l])
            else:
                # bias
                for i in range(param.data.size()[0]):
                    if abs(param.data[i]) < magnitude:
                        param.data[i] = 0
                        pruned_counter += 1
                        index_name = "conv" + str(conv_counter)
                        cnn_index_dict1d[index_name].append(i)
        else:
            linear_counter += 1
            for param in child.parameters():
                if len(param.data.size()) == 2:
                    for i in range(param.data.size()[0]):
                        for j in range(param.data.size()[1]):
                            if abs(param.data[i,j]) < magnitude:
                                param.data[i,j] = 0
                                pruned_counter += 1
                                index_name = "linear" + str(linear_counter)
                                linear_index_dict2d[index_name].append([i,j])
                else:
                    for i in range(param.data.size()[0]):
                        if abs(param.data[i]) < magnitude:
                            param.data[i] = 0
                            pruned_counter += 1
                            index_name = "linear" + str(linear_counter)
                            linear_index_dict1d[index_name].append(i)

In [48]:
print('prune counter: {}'.format(pruned_counter))

prune counter: 31657


In [51]:
acc, _ = evaluation(model, testloader)
print('Accuracy of the network on the 10000 test images: %.2f %%' % (acc))

Accuracy of the network on the 10000 test images: 98.96 %


In [52]:
### Add up hook to gradient in order to avoid updating certain weights
def cnn_hook4d1(grad):
    grad_clone = grad.clone()
    for i in range(len(cnn_index_dict4d["conv1"])):
        a,b,c,d = cnn_index_dict4d["conv1"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

def cnn_hook4d2(grad):
    grad_clone = grad.clone()
    for i in range(len(cnn_index_dict4d["conv2"])):
        a,b,c,d = cnn_index_dict4d["conv2"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

cnn_hook_dict4d = dict([("hook1",cnn_hook4d1),("hook2",cnn_hook4d2)])

def cnn_hook1d1(grad):
    grad_clone = grad.clone()
    for i in range(len(cnn_index_dict1d["conv1"])):
        a = cnn_index_dict1d["conv1"][i]
        grad_clone[a] = 0
    return grad_clone

def cnn_hook1d2(grad):
    grad_clone = grad.clone()
    for i in range(len(cnn_index_dict1d["conv2"])):
        a = cnn_index_dict1d["conv2"][i]
        grad_clone[a] = 0
    return grad_clone

cnn_hook_dict1d = dict([("hook1",cnn_hook1d1),("hook2",cnn_hook1d2)])


def linear_hook2d1(grad):
    grad_clone = grad.clone()
    for i in range(len(linear_index_dict2d["linear1"])):
        a,b = linear_index_dict2d["linear1"][i]
        grad_clone[a,b] = 0
    return grad_clone

def linear_hook2d2(grad):
    grad_clone = grad.clone()
    for i in range(len(linear_index_dict2d["linear2"])):
        a,b = linear_index_dict2d["linear2"][i]
        grad_clone[a,b] = 0
    return grad_clone

def linear_hook2d3(grad):
    grad_clone = grad.clone()
    for i in range(len(linear_index_dict2d["linear3"])):
        a,b = linear_index_dict2d["linear3"][i]
        grad_clone[a,b] = 0
    return grad_clone

linear_hook_dict2d = dict([("hook1",linear_hook2d1),("hook2",linear_hook2d2), ("hook3",linear_hook2d3)])


def linear_hook1d1(grad):
    grad_clone = grad.clone()
    for i in range(len(linear_index_dict1d["linear1"])):
        a = linear_index_dict1d["linear1"][i]
        grad_clone[a] = 0
    return grad_clone

def linear_hook1d2(grad):
    grad_clone = grad.clone()
    for i in range(len(linear_index_dict1d["linear2"])):
        a = linear_index_dict1d["linear2"][i]
        grad_clone[a] = 0
    return grad_clone

def linear_hook1d3(grad):
    grad_clone = grad.clone()
    for i in range(len(linear_index_dict1d["linear3"])):
        a = linear_index_dict1d["linear3"][i]
        grad_clone[a] = 0
    return grad_clone

linear_hook_dict1d = dict([("hook1",linear_hook1d1),("hook2",linear_hook1d2), ("hook3",linear_hook1d3)])

conv_counter = 0
linear_counter = 0
child_counter = 0
for child in model.children():
    children_of_child_counter = 0 # Going thru all layers of the network
    if "Conv2d" in str(child): #check if it is a conv layer
        conv_counter += 1
        for param in child.parameters():
            if len(param.data.size()) == 4:
                hook_name = "hook" + str(conv_counter)
                param.register_hook(cnn_hook_dict4d[hook_name])
            else:
                hook_name = "hook" + str(conv_counter)
                param.register_hook(cnn_hook_dict1d[hook_name])
    else:
        linear_counter += 1
        for param in child.parameters():
            if len(param.data.size()) == 2:
                hook_name = "hook" + str(linear_counter)
                param.register_hook(linear_hook_dict2d[hook_name])
            else:
                hook_name = "hook" + str(linear_counter)
                param.register_hook(linear_hook_dict1d[hook_name])

In [None]:
### Training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
cur_acc = 0.0
running_loss = 0.0
epochs = 50
best_model = model
for epoch in range(epochs):
    for i, data in enumerate(trainloader, 0): #i is a counter, start from 0, the tuple (i,data) 
                                          #is produced
        # get the inputs
        inputs, labels = data
        inputs_gpu = inputs.type(gpu_dtype)
        labels_gpu = labels.type(gpu_dtype).long()

        # wrap them in Variable
        inputs, labels = Variable(inputs_gpu), Variable(labels_gpu)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs) # Forward -> score
        loss = criterion(outputs, labels) # Forward -> loss
        loss.backward() # Backward generate gradients
        optimizer.step() # Update Parameters

        # print statistics
        running_loss += loss.item()
        # break
    if (epoch+1) % 10 == 0:
        acc, total_num = evaluation(model, trainloader)
        print('epoch: {}, train accuracy: {:.2f}%, loss: {:.5f}'.format(epoch+1, acc, running_loss/total_num))
    running_loss = 0

    if acc > cur_acc:
        best_model = copy.deepcopy(model).type(gpu_dtype)
print("Finish Training")

In [54]:
acc, _ = evaluation(model, testloader)
print('Accuracy of the network on the 10000 test images: %.2f %%' % (acc))

Accuracy of the network on the 10000 test images: 99.04 %


In [55]:
pruned_counter = 0
for child in model.children():
    if "Conv2d" in str(child): #check if it is a conv layer
        for param in child.parameters():
            if len(param.data.size()) == 4:
                #Loop through all the entries
                for i in range(param.data.size()[0]):
                    for j in range(param.data.size()[1]):
                        for k in range(param.data.size()[2]):
                            for l in range(param.data.size()[3]):
                                if param.data[i,j,k,l] == 0:
                                    pruned_counter += 1
            else:
                for i in range(param.data.size()[0]):
                    if param.data[i] == 0:
                        pruned_counter += 1

    else:
        for param in child.parameters():
                if len(param.data.size()) == 2:
                    for i in range(param.data.size()[0]):
                        for j in range(param.data.size()[1]):
                            if param.data[i,j] == 0:
                                pruned_counter += 1
                else:
                    for i in range(param.data.size()[0]):
                        if param.data[i] == 0:
                            param.data[i] = 0
                            pruned_counter += 1
                            
print("we pruned a total of",pruned_counter,"weights")

we pruned a total of 31657 weights
