In [62]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import pickle
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm


In [63]:
# DATA PARSING
# If this is running on GPU cluster, no change required 
# Otherwise, download CIFAR-10 dataset from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and set path
path='cifar-10-batches-py/'
data=np.zeros((0,32,32,3))
labels=[]
for i in range(1,6):
    with open(path+'data_batch_'+str(i), 'rb') as fo:
        dat = pickle.load(fo)
        r = dat['data'][:,:1024*1].reshape((10000,32,32,1))
        g = dat['data'][:,1024:2048].reshape((10000,32,32,1))
        b = dat['data'][:,2048:3072].reshape((10000,32,32,1))
        rgb = np.concatenate((r,g,b),axis=3)
        data = np.vstack((data,np.float32(rgb)/255))
        labels += dat['labels']
labels = np.array(labels)
# data -> 50000 X 32 X 32 X 3 array with training data
# labels -> 50000 labels ranging from 0 to 9

<b>[2 points]</b> Plot 3 random images corresponding to each label from the training data and indicate the name of the class label.

<b>[0 points]</b> Now, we perform some pre-processing operations to get our training datasets.

1. Split the data and labels into 2 sets, first one containing labels 0 to 4, and second one from 5 to 9. 
2. Generate one hot encoded targets based on the labels for the 2 sets.
3. Store them in data1, labels1, data2 and labels2.

In [64]:
data1 = np.zeros((0,32,32,3))
labels1 = []
data2 = np.zeros((0,32,32,3))
labels2 = []
for i in range(5):
    x = data[labels==i]
    data1 = np.vstack((data1,x))
    labels1 += [i]*len(x)
for i in range(5,10):
    x = data[labels==i]
    data2 = np.vstack((data2,x))
    labels2 += [i-5]*len(x)
labels1 = np.array(labels1)
labels2 = np.array(labels2)

temp = np.zeros((len(labels1),5))
for i in range(len(labels1)):
    temp[i,labels1[i]] = 1
labels1 = temp
temp = np.zeros((len(labels2),5))
for i in range(len(labels2)):
    temp[i,labels2[i]] = 1
labels2 = temp

torch_data1 = data1.transpose((0,3,1,2))
torch_data2 = data2.transpose((0,3,1,2))

<b>[3 points]</b> Create a simple convolutional network to classify the training data. The network structure should be as follows:
1. Layer 1 - Kernel size 4, Stride 2, Output channels 5, Bias enabled, Relu activation
2. Layer 2 - Kernel size 4, Stride 1, Output channels 10, Bias enabled, Relu avtication
3. Layer 3 - Kernel size 4, Stride 1, Output channels 20, Bias enabled, Relu activation
4. Layer 4 - Kernel size 4, Stride 1, Output channels 40, Bias enabled, Relu activation
5. Layer 5 - Fully connected layer followed by Sigmoid activation

Refer to https://github.com/ameykusurkar/pytorch-image-classifier/blob/master/main.py for help from this section onwards, but note that torchvision.transforms is not required since we already have the data in the required format.

In [65]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

n=5
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 5, 4, stride=2)
        self.conv2 = nn.Conv2d(5, 10, 4, stride=1)
        self.conv3 = nn.Conv2d(10, 20, 4, stride=1)
        self.conv4 = nn.Conv2d(20, 40, 4, stride=1)
        self.fc = nn.Linear(40*6*6, 5)
        
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        
        x = x.view(-1, 40*6*6)

        x = F.sigmoid(self.fc(x))

        return x

In [66]:
# print the network structure
# Using a GPU is highly recommended since training will take a while otherwise
net = Net().cuda()
print(net)

Net(
  (conv1): Conv2d(3, 5, kernel_size=(4, 4), stride=(2, 2))
  (conv2): Conv2d(5, 10, kernel_size=(4, 4), stride=(1, 1))
  (conv3): Conv2d(10, 20, kernel_size=(4, 4), stride=(1, 1))
  (conv4): Conv2d(20, 40, kernel_size=(4, 4), stride=(1, 1))
  (fc): Linear(in_features=1440, out_features=5, bias=True)
)


<b>[5 points]</b> Create a function that trains the network using the provided data. However it should only train the part of the network that is passed in as a parameter.
1. Training data must be randomly sampled to obtain a batch of data which is a subset of the whole training dataset at every iteration.
2. Use the Adam optimizer and BCELoss function (Binary Cross Entropy Loss).
3. Store the loss as well as the accuracy of the network on the training data at every iteration and return them in arrays at the end.

In [67]:
# to_train can be net.paramaters OR net.fc.parameters OR net.conv1.parameters so that only certain parts of the net are trained
def train(tdata,tlabel,to_train):
    criterion = nn.CrossEntropyLoss()
    losslist = []
    acc = []
    epochs = 100
    batch = 300
    learning_rate = 0.00001
    optimizer = optim.Adam(net.parameters(), lr = learning_rate)
    for k in tqdm(range(epochs)):
        for l in range(int(len(tdata)/batch)):
            inds = np.random.randint(0,len(tdata)-1,batch)
            inputs = Variable(torch.FloatTensor(tdata[inds]).cuda())
            targets = Variable(torch.FloatTensor(tlabel[inds]).cuda())
            optimizer.zero_grad()
            #inputs = inputs.view(-1, 921600)
            prediction = net(inputs)
            loss = criterion(prediction,targets)
            loss.backward()
            optimizer.step()
            losslist.append(loss.data.cpu().numpy())
            acc.append(np.mean(np.argmax(prediction.data.cpu().numpy(),1)==np.argmax(tlabel[inds],1)))

    return losslist,acc

<b>[5 points]</b> Initialize the network, train the complete network (net.parameters) on data1 (the first 5 classes) and plot the loss and accuracy vs iterations on the same graph. Print the final loss and accuracy as well. Set the learning rate, number of iterations and batch size such that the loss is gradually and smoothly decreasing and converging. The accuracy at the end of training must be at least 35 %. Suggested parameters are: batch size greater than 300, learning rate in the order of 1e-5 and at least 100 iterations for these parameters.

In [68]:
### Initialize net
net = Net().cuda()
x1,a1 = train(torch_data1, labels1, net.parameters())
ax = range(len(x1))
plt.plot(ax,x1,ax,a1)
plt.show()
print(x1[-1])
print(a1[-1])

  0%|          | 0/100 [00:00<?, ?it/s]


RuntimeError: Expected object of type Variable[torch.cuda.LongTensor] but found type Variable[torch.cuda.FloatTensor] for argument #1 'target'

<b>[2 points]</b> Without reinitializing the network, train only the fully connected layer (net.fc.parameters) now on data2 (the next 5 classes). Do not change any hyper parameters such as learning rate or batch size. Plot the loss and accuracy and print the final values like before.

In [None]:
x2,a2 = train(torch_data2, labels2, net.parameters())
ax = range(len(x2))
plt.plot(ax,x2,ax,a2)
plt.show()
print(x2[-1])
print(a2[-1])

<b>[3 points]</b> Now repeat the process in the opposite order. Initialize the net again, train the whole network on data2, generate the same plots as before, and then without reinitializing the net, train only the fully connected layer on data1 and generate the plots. Do not change any hyperparameters.

In [None]:
# Initialize net
net = Net().cuda()
x3,a3 = train(torch_data3, labels4, net.parameters())
ax = range(len(x3))
plt.plot(ax,x3,ax,a3)
plt.show()
print(x3[-1])
print(a3[-1])

In [None]:
x4,a4 = train(torch_data2, labels2, net.parameters())
ax = range(len(x4))
plt.plot(ax,x4,ax,a4)
plt.show()
print(x4[-1])
print(a4[-1])

<b>[5 points]</b> Plot the loss vs iterations graphs obtained in the previous 4 training operations on the same graph, to visualize the effects of transfer learning. Explain the results obtained, based on the training regimen. Comment on the performance of transfer learning in each setting.

In [None]:
ax = range(len(x1))
plt.plot(ax,x1,ax,x3,ax,x2,ax,x4)
plt.legend(['random init 1','random init 2','transfer learning 1','transfer learning 2'])
plt.show()

<b>[0 points]</b> Create a network with more layers, pooling layers, and more filters and try to increase accuracy as much as possible. Play around with the hyperparameters to understand how they affect the training process. No need to submit anything for this part.