In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import randint
import utils
import time

In [2]:
#device= torch.device("cuda")
device= torch.device("cpu")
print(device)

cpu


In [3]:
from utils import check_cifar_dataset_exists
data_path=check_cifar_dataset_exists()

train_data=torch.load(data_path+'cifar/train_data.pt')
train_label=torch.load(data_path+'cifar/train_label.pt')
test_data=torch.load(data_path+'cifar/test_data.pt')
test_label=torch.load(data_path+'cifar/test_label.pt')

print(train_data.size())
print(test_data.size())

torch.Size([50000, 3, 32, 32])
torch.Size([10000, 3, 32, 32])


In [25]:
class resnet_50(nn.Module):

    def __init__(self):
        
        super(resnet_50, self).__init__()
        
        self.padding = nn.ZeroPad2d((96,96,96,96))
            
        # Assuming starting image to be 3x224x224 in the comments below
        #block-0
        #3 x 224 x 224 --> 64 x 112 x 112 
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=0 )
        
        #64 x 112 x 112 --> 64 x 56 x 56, kernel size = 3, stride = 2
        self.pool1  = nn.MaxPool2d(kernel_size=3, stride=2, padding=0 )
        
        
        #block-1
        #64 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x1 = nn.Conv2d(64, 64, kernel_size=1, stride=1, padding=0 )
        
        #64 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0 )
        
        #64 x 56 x 56 --> 256 x 56 x 56 
        self.conv2_x3 = nn.Conv2d(64, 256, kernel_size=1, stride=1, padding=0 )
        
        #256 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x4 = nn.Conv2d(256, 64, kernel_size=1, stride=1, padding=0 )
        
        #64 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x5 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0 )
        
        #64 x 56 x 56 --> 256 x 56 x 56 
        self.conv2_x6 = nn.Conv2d(64, 256, kernel_size=1, stride=1, padding=0 )
        
        #256 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x7 = nn.Conv2d(256, 64, kernel_size=1, stride=1, padding=0 )
        
        #64 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x8 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0 )
        
        #64 x 56 x 56 --> 256 x 56 x 56 
        self.conv2_x9 = nn.Conv2d(64, 256, kernel_size=1, stride=1, padding=0 )
        
        
        
        
        #block-2
        #256 x 56 x 56 --> 128 x 28 x 28, stride = 2 
        self.conv3_x10 = nn.Conv2d(256, 128, kernel_size=1, stride=2, padding=0 )
        
        #128 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x11 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=0 )
        
        #128 x 56 x 56 --> 512 x 28 x 28 
        self.conv3_x12 = nn.Conv2d(128, 512, kernel_size=1, stride=1, padding=0 )
        
        #512 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x13 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0 )
        
        #128 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x14 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=0 )
        
        #128 x 56 x 56 --> 512 x 28 x 28 
        self.conv3_x15 = nn.Conv2d(128, 512, kernel_size=1, stride=1, padding=0 )
        
        #512 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x16 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0 )
        
        #128 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x17 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=0 )
        
        #128 x 56 x 56 --> 512 x 28 x 28 
        self.conv3_x18 = nn.Conv2d(128, 512, kernel_size=1, stride=1, padding=0 )
        
        #512 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x19 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0 )
        
        #128 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x20 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=0 )
        
        #128 x 28 x 28 --> 512 x 28 x 28 
        self.conv3_x21 = nn.Conv2d(128, 512, kernel_size=1, stride=1, padding=0 )
        
        
        
        #block-3
        #512 x 28 x 28 --> 256 x 14 x 14, stride = 2 
        self.conv4_x22 = nn.Conv2d(512, 256, kernel_size=1, stride=2, padding=0 )
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x23 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=0 )
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x24 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x25 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x26 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=0 )
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x27 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x28 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x29 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=0 )
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x30 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x31 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x32 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=0 )
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x33 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x34 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x35 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=0 )
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x36 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x37 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x38 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=0 )
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x39 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        
       
    
        #block-4
        #1024 x 14 x 14 --> 512 x 7 x 7, stride = 2 
        self.conv5_x40 = nn.Conv2d(1024, 512, kernel_size=1, stride=2, padding=0 )
        
        #512 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x41 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=0 )
        
        #512 x 7 x 7 --> 2048 x 7 x 7 
        self.conv5_x42 = nn.Conv2d(512, 2048, kernel_size=1, stride=1, padding=0 )
        
        #2048 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x43 = nn.Conv2d(2048, 512, kernel_size=1, stride=1, padding=0 )
        
        #512 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x44 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=0 )
        
        #512 x 7 x 7 --> 2048 x 7 x 7 
        self.conv5_x45 = nn.Conv2d(512, 2048, kernel_size=1, stride=1, padding=0 )
        
        #2048 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x46 = nn.Conv2d(2048, 512, kernel_size=1, stride=1, padding=0 )
        
        #512 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x47 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=0 )
        
        #512 x 7 x 7 --> 2048 x 7 x 7 
        self.conv5_x48 = nn.Conv2d(512, 2048, kernel_size=1, stride=1, padding=0 )
        
        
        #Average Pooling
        #2048 x 7 x 7 --> 2048 x 1 x 1, kernel size = 7
        self.pool2  = nn.AvgPool2d(kernel_size=7, stride=1, padding=0 )
        
        #Fully Connected Layer
        self.fcc_linear = nn.Linear(2048, 1000)
        
    
    def forward(self,x):
        
        #block-0
        y1 = self.conv1(x)
        y1 = F.relu(y1)
        y2 = self.pool1(y1)
        
        #block-1
        y3 = self.conv2_x1(y2)
        y3 = F.relu(y3)
        y4 = self.conv2_x2(y3)
        y4 = F.relu(y4)
        y5 = self.conv2_x3(y4)
        y5 = F.relu(y5)
        y6 = self.conv2_x4(y5)
        y6 = F.relu(y6)
        y7 = self.conv2_x5(y6)
        y7 = F.relu(y7)
        y8 = self.conv2_x6(y7)
        y8 = F.relu(y8)
        y9 = self.conv2_x7(y8)
        y9 = F.relu(y9)
        y10 = self.conv2_x8(y9)
        y10 = F.relu(y10)
        y11 = self.conv2_x9(y10)
        y11 = F.relu(y11)
        
        #block-2
        y12 = self.conv3_x10(y11)
        y12 = F.relu(y12)
        y13 = self.conv3_x11(y12)
        y13 = F.relu(y13)
        y14 = self.conv3_x12(y13)
        y14 = F.relu(y14)
        y15 = self.conv3_x13(y14)
        y15 = F.relu(y15)
        y16 = self.conv3_x14(y15)
        y16 = F.relu(y16)
        y17 = self.conv3_x15(y16)
        y17 = F.relu(y17)
        y18 = self.conv3_x16(y17)
        y18 = F.relu(y18)
        y19 = self.conv3_x17(y18)
        y19 = F.relu(y19)
        y20 = self.conv3_x18(y19)
        y20 = F.relu(y20)
        y21 = self.conv3_x19(y20)
        y21 = F.relu(y21)
        y22 = self.conv3_x20(y21)
        y22 = F.relu(y22)
        y23 = self.conv3_x21(y22)
        y23 = F.relu(y23)
        
        #block-3
        y24 = self.conv4_x22(y23)
        y24 = F.relu(y24)
        y25 = self.conv4_x23(y24)
        y25 = F.relu(y25)
        y26 = self.conv4_x24(y25)
        y26 = F.relu(y26)
        y27 = self.conv4_x25(y26)
        y27 = F.relu(y27)
        y28 = self.conv4_x26(y27)
        y28 = F.relu(y28)
        y29 = self.conv4_x27(y28)
        y29 = F.relu(y29)
        y30 = self.conv4_x28(y29)
        y30 = F.relu(y30)
        y31 = self.conv4_x29(y30)
        y31 = F.relu(y31)
        y32 = self.conv4_x30(y31)
        y32 = F.relu(y32)
        y33 = self.conv4_x31(y32)
        y33 = F.relu(y33)
        y34 = self.conv4_x32(y33)
        y34 = F.relu(y34)
        y35 = self.conv4_x33(y34)
        y35 = F.relu(y35)
        y36 = self.conv4_x34(y35)
        y36 = F.relu(y36)
        y37 = self.conv4_x35(y36)
        y37 = F.relu(y37)
        y38 = self.conv4_x36(y37)
        y38 = F.relu(y38)
        y39 = self.conv4_x37(y38)
        y39 = F.relu(y39)
        y40 = self.conv4_x38(y39)
        y40 = F.relu(y40)
        y41 = self.conv4_x39(y40)
        y41 = F.relu(y41)
        
        
        #block-4
        y42 = self.conv5_x40(y41)
        y42 = F.relu(y42)
        y43 = self.conv5_x41(y42)
        y43 = F.relu(y43)
        y44 = self.conv5_x42(y43)
        y44 = F.relu(y44)
        y45 = self.conv5_x43(y44)
        y45 = F.relu(y45)
        y46 = self.conv5_x44(y45)
        y46 = F.relu(y46)
        y47 = self.conv5_x45(y46)
        y47 = F.relu(y47)
        y48 = self.conv5_x46(y47)
        y48 = F.relu(y48)
        y49 = self.conv5_x47(y48)
        y49 = F.relu(y49)
        y50 = self.conv5_x48(y49)
        y50 = F.relu(y50) 
        y51 = self.pool2(y50)

        y52 = self.fcc_linear(y51)
        
        y53 = F.softmax(y52, dim=1)
    
        return y53

In [15]:
net = resnet_50()

In [16]:
print(net)

resnet_50(
  (padding): ZeroPad2d(padding=(96, 96, 96, 96), value=0)
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2))
  (pool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2_x1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
  (conv2_x2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv2_x3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
  (conv2_x4): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
  (conv2_x5): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv2_x6): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
  (conv2_x7): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
  (conv2_x8): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv2_x9): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
  (conv3_x10): Conv2d(256, 128, kernel_size=(1, 1), stride=(2, 2))
  (conv3_x11): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1))
  (conv3_x12): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1))
  (

In [17]:
utils.display_num_param(net)

There are 22757736 (22.76 million) parameters in this neural network


In [18]:
net = net.to(device)

In [22]:
criterion = nn.CrossEntropyLoss()
my_lr=0.045 
bs= 50

In [23]:
def eval_on_test_set():

    running_error=0
    num_batches=0

    for i in range(0,10000,bs):

        minibatch_data =  test_data[i:i+bs]
        minibatch_label= test_label[i:i+bs]

        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
        
        inputs = minibatch_data

        scores=net( inputs ) 

        error = utils.get_error( scores , minibatch_label)

        running_error += error.item()

        num_batches+=1

    total_error = running_error/num_batches
    print( 'error rate on test set =', total_error*100 ,'percent')

In [24]:
start=time.time()

for epoch in range(1,20):
    
    # divide the learning rate by 2.5 after every 2 epochs
    if (epoch%2 == 0):
        my_lr = my_lr / 2.5
    
    # create a new optimizer at the beginning of each epoch: give the current learning rate.   
    optimizer=torch.optim.Adam( net.parameters() , lr=my_lr )
        
    # set the running quatities to zero at the beginning of the epoch
    running_loss=0
    running_error=0
    num_batches=0
    
    # set the order in which to visit the image from the training set
    shuffled_indices=torch.randperm(50000)
 
    for count in range(0,50000,bs):
    
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch       
        indices=shuffled_indices[count:count+bs]
        minibatch_data =  train_data[indices]
        minibatch_label=  train_label[indices]
        
        # send them to the gpu
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
        
        # normalize the minibatch (this is the only difference compared to before!)
        #inputs = (minibatch_data - mean)/std
        
        inputs = minibatch_data
        
        # tell Pytorch to start tracking all operations that will be done on "inputs"
        inputs.requires_grad_()

        # forward the minibatch through the net 
        scores=net( inputs ) 

        # Compute the average of the losses of the data points in the minibatch
        loss =  criterion( scores , minibatch_label) 
        
        # backward pass to compute dL/dU, dL/dV and dL/dW   
        loss.backward()

        # do one step of stochastic gradient descent: U=U-lr(dL/dU), V=V-lr(dL/dU), ...
        optimizer.step()
        

        # START COMPUTING STATS
        
        # add the loss of this batch to the running loss
        running_loss += loss.detach().item()
        
        # compute the error made on this batch and add it to the running error       
        error = utils.get_error( scores.detach() , minibatch_label)
        running_error += error.item()
        
        num_batches+=1        
    
    
    # compute stats for the full training set
    total_loss = running_loss/num_batches
    total_error = running_error/num_batches
    elapsed = (time.time()-start)/60
    

    print('epoch=',epoch, '\t time=', elapsed,'min','\t lr=', my_lr  ,'\t loss=', total_loss , '\t error=', total_error*100 ,'percent')
    eval_on_test_set() 
    print(' ')

RuntimeError: Calculated padded input size per channel: (2 x 2). Kernel size: (3 x 140234977181699). Kernel size can't be greater than actual input size at /Users/soumith/miniconda2/conda-bld/pytorch_1532623076075/work/aten/src/THNN/generic/SpatialConvolutionMM.c:48