In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import randint
import utils
import time

In [2]:
device= torch.device("cuda")
#device= torch.device("cpu")
print(device)

cuda


In [3]:
from utils import check_cifar_dataset_exists
data_path=check_cifar_dataset_exists()

train_data=torch.load(data_path+'cifar/train_data.pt')
train_label=torch.load(data_path+'cifar/train_label.pt')
test_data=torch.load(data_path+'cifar/test_data.pt')
test_label=torch.load(data_path+'cifar/test_label.pt')

print(train_data.size())
print(test_data.size())

torch.Size([50000, 3, 32, 32])
torch.Size([10000, 3, 32, 32])


In [4]:
class resnet_50(nn.Module):

    def __init__(self):
        
        super(resnet_50, self).__init__()
        
        #self.padding = nn.ZeroPad2d((96,96,96,96))
        self.resize = nn.UpsamplingBilinear2d(size=(224,224))
            
        # Assuming starting image to be 3x224x224 in the comments below
        #block-0
        #3 x 224 x 224 --> 64 x 112 x 112 
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3 )
        self.BN1 = nn.BatchNorm2d(64, affine = True, eps= 0.0001, momentum= 0.9)
        
        #64 x 112 x 112 --> 64 x 56 x 56, kernel size = 3, stride = 2
        self.pool1  = nn.MaxPool2d(kernel_size=3, stride=2, padding=1 )
        
        
        #block-1
        #64 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x1 = nn.Conv2d(64, 64, kernel_size=1, stride=1, padding=0 )
        self.BN2_x1 = nn.BatchNorm2d(64, affine = True, eps= 0.0001, momentum= 0.9)
        
        #64 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1 )
        self.BN2_x2 = nn.BatchNorm2d(64, affine = True, eps= 0.0001, momentum= 0.9)
        
        #64 x 56 x 56 --> 256 x 56 x 56 
        self.conv2_x3 = nn.Conv2d(64, 256, kernel_size=1, stride=1, padding=0 )
        self.BN2_x3 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)


        
        #256 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x4 = nn.Conv2d(256, 64, kernel_size=1, stride=1, padding=0 )
        self.BN2_x4 = nn.BatchNorm2d(64, affine = True, eps= 0.0001, momentum= 0.9)


        
        #64 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x5 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1 )
        self.BN2_x5 = nn.BatchNorm2d(64, affine = True, eps= 0.0001, momentum= 0.9)


        
        #64 x 56 x 56 --> 256 x 56 x 56 
        self.conv2_x6 = nn.Conv2d(64, 256, kernel_size=1, stride=1, padding=0 )
        self.BN2_x6 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #256 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x7 = nn.Conv2d(256, 64, kernel_size=1, stride=1, padding=0 )
        self.BN2_x7 = nn.BatchNorm2d(64, affine = True, eps= 0.0001, momentum= 0.9)
        
        #64 x 56 x 56 --> 64 x 56 x 56 
        self.conv2_x8 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1 )
        self.BN2_x8 = nn.BatchNorm2d(64, affine = True, eps= 0.0001, momentum= 0.9)
        
        #64 x 56 x 56 --> 256 x 56 x 56 
        self.conv2_x9 = nn.Conv2d(64, 256, kernel_size=1, stride=1, padding=0 )
        self.BN2_x9 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        
        
        
        #block-2
        #256 x 56 x 56 --> 128 x 28 x 28, stride = 2 
        self.conv3_x10 = nn.Conv2d(256, 128, kernel_size=1, stride=2, padding=0 )
        self.BN2_x10 = nn.BatchNorm2d(128, affine = True, eps= 0.0001, momentum= 0.9)
        
        #128 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x11 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1 )
        self.BN2_x11 = nn.BatchNorm2d(128, affine = True, eps= 0.0001, momentum= 0.9)
        
        #128 x 56 x 56 --> 512 x 28 x 28 
        self.conv3_x12 = nn.Conv2d(128, 512, kernel_size=1, stride=1, padding=0 )
        self.BN2_x12 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x13 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0 )
        self.BN2_x13 = nn.BatchNorm2d(128, affine = True, eps= 0.0001, momentum= 0.9)
        
        #128 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x14 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1 )
        self.BN2_x14 = nn.BatchNorm2d(128, affine = True, eps= 0.0001, momentum= 0.9)
        
        #128 x 56 x 56 --> 512 x 28 x 28 
        self.conv3_x15 = nn.Conv2d(128, 512, kernel_size=1, stride=1, padding=0 )
        self.BN2_x15 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x16 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0 )
        self.BN2_x16 = nn.BatchNorm2d(128, affine = True, eps= 0.0001, momentum= 0.9)
        
        #128 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x17 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1 )
        self.BN2_x17 = nn.BatchNorm2d(128, affine = True, eps= 0.0001, momentum= 0.9)
        
        #128 x 56 x 56 --> 512 x 28 x 28 
        self.conv3_x18 = nn.Conv2d(128, 512, kernel_size=1, stride=1, padding=0 )
        self.BN2_x18 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x19 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0 )
        self.BN2_x19 = nn.BatchNorm2d(128, affine = True, eps= 0.0001, momentum= 0.9)
        
        #128 x 28 x 28 --> 128 x 28 x 28 
        self.conv3_x20 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1 )
        self.BN2_x20 = nn.BatchNorm2d(128, affine = True, eps= 0.0001, momentum= 0.9)
        
        #128 x 28 x 28 --> 512 x 28 x 28 
        self.conv3_x21 = nn.Conv2d(128, 512, kernel_size=1, stride=1, padding=0 )
        self.BN2_x21 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        
        
        #block-3
        #512 x 28 x 28 --> 256 x 14 x 14, stride = 2 
        self.conv4_x22 = nn.Conv2d(512, 256, kernel_size=1, stride=2, padding=0 )
        self.BN2_x22 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x23 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1 )
        self.BN2_x23 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x24 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        self.BN2_x24 = nn.BatchNorm2d(1024, affine = True, eps= 0.0001, momentum= 0.9)
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x25 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        self.BN2_x25 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x26 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1 )
        self.BN2_x26 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x27 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        self.BN2_x27 = nn.BatchNorm2d(1024, affine = True, eps= 0.0001, momentum= 0.9)
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x28 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        self.BN2_x28 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x29 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1 )
        self.BN2_x29 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x30 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        self.BN2_x30 = nn.BatchNorm2d(1024, affine = True, eps= 0.0001, momentum= 0.9)
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x31 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        self.BN2_x31 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x32 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1 )
        self.BN2_x32 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x33 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        self.BN2_x33 = nn.BatchNorm2d(1024, affine = True, eps= 0.0001, momentum= 0.9)
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x34 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        self.BN2_x34 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x35 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1 )
        self.BN2_x35 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x36 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        self.BN2_x36 = nn.BatchNorm2d(1024, affine = True, eps= 0.0001, momentum= 0.9)
        
        #1024 x 28 x 28 --> 256 x 14 x 14 
        self.conv4_x37 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0 )
        self.BN2_x37 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 14 x 14 --> 256 x 14 x 14 
        self.conv4_x38 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1 )
        self.BN2_x38 = nn.BatchNorm2d(256, affine = True, eps= 0.0001, momentum= 0.9)
        
        #256 x 14 x 14 --> 1024 x 14 x 14 
        self.conv4_x39 = nn.Conv2d(256, 1024, kernel_size=1, stride=1, padding=0 )
        self.BN2_x39 = nn.BatchNorm2d(1024, affine = True, eps= 0.0001, momentum= 0.9)
        
       
    
        #block-4
        #1024 x 14 x 14 --> 512 x 7 x 7, stride = 2 
        self.conv5_x40 = nn.Conv2d(1024, 512, kernel_size=1, stride=2, padding=0 )
        self.BN2_x40 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x41 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1 )
        self.BN2_x41 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 7 x 7 --> 2048 x 7 x 7 
        self.conv5_x42 = nn.Conv2d(512, 2048, kernel_size=1, stride=1, padding=0 )
        self.BN2_x42 = nn.BatchNorm2d(2048, affine = True, eps= 0.0001, momentum= 0.9)
        
        #2048 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x43 = nn.Conv2d(2048, 512, kernel_size=1, stride=1, padding=0 )
        self.BN2_x43 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x44 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1 )
        self.BN2_x44 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 7 x 7 --> 2048 x 7 x 7 
        self.conv5_x45 = nn.Conv2d(512, 2048, kernel_size=1, stride=1, padding=0 )
        self.BN2_x45 = nn.BatchNorm2d(2048, affine = True, eps= 0.0001, momentum= 0.9)
        
        #2048 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x46 = nn.Conv2d(2048, 512, kernel_size=1, stride=1, padding=0 )
        self.BN2_x46 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 7 x 7 --> 512 x 7 x 7 
        self.conv5_x47 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1 )
        self.BN2_x47 = nn.BatchNorm2d(512, affine = True, eps= 0.0001, momentum= 0.9)
        
        #512 x 7 x 7 --> 2048 x 7 x 7 
        self.conv5_x48 = nn.Conv2d(512, 2048, kernel_size=1, stride=1, padding=0 )
        self.BN2_x48 = nn.BatchNorm2d(2048, affine = True, eps= 0.0001, momentum= 0.9)
        
        
        #Average Pooling
        #2048 x 7 x 7 --> 2048 x 1 x 1, kernel size = 7
        self.pool2  = nn.AvgPool2d(kernel_size=7, stride=1, padding=0 )
        
        #Fully Connected Layer
        self.fcc_linear = nn.Linear(2048, 1000)
        
    
    def forward(self,x):
        
        #x = self.padding(x)
        x = self.resize(x)
        #print(x.size())
        #block-0
        y1 = self.conv1(x)
        y1 = self.BN1(y1)
        y1 = F.relu(y1)
        #print(y1.size())
        y2 = self.pool1(y1)
        #print(y2.size())
        
        #block-1
        y3 = self.conv2_x1(y2)
        y3 = self.BN2_x1(y3)
        y3 = F.relu(y3)
        #print(y3.size())
        y4 = self.conv2_x2(y3)
        y4 = self.BN2_x2(y4)
        y4 = F.relu(y4)
        #print(y4.size())
        y5 = self.conv2_x3(y4)
        y5 = self.BN2_x3(y5)
        y5 = F.relu(y5)
        #print(y5.size())
        y6 = self.conv2_x4(y5)
        y6 = self.BN2_x4(y6)
        y6 = F.relu(y6)
        #print(y6.size())
        y7 = self.conv2_x5(y6)
        y7 = self.BN2_x5(y7)
        y7 = F.relu(y7)
        #print(y7.size())
        y8 = self.conv2_x6(y7)
        y8 = self.BN2_x6(y8)
        y8 = F.relu(y8)
        #print(y8.size())
        y9 = self.conv2_x7(y8)
        y9 = self.BN2_x7(y9)
        y9 = F.relu(y9)
        #print(y9.size())
        y10 = self.conv2_x8(y9)
        y10 = self.BN2_x8(y10)
        y10 = F.relu(y10)
        #print(y10.size())
        y11 = self.conv2_x9(y10)
        y11 = self.BN2_x9(y11)
        y11 = F.relu(y11)
        #print(y11.size())
        
        #block-2
        y12 = self.conv3_x10(y11)
        y12 = self.BN2_x10(y12)
        y12 = F.relu(y12)
        #print(y12.size())
        y13 = self.conv3_x11(y12)
        y13 = self.BN2_x11(y13)
        y13 = F.relu(y13)
        #print(y13.size())
        y14 = self.conv3_x12(y13)
        y14 = self.BN2_x12(y14)
        y14 = F.relu(y14)
        #print(y14.size())
        y15 = self.conv3_x13(y14)
        y15 = self.BN2_x13(y15)
        y15 = F.relu(y15)
        #print(y15.size())
        y16 = self.conv3_x14(y15)
        y16 = self.BN2_x14(y16)
        y16 = F.relu(y16)
        #print(y16.size())
        y17 = self.conv3_x15(y16)
        y17 = self.BN2_x15(y17)
        y17 = F.relu(y17)
        #print(y17.size())
        y18 = self.conv3_x16(y17)
        y18 = self.BN2_x16(y18)
        y18 = F.relu(y18)
        #print(y18.size())
        y19 = self.conv3_x17(y18)
        y19 = self.BN2_x17(y19)
        y19 = F.relu(y19)
        #print(y19.size())
        y20 = self.conv3_x18(y19)
        y20 = self.BN2_x18(y20)
        y20 = F.relu(y20)
        #print(y20.size())
        y21 = self.conv3_x19(y20)
        y21 = self.BN2_x19(y21)
        y21 = F.relu(y21)
        #print(y21.size())
        y22 = self.conv3_x20(y21)
        y22 = self.BN2_x20(y22)
        y22 = F.relu(y22)
        #print(y22.size())
        y23 = self.conv3_x21(y22)
        y23 = self.BN2_x21(y23)
        y23 = F.relu(y23)
        #print(y23.size())
        
        #block-3
        y24 = self.conv4_x22(y23)
        y24 = self.BN2_x22(y24)
        y24 = F.relu(y24)
        #print(y24.size())
        y25 = self.conv4_x23(y24)
        y25 = self.BN2_x23(y25)
        y25 = F.relu(y25)
        #print(y25.size())
        y26 = self.conv4_x24(y25)
        y26 = self.BN2_x24(y26)
        y26 = F.relu(y26)
        #print(y26.size())
        y27 = self.conv4_x25(y26)
        y27 = self.BN2_x25(y27)
        y27 = F.relu(y27)
        #print(y27.size())
        y28 = self.conv4_x26(y27)
        y28 = self.BN2_x26(y28)
        y28 = F.relu(y28)
        #print(y28.size())
        y29 = self.conv4_x27(y28)
        y29 = self.BN2_x27(y29)
        y29 = F.relu(y29)
        #print(y29.size())
        y30 = self.conv4_x28(y29)
        y30 = self.BN2_x28(y30)
        y30 = F.relu(y30)
        #print(y30.size())
        y31 = self.conv4_x29(y30)
        y31 = self.BN2_x29(y31)
        y31 = F.relu(y31)
        #print(y31.size())
        y32 = self.conv4_x30(y31)
        y32 = self.BN2_x30(y32)
        y32 = F.relu(y32)
        #print(y32.size())
        
        y33 = self.conv4_x31(y32)
        y33 = self.BN2_x31(y33)
        y33 = F.relu(y33)
        #print(y33.size())
        y34 = self.conv4_x32(y33)
        y34 = self.BN2_x32(y34)
        y34 = F.relu(y34)
        #print(y34.size())
        y35 = self.conv4_x33(y34)
        y35 = self.BN2_x33(y35)
        y35 = F.relu(y35)
        #print(y35.size())
        y36 = self.conv4_x34(y35)
        y36 = self.BN2_x34(y36)
        y36 = F.relu(y36)
        #print(y36.size())
        y37 = self.conv4_x35(y36)
        y37 = self.BN2_x35(y37)
        y37 = F.relu(y37)
        #print(y37.size())
        y38 = self.conv4_x36(y37)
        y38 = self.BN2_x36(y38)
        y38 = F.relu(y38)
        #print(y38.size())
        y39 = self.conv4_x37(y38)
        y39 = self.BN2_x37(y39)
        y39 = F.relu(y39)
        #print(y39.size())
        y40 = self.conv4_x38(y39)
        y40 = self.BN2_x38(y40)
        y40 = F.relu(y40)
        #print(y40.size())
        y41 = self.conv4_x39(y40)
        y41 = self.BN2_x39(y41)
        y41 = F.relu(y41)
        #print(y41.size())
        
        
        #block-4
        y42 = self.conv5_x40(y41)
        y42 = self.BN2_x40(y42)
        y42 = F.relu(y42)
        #print(y42.size())
        y43 = self.conv5_x41(y42)
        y43 = self.BN2_x41(y43)
        y43 = F.relu(y43)
        #print(y43.size())
        y44 = self.conv5_x42(y43)
        y44 = self.BN2_x42(y44)
        y44 = F.relu(y44)
        #print(y44.size())
        y45 = self.conv5_x43(y44)
        y45 = self.BN2_x43(y45)
        y45 = F.relu(y45)
        #print(y45.size())
        y46 = self.conv5_x44(y45)
        y46 = self.BN2_x44(y46)
        y46 = F.relu(y46)
        #print(y46.size())
        y47 = self.conv5_x45(y46)
        y47 = self.BN2_x45(y47)
        y47 = F.relu(y47)
        #print(y47.size())
        y48 = self.conv5_x46(y47)
        y48 = self.BN2_x46(y48)
        y48 = F.relu(y48)
        #print(y48.size())
        y49 = self.conv5_x47(y48)
        y49 = self.BN2_x47(y49)
        y49 = F.relu(y49)
        #print(y49.size())
        y50 = self.conv5_x48(y49)
        y50 = self.BN2_x48(y50)
        y50 = F.relu(y50) 
        #print(y50.size())
        y51 = self.pool2(y50)
        #print(y51.size())

        y51 = y51.view(y51.size(0), -1)
        #print(y51)
        
        y52 = self.fcc_linear(y51)
        #print(y52.size())
        
        y53 = F.softmax(y52, dim=0)
        
        return y52

In [5]:
net = resnet_50()

In [6]:
print(net)

resnet_50(
  (resize): UpsamplingBilinear2d(size=(224, 224), mode=bilinear)
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
  (BN1): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (conv2_x1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
  (BN2_x1): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
  (conv2_x2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (BN2_x2): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
  (conv2_x3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
  (BN2_x3): BatchNorm2d(256, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
  (conv2_x4): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
  (BN2_x4): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
  (conv2_x5): Conv2d(64, 

In [7]:
utils.display_num_param(net)

There are 22803176 (22.80 million) parameters in this neural network


In [8]:
net = net.to(device)

In [9]:
criterion = nn.CrossEntropyLoss()
my_lr=0.1 
bs= 20

In [10]:
def eval_on_test_set():

    running_error=0
    num_batches=0

    for i in range(0,10000,bs):

        minibatch_data =  test_data[i:i+bs]
        minibatch_label= test_label[i:i+bs]

        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
        
        inputs = minibatch_data

        scores=net( inputs ) 

        error = utils.get_error( scores , minibatch_label)

        running_error += error.item()

        num_batches+=1

    total_error = running_error/num_batches
    print( 'error rate on test set =', total_error*100 ,'percent')

In [11]:
start=time.time()

for epoch in range(1,100):
    
    # divide the learning rate by 5 after every 2 epochs
    if (epoch%4 == 0):
        my_lr = my_lr / 2
    
    # create a new optimizer at the beginning of each epoch: give the current learning rate.   
    optimizer=torch.optim.Adam( net.parameters() , lr=my_lr )
        
    # set the running quatities to zero at the beginning of the epoch
    running_loss=0
    running_error=0
    num_batches=0
    
    # set the order in which to visit the image from the training set
    shuffled_indices=torch.randperm(50000)
 
    for count in range(0,50000,bs):
    
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch       
        indices=shuffled_indices[count:count+bs]
        minibatch_data =  train_data[indices]
        minibatch_label=  train_label[indices]
        
        # send them to the gpu
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
        
        # normalize the minibatch (this is the only difference compared to before!)
        #inputs = (minibatch_data - mean)/std
        
        inputs = minibatch_data
        
        # tell Pytorch to start tracking all operations that will be done on "inputs"
        inputs.requires_grad_()

        # forward the minibatch through the net 
        scores=net( inputs ) 

        # Compute the average of the losses of the data points in the minibatch
        loss =  criterion( scores , minibatch_label) 
        
        # backward pass to compute dL/dU, dL/dV and dL/dW   
        loss.backward()

        # do one step of stochastic gradient descent: U=U-lr(dL/dU), V=V-lr(dL/dU), ...
        optimizer.step()
        

        # START COMPUTING STATS
        
        # add the loss of this batch to the running loss
        running_loss += loss.detach().item()
        
        # compute the error made on this batch and add it to the running error       
        error = utils.get_error( scores.detach() , minibatch_label)
        running_error += error.item()
        
        num_batches+=1        
    
    
    # compute stats for the full training set
    total_loss = running_loss/num_batches
    total_error = running_error/num_batches
    elapsed = (time.time()-start)/60
    

    print('epoch=',epoch, '\t time=', elapsed,'min','\t lr=', my_lr  ,'\t loss=', total_loss , '\t error=', total_error*100 ,'percent')
    eval_on_test_set() 
    print(' ')



epoch= 1 	 time= 17.846264497439066 min 	 lr= 0.1 	 loss= 2.1791982806205747 	 error= 84.53999995470048 percent
error rate on test set = 80.96000012159348 percent
 
epoch= 2 	 time= 36.63544690608978 min 	 lr= 0.1 	 loss= 1.992851670217514 	 error= 81.45400013923646 percent
error rate on test set = 81.09000005722045 percent
 
epoch= 3 	 time= 55.37512944936752 min 	 lr= 0.1 	 loss= 1.9480866791725158 	 error= 80.76000012159348 percent
error rate on test set = 80.61000012159347 percent
 
epoch= 4 	 time= 74.10253558953603 min 	 lr= 0.05 	 loss= 1.8928846162319184 	 error= 79.39000008821488 percent
error rate on test set = 78.29000009298325 percent
 
epoch= 5 	 time= 92.78186634381612 min 	 lr= 0.05 	 loss= 1.8794912262916565 	 error= 79.10400007486344 percent
error rate on test set = 78.19000004529954 percent
 
epoch= 6 	 time= 111.4538219968478 min 	 lr= 0.05 	 loss= 1.8654043487071992 	 error= 78.2040000653267 percent
error rate on test set = 77.66999998092652 percent
 
epoch= 7 	 tim

error rate on test set = 57.469998848438266 percent
 
epoch= 50 	 time= 935.029021537304 min 	 lr= 2.44140625e-05 	 loss= 1.4911854817390442 	 error= 56.51999878883361 percent
error rate on test set = 57.4699988603592 percent
 
epoch= 51 	 time= 953.7713818510373 min 	 lr= 2.44140625e-05 	 loss= 1.4919105861902238 	 error= 56.929998834133144 percent
error rate on test set = 57.50999879837037 percent
 
epoch= 52 	 time= 972.5555908401807 min 	 lr= 1.220703125e-05 	 loss= 1.4924953724861145 	 error= 56.8079988360405 percent
error rate on test set = 57.52999886274338 percent
 
epoch= 53 	 time= 991.3301404436429 min 	 lr= 1.220703125e-05 	 loss= 1.4919778292417527 	 error= 56.85399884462357 percent
error rate on test set = 57.51999883651734 percent
 
epoch= 54 	 time= 1010.1123695691426 min 	 lr= 1.220703125e-05 	 loss= 1.491373980140686 	 error= 56.96399882555008 percent
error rate on test set = 57.53999888896942 percent
 
epoch= 55 	 time= 1028.886374191443 min 	 lr= 1.220703125e-05 	 l

KeyboardInterrupt: 