In [1]:
## Standard libraries
import os
import numpy as np 
import random
import time
import datetime
from PIL import Image
import collections
from types import SimpleNamespace

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline 
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.reset_orig()

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
# Torchvision
import torchvision
from torchvision.datasets import CIFAR10
from torchvision import transforms

## Progress bar
from tqdm.notebook import tqdm

## Certificate error fix
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

## Remove Warnings
# import warnings
# warnings.filterwarnings('ignore')

print("Using torch", torch.__version__)

  set_matplotlib_formats('svg', 'pdf') # For export


Using torch 1.11.0


In [2]:
torch.manual_seed(42) # Setting the seed

<torch._C.Generator at 0x257006d5ab0>

In [3]:
#Check for cuda GPU to run on:
gpu_avail = torch.cuda.is_available()
print(f"Is the GPU available? {gpu_avail}")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)
if gpu_avail:
    print("Device name: " + torch.cuda.get_device_name(0))

Is the GPU available? True
Device: cuda
Device name: NVIDIA GeForce GTX 1080 Ti


In [4]:
# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
DATASET_PATH = "../data"

# Function for setting the seed
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
set_seed(42)

if torch.cuda.is_available():
    torch.backends.cudnn.determinstic = True
    torch.backends.cudnn.benchmark = False

In [5]:
#Load dataset, calculate mean and std.dev
train_dataset = CIFAR10(root=DATASET_PATH, train=True, download=True)
DATA_MEANS = (train_dataset.data / 255.0).mean(axis=(0,1,2))
DATA_STD = (train_dataset.data / 255.0).std(axis=(0,1,2))
print("Data mean", DATA_MEANS)
print("Data std", DATA_STD)

Files already downloaded and verified
Data mean [0.49139968 0.48215841 0.44653091]
Data std [0.24703223 0.24348513 0.26158784]


In [6]:
test_transform = transforms.Compose([transforms.ToTensor(),
                                     transforms.Normalize(DATA_MEANS, DATA_STD)
                                     ])
# For training, we add some augmentation. Networks are too powerful and would overfit.
train_transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                      transforms.RandomResizedCrop((32,32),scale=(0.8,1.0),ratio=(0.9,1.1)),
                                      transforms.ToTensor(),
                                      transforms.Normalize(DATA_MEANS, DATA_STD)
                                     ])
# Loading the training dataset. We need to split it into a training and validation part
# We need to do a little trick because the validation set should not use the augmentation.
train_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=train_transform, download=True)
val_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=test_transform, download=True)
set_seed(42)
train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000])
set_seed(42)
_, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000])

# We define a set of data loaders that we can use for various purposes later.
train_loader = data.DataLoader(train_set, batch_size=128, shuffle=True, drop_last=True, pin_memory=True, num_workers=4)
val_loader = data.DataLoader(val_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
#QUESTION 1, PART A
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(8 * 8 * 8, 32)
        self.fc2 = nn.Linear(32, 10)

    def forward(self, x):
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = F.max_pool2d(torch.relu(self.conv2(out)), 2)
        out = out.view(-1, 8 * 8 * 8)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out

In [8]:
#Training and validation functions for first model:
def training_loop(n_epochs, optimizer, model, loss_fn, train_loader):
    for epoch in tqdm(range(1, n_epochs + 1)):
        loss_train = 0.0
        for imgs, labels in train_loader:
            
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_train += loss.item()

        if epoch == 1 or epoch % 10 == 0:
            print('{} Epoch {}, Training loss {}'.format(
                datetime.datetime.now(), epoch,
                loss_train / len(train_loader)))
            
def validate(model, train_loader, val_loader):
    for name, loader in [("train", train_loader), ("val", val_loader)]:
        correct = 0
        total = 0

        with torch.no_grad():  # <1>
            for imgs, labels in loader:
                imgs = imgs.to(device=device)
                labels = labels.to(device=device)
                outputs = model(imgs)
                _, predicted = torch.max(outputs, dim=1) # <2>
                total += labels.shape[0]  # <3>
                correct += int((predicted == labels).sum())  # <4>

        print("Accuracy {}: {:.2f}".format(name , correct / total))

In [9]:
model = Net()

numel_list = [p.numel() for p in model.parameters()]
print(sum(numel_list), numel_list)

model = Net().to(device=device)
optimizer = optim.SGD(model.parameters(), lr=1e-2) 
loss_fn = nn.CrossEntropyLoss()  

training_loop(  
    n_epochs = 300,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader,
)

validate(model, train_loader, val_loader)

18354 [432, 16, 1152, 8, 16384, 32, 320, 10]


  0%|          | 0/300 [00:00<?, ?it/s]

2022-03-27 20:08:02.492844 Epoch 1, Training loss 2.273495835796041
2022-03-27 20:09:21.553304 Epoch 10, Training loss 1.4327392109438903
2022-03-27 20:10:57.437005 Epoch 20, Training loss 1.2443940481569014
2022-03-27 20:12:25.269210 Epoch 30, Training loss 1.1418811086236242
2022-03-27 20:14:08.126310 Epoch 40, Training loss 1.0795786954738475
2022-03-27 20:15:36.371722 Epoch 50, Training loss 1.0375436800837177
2022-03-27 20:17:04.616070 Epoch 60, Training loss 1.008249326818689
2022-03-27 20:18:32.505548 Epoch 70, Training loss 0.9804658738636224
2022-03-27 20:20:00.270168 Epoch 80, Training loss 0.9613154876945366
2022-03-27 20:21:28.311250 Epoch 90, Training loss 0.9421492663883416
2022-03-27 20:22:57.390036 Epoch 100, Training loss 0.9301863941032322
2022-03-27 20:24:26.988756 Epoch 110, Training loss 0.9155107347713916
2022-03-27 20:25:56.958539 Epoch 120, Training loss 0.9059064498654118
2022-03-27 20:27:26.833892 Epoch 130, Training loss 0.8981608733152732
2022-03-27 20:28:59

In [10]:
#QUESTION 1, PART B
class NetB(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(8, 4, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(4 * 4 * 4, 32)
        self.fc2 = nn.Linear(32, 10)

    def forward(self, x):
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = F.max_pool2d(torch.relu(self.conv2(out)), 2)
        out = F.max_pool2d(torch.relu(self.conv3(out)), 2)
        out = out.view(-1, 4 * 4 * 4)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out


In [11]:
model = NetB()

numel_list = [p.numel() for p in model.parameters()]
print(sum(numel_list), numel_list)

model = NetB().to(device=device)
optimizer = optim.SGD(model.parameters(), lr=1e-2) 
loss_fn = nn.CrossEntropyLoss()  

training_loop(  
    n_epochs = 300,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader,
)

validate(model, train_loader, val_loader)

4310 [432, 16, 1152, 8, 288, 4, 2048, 32, 320, 10]


  0%|          | 0/300 [00:00<?, ?it/s]

2022-03-27 20:53:48.006275 Epoch 1, Training loss 2.3039934051002873
2022-03-27 20:55:10.535255 Epoch 10, Training loss 1.8366410586229416
2022-03-27 20:56:41.824127 Epoch 20, Training loss 1.451465807409368
2022-03-27 20:58:13.370967 Epoch 30, Training loss 1.3455406036811677
2022-03-27 20:59:45.430389 Epoch 40, Training loss 1.2813283663529615
2022-03-27 21:01:18.358913 Epoch 50, Training loss 1.2366040116361743
2022-03-27 21:02:50.134393 Epoch 60, Training loss 1.2026796055655193
2022-03-27 21:04:22.033217 Epoch 70, Training loss 1.1744809794290114
2022-03-27 21:05:53.329474 Epoch 80, Training loss 1.154808791614326
2022-03-27 21:07:25.575955 Epoch 90, Training loss 1.1357732281046375
2022-03-27 21:08:56.341949 Epoch 100, Training loss 1.118267748770211
2022-03-27 21:10:27.897075 Epoch 110, Training loss 1.0911153993035994
2022-03-27 21:11:59.766669 Epoch 120, Training loss 1.0807854803199441
2022-03-27 21:13:31.493823 Epoch 130, Training loss 1.0669893011068687
2022-03-27 21:15:03.

In [12]:
#QUESTION 2, PART A
class ResBlock(nn.Module):
    def __init__(self, n_chans):
        super(ResBlock, self).__init__()
        self.conv = nn.Conv2d(n_chans, n_chans, kernel_size=3, padding=1, bias=False)
        #torch.nn.init.kaiming_normal_(self.conv.weight, nonlinearity='relu')

    def forward(self, x):
        out = self.conv(x)
        out = torch.relu(out)
        return out + x #Skip connection

class ResNet10(nn.Module):
    def __init__(self, n_chans1=32, n_blocks=10):
        super().__init__()
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.resblocks = nn.Sequential(
            *(n_blocks * [ResBlock(n_chans=n_chans1)]))
        self.fc1 = nn.Linear(8 * 8 * n_chans1, 32)
        self.fc2 = nn.Linear(32, 10)
        
    def forward(self, x):
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = self.resblocks(out)
        out = F.max_pool2d(out, 2)
        out = out.view(-1, 8 * 8 * self.n_chans1)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out
    

In [13]:
model = ResNet10()

numel_list = [p.numel() for p in model.parameters()]
print(sum(numel_list), numel_list)

model = ResNet10().to(device=device)
optimizer = optim.SGD(model.parameters(), lr=3e-3)
loss_fn = nn.CrossEntropyLoss()

training_loop(
    n_epochs = 300,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader,
)

validate(model, train_loader, val_loader)

76010 [864, 32, 9216, 65536, 32, 320, 10]


  0%|          | 0/300 [00:00<?, ?it/s]

2022-03-27 21:40:02.401241 Epoch 1, Training loss 1.9797430626007906
2022-03-27 21:41:38.896724 Epoch 10, Training loss 1.1677336658847297
2022-03-27 21:43:23.349446 Epoch 20, Training loss 0.9765184446957037
2022-03-27 21:45:05.453244 Epoch 30, Training loss 0.8863915761991105
2022-03-27 21:46:49.266637 Epoch 40, Training loss 0.81784396259873
2022-03-27 21:48:34.407654 Epoch 50, Training loss 0.7695781147038495
2022-03-27 21:50:17.148260 Epoch 60, Training loss 0.7313196742296898
2022-03-27 21:52:00.740259 Epoch 70, Training loss 0.7026950527629962
2022-03-27 21:53:44.930398 Epoch 80, Training loss 0.6767093193666888
2022-03-27 21:55:29.906883 Epoch 90, Training loss 0.6593405956726128
2022-03-27 21:57:13.548985 Epoch 100, Training loss 0.6365604016006502
2022-03-27 21:58:55.465481 Epoch 110, Training loss 0.6165899555397849
2022-03-27 22:00:37.769722 Epoch 120, Training loss 0.6061041385699542
2022-03-27 22:02:19.843832 Epoch 130, Training loss 0.5881980646539617
2022-03-27 22:04:01

In [14]:
#QUESTION 2, PART B
    
#Weight Decay training loop:
def training_loop_l2reg(n_epochs, optimizer, model, loss_fn, train_loader):
    for epoch in tqdm(range(1, n_epochs + 1)):
        loss_train = 0.0
        for imgs, labels in train_loader:
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)

            l2_lambda = 0.001
            l2_norm = sum(p.pow(2.0).sum()
                          for p in model.parameters())  # <1>
            loss = loss + l2_lambda * l2_norm

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_train += loss.item()
        if epoch == 1 or epoch % 10 == 0:
            print('{} Epoch {}, Training loss {}'.format(
                datetime.datetime.now(), epoch,
                loss_train / len(train_loader)))
    
#Dropout versions:
class ResBlock_DO(nn.Module):
    def __init__(self, n_chans, p):
        super(ResBlock_DO, self).__init__()
        self.conv = nn.Conv2d(n_chans, n_chans, kernel_size=3,
                              padding=1, bias=False)
        self.dropout = nn.Dropout2d(p = p)
        
    def forward(self, x):
        out = self.conv(x)
        out = self.dropout(out)
        out = torch.relu(out)
        return out + x

class ResNet10_DO(nn.Module):
    def __init__(self, n_chans1=32, n_blocks=10, p=0.3):
        super().__init__()
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.resblocks = nn.Sequential(
            *(n_blocks * [ResBlock_DO(n_chans=n_chans1, p=p)]))
        self.fc1 = nn.Linear(8 * 8 * n_chans1, 32)
        self.fc2 = nn.Linear(32, 10)
        
    def forward(self, x):
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = self.resblocks(out)
        out = F.max_pool2d(out, 2)
        out = out.view(-1, 8 * 8 * self.n_chans1)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out

#Batch Normalization versions:
class ResBlock_BN(nn.Module):
    def __init__(self, n_chans):
        super(ResBlock_BN, self).__init__()
        self.conv = nn.Conv2d(n_chans, n_chans, kernel_size=3,
                              padding=1, bias=False)
        self.batch_norm = nn.BatchNorm2d(num_features=n_chans)
        torch.nn.init.kaiming_normal_(self.conv.weight,
                                      nonlinearity='relu')
        torch.nn.init.constant_(self.batch_norm.weight, 0.5)
        torch.nn.init.zeros_(self.batch_norm.bias)

    def forward(self, x):
        out = self.conv(x)
        out = self.batch_norm(out)
        out = torch.relu(out)
        return out + x

class ResNet10_BN(nn.Module):
    def __init__(self, n_chans1=32, n_blocks=10):
        super().__init__()
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.resblocks = nn.Sequential(
            *(n_blocks * [ResBlock_BN(n_chans=n_chans1)]))
        self.fc1 = nn.Linear(8 * 8 * n_chans1, 32)
        self.fc2 = nn.Linear(32, 10)
        
    def forward(self, x):
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = self.resblocks(out)
        out = F.max_pool2d(out, 2)
        out = out.view(-1, 8 * 8 * self.n_chans1)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out


In [15]:
#Weight Decay Model Results:
model = ResNet10()

numel_list = [p.numel() for p in model.parameters()]
print(sum(numel_list), numel_list)

model = ResNet10().to(device=device)
optimizer = optim.SGD(model.parameters(), lr=3e-3)
loss_fn = nn.CrossEntropyLoss()

training_loop_l2reg(
    n_epochs = 300,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader,
)

validate(model, train_loader, val_loader)

76010 [864, 32, 9216, 65536, 32, 320, 10]


  0%|          | 0/300 [00:00<?, ?it/s]

2022-03-27 22:32:04.045584 Epoch 1, Training loss 2.06732132631829
2022-03-27 22:33:40.403937 Epoch 10, Training loss 1.1950943663588955
2022-03-27 22:35:26.586354 Epoch 20, Training loss 1.0073482309311543
2022-03-27 22:37:12.937124 Epoch 30, Training loss 0.9065220274476924
2022-03-27 22:38:57.815567 Epoch 40, Training loss 0.8512656824541228
2022-03-27 22:40:43.903413 Epoch 50, Training loss 0.7997998055229839
2022-03-27 22:42:29.953127 Epoch 60, Training loss 0.764399053194584
2022-03-27 22:44:16.623501 Epoch 70, Training loss 0.7323032103033147
2022-03-27 22:46:02.721110 Epoch 80, Training loss 0.7134685900136616
2022-03-27 22:47:48.535444 Epoch 90, Training loss 0.6885397566689385
2022-03-27 22:49:34.095258 Epoch 100, Training loss 0.6665314722944189
2022-03-27 22:51:19.315286 Epoch 110, Training loss 0.65073260469654
2022-03-27 22:53:04.665270 Epoch 120, Training loss 0.6374655513681917
2022-03-27 22:54:51.537534 Epoch 130, Training loss 0.6256482385502242
2022-03-27 22:56:37.08

In [16]:
#Dropout Model Results:
model = ResNet10_DO()

numel_list = [p.numel() for p in model.parameters()]
print(sum(numel_list), numel_list)

model = ResNet10_DO().to(device=device)
optimizer = optim.SGD(model.parameters(), lr=3e-3)
loss_fn = nn.CrossEntropyLoss()

training_loop(
    n_epochs = 300,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader,
)

validate(model, train_loader, val_loader)

76010 [864, 32, 9216, 65536, 32, 320, 10]


  0%|          | 0/300 [00:00<?, ?it/s]

2022-03-27 23:25:45.999684 Epoch 1, Training loss 2.0476226314180597
2022-03-27 23:27:21.213727 Epoch 10, Training loss 1.273833913341207
2022-03-27 23:29:06.743294 Epoch 20, Training loss 1.0910009475175473
2022-03-27 23:30:52.702578 Epoch 30, Training loss 0.9887161056200663
2022-03-27 23:32:37.824351 Epoch 40, Training loss 0.9203331108786102
2022-03-27 23:34:23.391019 Epoch 50, Training loss 0.8666891058286031
2022-03-27 23:36:09.447284 Epoch 60, Training loss 0.8292987061361982
2022-03-27 23:37:54.604488 Epoch 70, Training loss 0.79926557435609
2022-03-27 23:39:39.923635 Epoch 80, Training loss 0.7705867932393
2022-03-27 23:41:25.445445 Epoch 90, Training loss 0.7494173216344284
2022-03-27 23:43:10.033194 Epoch 100, Training loss 0.7276362019386726
2022-03-27 23:44:55.944332 Epoch 110, Training loss 0.7110369973033243
2022-03-27 23:46:41.120827 Epoch 120, Training loss 0.6946421992405187
2022-03-27 23:48:26.677799 Epoch 130, Training loss 0.6807130771314996
2022-03-27 23:50:12.199

In [17]:
#Batch Norm Model Results:
model = ResNet10_BN()

numel_list = [p.numel() for p in model.parameters()]
print(sum(numel_list), numel_list)

model = ResNet10_BN().to(device=device)
optimizer = optim.SGD(model.parameters(), lr=3e-3)
loss_fn = nn.CrossEntropyLoss()

training_loop(
    n_epochs = 300,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader,
)

validate(model, train_loader, val_loader)

76074 [864, 32, 9216, 32, 32, 65536, 32, 320, 10]


  0%|          | 0/300 [00:00<?, ?it/s]

2022-03-28 00:18:47.003608 Epoch 1, Training loss 1.7722551466053367
2022-03-28 00:20:23.385047 Epoch 10, Training loss 1.152346660268952
2022-03-28 00:22:10.889077 Epoch 20, Training loss 1.0021720083690437
2022-03-28 00:23:58.638209 Epoch 30, Training loss 0.913266035396489
2022-03-28 00:25:46.022947 Epoch 40, Training loss 0.8590302934334149
2022-03-28 00:27:34.365174 Epoch 50, Training loss 0.8096600222451734
2022-03-28 00:29:22.343516 Epoch 60, Training loss 0.7688918594281557
2022-03-28 00:31:10.340008 Epoch 70, Training loss 0.7381861104584827
2022-03-28 00:32:59.179251 Epoch 80, Training loss 0.7125981613102123
2022-03-28 00:34:47.713639 Epoch 90, Training loss 0.6877301449110026
2022-03-28 00:36:36.442226 Epoch 100, Training loss 0.6673962031504368
2022-03-28 00:38:25.406701 Epoch 110, Training loss 0.6520670902864886
2022-03-28 00:40:13.321896 Epoch 120, Training loss 0.6382100237400783
2022-03-28 00:42:01.582939 Epoch 130, Training loss 0.6166405405244256
2022-03-28 00:43:50