In [1]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
import config
import numpy as np
from models import Convsfig as Fully_convsfig
from utils import *
from matplotlib import pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import CIFAR10
from torchsummary import summary
from tqdm import notebook
tqdm = notebook.tqdm

CIFAR10 Dataset and Dataloader

In [2]:
# CIFAR-10 train/test splits rooted at the "CIFAR10" directory, both using the
# `tranform` pipeline that is defined outside this cell.
# download=True may be switched to False once the dataset is already on disk.
train_dataset = CIFAR10(
    "CIFAR10",
    train=True,
    transform=tranform,
    download=True,
)
test_dataset = CIFAR10(
    "CIFAR10",
    train=False,
    transform=tranform,
    download=True,
)

# Both loaders read batch size, shuffle flag and worker count from `config`.
loader = DataLoader(
    train_dataset,
    batch_size=config.BATCH_SIZE,
    pin_memory=True,
    shuffle=config.SHUFFLE,
    num_workers=config.NUM_WORKERS,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=config.BATCH_SIZE,
    pin_memory=True,
    shuffle=config.SHUFFLE,
    num_workers=config.NUM_WORKERS,
)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to CIFAR10\cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting CIFAR10\cifar-10-python.tar.gz to CIFAR10
Files already downloaded and verified


To explore how different hyperparameters affect a model, we define a simple convolutional neural network below.
It has three hidden layers (two convolutional layers followed by a fully connected layer at the end).
Each conv layer is followed by an activation function and a pooling layer.
Batch normalization can be added, and the activation function switched, through hyperparameters.

In the upcoming sections, we will train this model with a range of different hyperparameters and then compare the differences in performance.

The torchsummary library appears to be buggy for our model: it displays and counts the parameters of some layers twice.

In [3]:
# Layer-spec convention used to build the model from a config list:
#   list  [f, k, s, p] -> convolution block
#   tuple (k, s, p)    -> pooling layer
#   int                -> channels of a Linear (fully connected) layer
#   str                -> single flatten or global average pooling
layers = [
    [64, 9, 1, 4],   # conv block: 64 output channels, kernel size 9, stride 1, padding 4
    (2, 2, 0),       # pooling layer: kernel size 2, stride 2
    [128, 9, 1, 4],  # conv block: 128 output channels, kernel size 9, stride 1, padding 4
    (2, 2, 0),       # pooling layer: kernel size 2, stride 2
    10,              # fully connected layer with 10 outputs
]
model = Fully_convsfig(configs=layers)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))

# K-fold validation on the training dataset.
# `acc` is averaged with sum/len below, so it is presumably one accuracy per
# fold -- confirm against KFold_validation.
# NOTE(review): the name `time` would shadow a `time` module if one is in scope.
acc, time = KFold_validation(
    model,
    train_dataset,
    shuffle=False,
    val_epochs=5,
    split=config.KFOLD_SPLIT,
    lr=config.LEARNING_RATE,
)
# Despite its name, `conv_acc_list` holds the average of `acc`, not a list.
conv_acc_list = sum(acc) / len(acc)
print(f"Cross_val_score:{conv_acc_list}   time:{time}")


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]          15,616
            Conv2d-2           [-1, 64, 32, 32]          15,616
              ReLU-3           [-1, 64, 32, 32]               0
              ReLU-4           [-1, 64, 32, 32]               0
         ConvBlock-5           [-1, 64, 32, 32]               0
         MaxPool2d-6           [-1, 64, 16, 16]               0
            Conv2d-7          [-1, 128, 16, 16]         663,680
            Conv2d-8          [-1, 128, 16, 16]         663,680
              ReLU-9          [-1, 128, 16, 16]               0
             ReLU-10          [-1, 128, 16, 16]               0
        ConvBlock-11          [-1, 128, 16, 16]               0
        MaxPool2d-12            [-1, 128, 8, 8]               0
          Flatten-13                 [-1, 8192]               0
           Linear-14                   

Testing:[1]: 100%|██████████████████████████████████████████| 40/40 [00:03<00:00, 11.65it/s, accuracy=0.667, loss=1.21]
                                                                                                                       

KeyboardInterrupt: 

In [None]:
# Baseline architecture, reused by the experiment cells that follow.
layers = [
    [64, 9, 1, 4],
    (2, 2, 0),
    [128, 9, 1, 4],
    (2, 2, 0),
    10,
]

Loss and accuracy curve of baseline model

In [None]:

# Baseline run: a fresh model built from `layers` with no extra options,
# passed to draw_loss_acc_curve under the label "base_model".
model = Fully_convsfig(configs=layers)
model = model.to(config.DEVICE)
draw_loss_acc_curve(model, loader, test_loader, "base_model")

learning rate 1e-2

In [None]:
# Learning-rate experiment: fresh baseline model, run with lr=1e-2.
model = Fully_convsfig(configs=layers)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(
    model,
    loader,
    test_loader,
    "lr_1e-2",
    lr=1e-2,
)

learning rate 1e-6

In [None]:
# Learning-rate experiment: fresh baseline model, run with lr=1e-6.
model = Fully_convsfig(configs=layers)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(
    model,
    loader,
    test_loader,
    "lr_1e-6",
    lr=1e-6,
)

Learning rate decay


In [None]:
# Learning-rate decay experiment.
# `milestone` and `gamma` are handed to draw_loss_acc_curve together with
# lr_decay=True; presumably the rate is scaled by `gamma` at each milestone
# epoch -- confirm in draw_loss_acc_curve.
# Note that every milestone falls within the first 24 of the 200 epochs.
milestone = [15, 18, 20, 22, 24]
init_lr = 0.1
gamma = 0.1

model = Fully_convsfig(configs=layers)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(
    model,
    loader,
    test_loader,
    "lr_decay",
    lr=init_lr,
    lr_decay=True,
    epochs=200,
    milestone=milestone,
    gamma=gamma,
)


BatchNorm with batch size 16

In [None]:
# BatchNorm experiment with a dedicated training loader of batch size 16.
# The test loader is the shared one, so only the training batch size changes.
model = Fully_convsfig(configs=layers, use_bn=True)
model = model.to(config.DEVICE)
batch16_loader = DataLoader(
    train_dataset,
    batch_size=16,
    pin_memory=True,
    shuffle=config.SHUFFLE,
    num_workers=config.NUM_WORKERS,
)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, batch16_loader, test_loader, "batchnorm_16")

BatchNorm with batch size 64

In [None]:
# BatchNorm experiment with a dedicated training loader of batch size 64.
# The loader gets its own name (`batch64_loader`) so it no longer overwrites,
# or is mistaken for, the batch-size-16 loader of the previous experiment.
model = Fully_convsfig(configs=layers, use_bn=True).to(config.DEVICE)
batch64_loader = DataLoader(
    train_dataset,
    batch_size=64,
    pin_memory=True,
    shuffle=config.SHUFFLE,
    num_workers=config.NUM_WORKERS,
)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, batch64_loader, test_loader, "batchnorm_64")

Batch Norm with batch size 256

In [None]:
# BatchNorm experiment on the shared training loader.
# NOTE(review): the "256" in the label is only accurate if config.BATCH_SIZE
# is 256, because `loader` takes its batch size from config -- confirm.
model = Fully_convsfig(configs=layers, use_bn=True)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, loader, test_loader, "batchnorm_256")

Avg pooling

In [None]:
# Pooling experiment: same layer spec, but the model is built with pooling="avg".
model = Fully_convsfig(configs=layers, pooling="avg")
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, loader, test_loader, "avg_pooling")

He initialization

In [None]:
# Initialization experiment: the model is built with init="He".
model = Fully_convsfig(configs=layers, init="He")
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, loader, test_loader, "He_init")

Xavier initialization

In [None]:
# Initialization experiment: the model is built with init="Xavier".
model = Fully_convsfig(configs=layers, init="Xavier")
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, loader, test_loader, "Xavier_init")

Xavier with TanH activation

In [None]:
# Combined experiment: init="Xavier" together with nn.Tanh() as the activation.
model = Fully_convsfig(
    configs=layers,
    init="Xavier",
    activation=nn.Tanh(),
)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, loader, test_loader, "Xavier_TanH_init")

TanH activation

In [None]:
# Activation experiment: nn.Tanh() is passed as the activation; all other
# constructor options are left at their defaults.
model = Fully_convsfig(configs=layers, activation=nn.Tanh())
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, loader, test_loader, "TanH")

without activation

In [None]:
# Activation experiment: nn.Identity() (a pass-through module) is supplied as
# the activation argument in place of a non-linearity.
model = Fully_convsfig(configs=layers, activation=nn.Identity())
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, loader, test_loader, "without_act")


Drop out rate 0.2

In [None]:
# Dropout experiment: the model is built with dropout=0.2.
model = Fully_convsfig(configs=layers, dropout=0.2)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, loader, test_loader, "drop_out_0.2")

Drop out rate 0.7

In [None]:
# Dropout experiment: the model is built with dropout=0.7.
model = Fully_convsfig(configs=layers, dropout=0.7)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(model, loader, test_loader, "drop_out_0.7")

L2 regularization

In [None]:
# L2-regularization experiment: baseline model, weight_decay=1e-4 passed to
# draw_loss_acc_curve (presumably forwarded to the optimizer -- confirm).
model = Fully_convsfig(configs=layers)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(
    model,
    loader,
    test_loader,
    "weight_decay_1e-4",
    weight_decay=1e-4,
)

In [None]:
# L2-regularization experiment: baseline model, weight_decay=1e-3 passed to
# draw_loss_acc_curve (presumably forwarded to the optimizer -- confirm).
model = Fully_convsfig(configs=layers)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(
    model,
    loader,
    test_loader,
    "weight_decay_1e-3",
    weight_decay=1e-3,
)

In [None]:
# L2-regularization experiment: baseline model, weight_decay=1e-2 passed to
# draw_loss_acc_curve (presumably forwarded to the optimizer -- confirm).
model = Fully_convsfig(configs=layers)
model = model.to(config.DEVICE)
summary(model, (3, 32, 32))
draw_loss_acc_curve(
    model,
    loader,
    test_loader,
    "weight_decay_1e-2",
    weight_decay=1e-2,
)

Mixup training

In [None]:
# Mixup experiment on the baseline architecture, trained for 150 epochs with
# Adam at config.LEARNING_RATE. Per-epoch curves are collected in four lists
# and saved as a single tuple at the end.
layers = [[64, 9, 1, 4], (2, 2, 0), [128, 9, 1, 4], (2, 2, 0), 10]
model = Fully_convsfig(configs=layers).to(config.DEVICE)

# Create the output directory before training: torch.save does not create
# missing parent folders, and failing only after 150 epochs would lose the run.
os.makedirs("result", exist_ok=True)

train_loss = []
test_loss = []
train_acc = []
test_acc = []
optimizer = optim.Adam(model.parameters(), lr=config.LEARNING_RATE)
for epoch in range(150):
    # One epoch of mixup training; the returned loss goes into the training curve.
    # NOTE(review): this trains with non_linearity="sigmoid" while the results
    # below are saved under a "softmax" file name -- confirm which is intended.
    loss = mix_up_training(model, optimizer, loader, epoch, non_linearity="sigmoid")
    train_loss.append(loss)
    # Training accuracy (the loss returned by test() is discarded here).
    acc, _ = test(model, loader, epoch)
    train_acc.append(acc)
    # Test accuracy and loss.
    acc, loss = test(model, test_loader, epoch)
    test_loss.append(loss)
    test_acc.append(acc)
torch.save((train_loss, test_loss, train_acc, test_acc), "result/convs_curve_mix_up_softmax.tar")

expected optimal training

In [None]:
# "Expected optimal" run: baseline architecture with BatchNorm, mixup training
# on the augmented training set, 150 epochs with Adam at config.LEARNING_RATE.
layers = [[64, 9, 1, 4], (2, 2, 0), [128, 9, 1, 4], (2, 2, 0), 10]
model = Fully_convsfig(configs=layers, use_bn=True).to(config.DEVICE)

# Training split using the `tranform_aug` pipeline (defined outside this cell).
mix_up_train_dataset = CIFAR10("CIFAR10", train=True, transform=tranform_aug)

# Fix: the loader must wrap the augmented dataset. It previously wrapped the
# plain `train_dataset`, which left `mix_up_train_dataset` unused even though
# the results are saved under an "_aug" file name.
mixup_loader = DataLoader(
    mix_up_train_dataset,
    batch_size=256,
    pin_memory=True,
    shuffle=config.SHUFFLE,
    num_workers=config.NUM_WORKERS,
)

# Create the output directory before training: torch.save does not create
# missing parent folders, and failing only after 150 epochs would lose the run.
os.makedirs("result", exist_ok=True)

train_loss = []
test_loss = []
train_acc = []
test_acc = []
optimizer = optim.Adam(model.parameters(), lr=config.LEARNING_RATE)
for epoch in range(150):
    # One epoch of mixup training on the augmented loader.
    loss = mix_up_training(model, optimizer, mixup_loader, epoch, non_linearity="softmax")
    train_loss.append(loss)
    # Training accuracy, measured on the shared (non-mixup) training loader.
    acc, _ = test(model, loader, epoch)
    train_acc.append(acc)
    # Test accuracy and loss.
    acc, loss = test(model, test_loader, epoch)
    test_loss.append(loss)
    test_acc.append(acc)
torch.save((train_loss, test_loss, train_acc, test_acc), "result/convs_curve_mix_up_softmax_aug.tar")