In [41]:
# Download CIFAR 10 dataset for training and validation purposes and apply the following changes on each image:
# 1) make it a tensor
# 2) normalize it based on the mean and standard deviation among all pixels in each channel (RGB).
# Print the size of training and validation datasets

from torchvision import datasets
from torchvision import transforms
import torch

# default dataset needed for normalization:
dataset_train = datasets.CIFAR10('.', train=True, download=True, transform=transforms.ToTensor())

# calculation values for normalization:
imgs = torch.stack([img for (img, label) in dataset_train], dim = 3)
mean_deviation = imgs.view(3, -1).mean(dim=1)
standard_deviation = imgs.view(3, -1).std(dim=1)

# normalized datasets:
dataset_train_trans = datasets.CIFAR10('.', train=True, download=True, transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean_deviation.numpy(), standard_deviation.numpy())
]))
dataset_val_trans = datasets.CIFAR10('.', train=False, download=True, transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean_deviation.numpy(), standard_deviation.numpy())
]))

# print lengths:
len_dataset_train_trans = len(dataset_train_trans)
len_dataset_val_trans = len(dataset_val_trans)

print(len_dataset_train_trans, len_dataset_val_trans)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
50000 10000


In [42]:
# We want to make a tertiary classifier that distinguishes between deers, dogs, and horses, labeled as 4, 5, and 7, resp.
# Create the subset training and validation datasets for this purpose.
# Print the size of these datasets.

label_map = {4:0, 5:1, 7:2}
class_names = ["deers", "dogs", "horses"]

dataset_train_filtered  =   [(img, label_map[label]) for img, label in dataset_train_trans if label in [4,5,7]]
dataset_val_filtered    =   [(img, label_map[label]) for img, label in dataset_val_trans if label in [4,5,7]]

# print lengths:
len_dataset_train = len(dataset_train_filtered)
len_dataset_val = len(dataset_val_filtered)

print(len_dataset_train, len_dataset_val)

15000 3000


In [43]:
#Create a parameterized CNN with the following details. 
# The parameter is the number of output channels n after the first convolution.
# All kernels are of size 3 by 3.
# Convolutions must not change the height and width.
# Each convolution is followed by hyperbolic tangent as the activation function, and max pooling of size 2 by 2.
# Convolution ayers:
# 1) First convolution layer works on the input RGB input. Let's assume there are n kernels in this layer.
# 2) Second convolution layer works on the result of the preceding max pooling layer. 
#    Let's assume there are n/2 kernels in this layer.
# 3) Third convolution layer works on the result of the preceding max pooling layer. 
#    Let's assume there are n/2 kernels in this layer. 
# Fully connected layers:
# 1) First fully connected layer works on the result of the preceding max pooling layer. 
#    This layer is followed by hyperbolic tangent as its activation function.
# 2) Second fully connected layer works on the result of the preceding activation function, and emits numbers associated
#    with each class.
# We will use negative log likelihood to compute the loss. So you may add additional layer(s) to your network.
# Note: Since the network is parameterized (n), you'd rather define the CNN as a subclass of nn.Module.

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, n = 32):
        super().__init__()
        self.n = n
        
        self.conv1 = nn.Conv2d(3, self.n, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(self.n, (self.n//2), kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d((self.n//2), (self.n//2), kernel_size=3, padding=1)

        self.fc1 = nn.Linear((self.n//2)*4*4, 32)
        self.fc2 = nn.Linear(32, 3)
        
        self.lsftmx = nn.LogSoftmax(dim=1)

    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        out = F.max_pool2d(torch.tanh(self.conv3(out)), 2)

        out = out.view(-1, (self.n//2)*4*4)
        out = torch.tanh(self.fc1(out))
        out = self.lsftmx(self.fc2(out))
        return out

In [44]:
# Create two networks as instances of the CNN you defined above, with n = 16 and n = 32 respectively. 
# Print the total number of parameters in each of these instances.

model_16 = Net(16)
model_32 = Net(32)

sum([param.numel() for param in model_16.parameters()]), sum([param.numel() for param in model_32.parameters()])

(6419, 16163)

In [45]:
# Our training functionality is supposed to compute gradient on batches of training data, randlomy selected each time.
# To this end, create a training data loader with batch size 32 that randomizes access to each batch.
# Also, create a validation data loader with the same batch size that does not randomize access to each batch (no need!)
# Print the number of batches in training and validation data loaders

train_loader = torch.utils.data.DataLoader(dataset_train_filtered, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset_val_filtered, batch_size=32, shuffle=False)

print(len(train_loader), len(val_loader))

469 94


In [46]:
#Define your training function that receives the training loader, model, loss function, optimizer, the device (cpu/gpu), and 
# number of epochs.
#In each epoch, you should go through each training data batch, and:
# 1) move data to device
# 1) compute the output batch, and accordingly the loss
# 2) compute the gradient of loss wrt parameters, and update the parameters
#After covering all epochs, your training function must report the training accuracy

def training_loop(n_epochs, model, loss_fn, optimizer, train_loader, device):
    for epoch in range(1, n_epochs + 1):
        for (imgs, labels) in train_loader:
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)

            outs = model(imgs)
            loss = loss_fn(outs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % 10 == 0:
            print("epoch=%d loss=%f" %(epoch, loss)) # loss of last batch

    total = 0
    correct = 0
    for imgs, labels in train_loader:
        imgs = imgs.to(device=device)
        labels = labels.to(device=device)

        outs = model(imgs)
        max_vals, max_classes = outs.max(dim=1)

        total += imgs.shape[0]
        correct += (max_classes == labels).sum()

    print("training accuracy =%f" %(correct/total))

In [47]:
#Define a separate function that receives the validation data loader as well as the model and computes the validation 
# accuracy of the model.

def validate(model, loader, device):
    total = 0
    correct = 0
    for (imgs, lbls) in loader:
        imgs = imgs.to(device=device)
        lbls = lbls.to(device=device)

        outs = model(imgs)
        max_vals, max_indexes = outs.max(dim=1)
        
        correct += ((max_indexes == lbls).sum())
        total += imgs.shape[0]
    print("accuracy %f" %(correct/total))

In [56]:
#Define device dynamically based on whether CUDA is available or not.
#Call the training function on the created training data loader, the created CNN  with n = 16, 
# negative log likelihood loss function, stochastic gradient descent optimizer,
# the device you defined, and 100 epochs. Next, call validation accuracy function.
#Is the model overfit? (Yes/No) Why?

import torch.optim as optim

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_16 = Net(16).to(device=device)

training_loop(
    n_epochs=100,
    model=model_16,
    train_loader=train_loader,
    loss_fn=nn.NLLLoss(),
    optimizer=optim.SGD(model_16.parameters(), lr=0.01),
    device=device
)

validate(model_16, val_loader, device=device)

# Answer:
#
# No. Considering the obtained model accuracy data, we can conclude that the model is not overfitted.
# Accuracy of 89% for training data and 80% for validation data indicates that the model does more than just memorize data and can handle unknown data. 

epoch=10 loss=0.729366
epoch=20 loss=0.644168
epoch=30 loss=0.792817
epoch=40 loss=0.568522
epoch=50 loss=0.620974
epoch=60 loss=0.183326
epoch=70 loss=0.478958
epoch=80 loss=0.174433
epoch=90 loss=0.429307
epoch=100 loss=0.208847
training accuracy =0.890000
accuracy 0.802667


In [57]:
#Call the training function on the created training data loader, the created CNN  with n = 32, 
# negative log likelihood loss function, stochastic gradient descent optimizer,
# the device you defined, and 100 epochs. Next, call validation accuracy function.
#Is the model overfit? (Yes/No) Why? 
# (This can be compared to the fully connected network we created in the last set of exercises.)

model_32 = Net(32).to(device=device)

training_loop(
    n_epochs=100,
    model=model_32,
    train_loader=train_loader,
    loss_fn=nn.NLLLoss(),
    optimizer=optim.SGD(model_32.parameters(), lr=0.01),
    device=device
)

validate(model_32, val_loader, device=device)

# Answer:
#
# Yes. Considering the obtained model accuracy data, we can assume that the model might be overfitted.
# The big gap between accuracy of 98% for training data and 80% for validation data indicates 
# that the model performs worse on unknown data. 

epoch=10 loss=0.706127
epoch=20 loss=0.266369
epoch=30 loss=0.505593
epoch=40 loss=0.148179
epoch=50 loss=0.235886
epoch=60 loss=0.102419
epoch=70 loss=0.057331
epoch=80 loss=0.202786
epoch=90 loss=0.146545
epoch=100 loss=0.031223
training accuracy =0.981067
accuracy 0.808000


In [58]:
#Next, let's consider L2 regularization with weight decay 0.002 for CNN with n = 32. 
# Is the model overfit? (Yes/No) Why?

model_32 = Net(32).to(device=device)

training_loop(
    n_epochs=100,
    model=model_32,
    train_loader=train_loader,
    loss_fn=nn.NLLLoss(),
    optimizer=optim.SGD(model_32.parameters(), lr=0.01, weight_decay=0.004),
    device=device
)

validate(model_32, val_loader, device=device)

# Answer:
#
# No. Considering the obtained model accuracy data, we can conclude that the model is not overfitted.
# The relatively small gap between accuracy of 93% for training data and 82% for validation data indicates
# that the model does more than just memorize data and can handle unknown data.

epoch=10 loss=0.536367
epoch=20 loss=0.753943
epoch=30 loss=0.210761
epoch=40 loss=0.345711
epoch=50 loss=0.427637
epoch=60 loss=0.227456
epoch=70 loss=0.208162
epoch=80 loss=0.400528
epoch=90 loss=0.345810
epoch=100 loss=0.071054
training accuracy =0.932400
accuracy 0.818333


In [59]:
#Add a skip connection in your CNN from the output of second max pooling to the input of 3rd max pooling.
#Train the updated CNN with the same parameters including (n = 32).
#Is the model overfit? (Yes/No) Why?

class NetSkip(nn.Module):
    def __init__(self, n = 32):
        super().__init__()
        self.n = n
        
        self.conv1 = nn.Conv2d(3, self.n, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(self.n, (self.n//2), kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d((self.n//2), (self.n//2), kernel_size=3, padding=1)

        self.fc1 = nn.Linear((self.n//2)*4*4, 32)
        self.fc2 = nn.Linear(32, 3)
        
        self.lsftmx = nn.LogSoftmax(dim=1)

    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)

        skip = out
        out = F.max_pool2d(torch.tanh(self.conv3(out)) + skip, 2) # skip node

        out = out.view(-1, (self.n//2)*4*4)
        out = torch.tanh(self.fc1(out))
        out = self.lsftmx(self.fc2(out))
        return out

In [60]:
model_32 = NetSkip(32).to(device=device)

training_loop(
    n_epochs=100,
    model=model_32,
    train_loader=train_loader,
    loss_fn=nn.NLLLoss(),
    optimizer=optim.SGD(model_32.parameters(), lr=0.01),
    device=device
)

validate(model_32, val_loader, device=device)

# Answer:
#
# Yes. Considering the obtained model accuracy data, we can assume that the model might be overfitted.
# The big gap between accuracy of 100% for training data and 81% for validation data indicates 
# that the model performs worse on unknown data. 

epoch=10 loss=0.341694
epoch=20 loss=0.285952
epoch=30 loss=0.430676
epoch=40 loss=0.239788
epoch=50 loss=0.262548
epoch=60 loss=0.091029
epoch=70 loss=0.049518
epoch=80 loss=0.063178
epoch=90 loss=0.035940
epoch=100 loss=0.149305
training accuracy =0.996467
accuracy 0.812000


In [61]:
#Consider dropout layers after each max pooling in the original CNN, where the probability of zeroing output features is 30%.
#Train the updated CNN with the same parameters including (n = 32).
#Is the model overfit? (Yes/No) Why?

class NetDropout(nn.Module):
    def __init__(self, n = 32):
        super().__init__()
        self.n = n
        
        self.conv1 = nn.Conv2d(3, self.n, kernel_size=3, padding=1)
        self.conv1_dropout = nn.Dropout2d(p=0.3)
        self.conv2 = nn.Conv2d(self.n, (self.n//2), kernel_size=3, padding=1)
        self.conv2_dropout = nn.Dropout2d(p=0.3)
        self.conv3 = nn.Conv2d((self.n//2), (self.n//2), kernel_size=3, padding=1)
        self.conv3_dropout = nn.Dropout2d(p=0.3)

        self.fc1 = nn.Linear((self.n//2)*4*4, 32)
        self.fc2 = nn.Linear(32, 3)
        
        self.lsftmx = nn.LogSoftmax(dim=1)

    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = self.conv1_dropout(out)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        out = self.conv2_dropout(out)
        out = F.max_pool2d(torch.tanh(self.conv3(out)), 2)
        out = self.conv3_dropout(out)

        out = out.view(-1, (self.n//2)*4*4)
        out = torch.tanh(self.fc1(out))
        out = self.lsftmx(self.fc2(out))
        return out


In [62]:
model_32 = NetDropout(32).to(device=device)

training_loop(
    n_epochs=100,
    model=model_32,
    train_loader=train_loader,
    loss_fn=nn.NLLLoss(),
    optimizer=optim.SGD(model_32.parameters(), lr=0.01),
    device=device
)

validate(model_32, val_loader, device=device)

# Answer:
#
# No. Considering the obtained model accuracy data, we can conclude that the model is not overfitted.
# The small gap between accuracy of 80% for training data and 78% for validation data indicates
# that the model does more than just memorize data and can handle unknown data. The model works the same with new data as with training data.

epoch=10 loss=0.883767
epoch=20 loss=0.600886
epoch=30 loss=0.845863
epoch=40 loss=0.680702
epoch=50 loss=0.601182
epoch=60 loss=0.502269
epoch=70 loss=0.528790
epoch=80 loss=0.604622
epoch=90 loss=1.034282
epoch=100 loss=0.576950
training accuracy =0.803600
accuracy 0.781667


In [53]:
#Considering all the modifications which one works better? Plain CNN, CNN+L2, CNN+Skip, CNN+Dropout?

# Answer:
#
# Model:        train_acc   val_acc
# CNN:          98%         80%
# CNN+L2:       93%         81%
# CNN+Skip:     100%        81%
# CNN+Dropout:  80%         78%
#
# All models show good accuracy results. An indicator of the effectiveness of working with unknown data is validation accuracy.
# CNN+Dropout shows the lowest validation accuracy. However, it also has the smallest gap between training and validation accuracies.
# This means that the model works the same with new data as with training data.
# In my opinion, CNN+Dropout method is most suitable for preventing overfitting.