In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# when printing large tensors, show only 2 items at the start and end of each dimension
torch.set_printoptions(edgeitems=2)
# seed PyTorch's global random number generator so random operations are repeatable
torch.manual_seed(42)

<torch._C.Generator at 0x7f607f61f310>

# Classifying objects

In the following, we will use the old-school CIFAR-10 dataset that contains low-resolution pictures of objects of 10 categories. This dataset - and others - is available as part of the `torchvision` package, which you should install.

In [2]:
# human-readable names of the ten CIFAR-10 categories, in label order
class_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]

In [3]:
from torchvision import datasets, transforms, utils
# directory in which the CIFAR-10 archive is stored and extracted
data_path = './'

# Training split of CIFAR-10, downloaded if it is not on disk yet.
# Each image is first converted to a pytorch tensor and then normalized
# per RGB channel with the given channel-wise means and standard
# deviations (the images stay in colour - nothing is turned to greyscale).
cifar10 = datasets.CIFAR10(
    root=data_path,
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.4915, 0.4823, 0.4468),
                             std=(0.2470, 0.2435, 0.2616)),
    ]),
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./cifar-10-python.tar.gz to ./


This has now downloaded the images if not already done so - you will notice that this is quite a "hefty" dataset already at 170MB. Now let's download our validation set or test set.

In [4]:
# Held-out split of CIFAR-10 (train=False), used here for validation.
# It goes through exactly the same tensor conversion and per-channel
# normalization as the training split.
cifar10_val = datasets.CIFAR10(
    root=data_path,
    train=False,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.4915, 0.4823, 0.4468),
                             std=(0.2470, 0.2435, 0.2616)),
    ]),
)

Files already downloaded and verified


## Two-class problem on the GPU

One of the reasons for the success of CNNs has been the realization that computation can be done efficiently on GPUs - originally designed to help with calculation of 3D graphics. 

So, let's push everything onto the GPU now. Let's redefine the same two-class problem as before:

In [5]:
# original CIFAR-10 label -> label of the two-class problem
# (0 'airplane' stays 0, 2 'bird' becomes 1)
label_map = {0: 0, 2: 1}
class_names = ['airplane', 'bird']

# keep only the samples whose label is one of the two mapped classes,
# and store them with their remapped label
cifar2 = [
    (img, label_map[label])
    for img, label in cifar10
    if label in label_map
]
cifar2_val = [
    (img, label_map[label])
    for img, label in cifar10_val
    if label in label_map
]

In [6]:
# serve cifar2 in shuffled mini-batches of 64 samples
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64, shuffle=True)

In [10]:
import torch.nn.functional as F

class Net(nn.Module):
    """Small CNN: two conv/tanh/max-pool stages followed by two linear layers.

    The flattening step assumes 8 x 8 feature maps with 8 channels after the
    second pooling, i.e. the network is sized for 3 x 32 x 32 input images.
    The output has two values per sample (one raw score per class).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(in_features=8 * 8 * 8, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        # each stage: convolution -> tanh -> 2x2 max pooling (halves width and height)
        stage1 = F.max_pool2d(self.conv1(x).tanh(), kernel_size=2)
        stage2 = F.max_pool2d(self.conv2(stage1).tanh(), kernel_size=2)
        # one row of 8 * 8 * 8 features per sample for the linear layers
        flat = stage2.view(-1, 8 * 8 * 8)
        hidden = self.fc1(flat).tanh()
        return self.fc2(hidden)

Now, let's define a device that consists of CUDA (Compute Unified Device Architecture - a standard for general-purpose computing on GPU devices first introduced by NVIDIA in 2007) if supported, or the CPU otherwise.

It is considered good practice to put code similar to this at the beginning of any script so that your code will run regardless of whether a GPU is present or not.

In [7]:
# use the GPU whenever CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on device {device}.")

Training on device cuda.


We now have to modify our training loop slightly - the only difference is that we will need to tell Pytorch explicitly where the data will need to be processed.

**Note that if data and model reside on different devices, Pytorch will produce a run-time error!!**

In [8]:
import datetime 

def training_loop(n_epochs, optimizer, model, loss_fn, train_loader, device=None):
    """Train ``model`` for ``n_epochs`` passes over ``train_loader``.

    Args:
        n_epochs: number of passes over the training data.
        optimizer: optimizer holding the parameters of ``model``.
        model: the network to train; it is switched to training mode.
        loss_fn: callable mapping ``(outputs, labels)`` to a scalar loss.
        train_loader: iterable yielding ``(imgs, labels)`` mini-batches.
        device: device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a model without
            parameters), so data and model always end up in the same place.

    The average loss per batch is printed for the first epoch and for every
    tenth epoch.
    """
    if device is None:
        # follow the model instead of relying on a notebook-level global
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # layers such as dropout and batch norm behave differently during
    # training, so make sure the model is in training mode
    model.train()

    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:
            # put the data onto the correct pytorch calculation device
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
            # put a batch through the model
            outputs = model(imgs)
            # determine the loss
            loss = loss_fn(outputs, labels)
            # zero the gradients and determine backpropagation
            optimizer.zero_grad()
            loss.backward()
            # do one step of optimization
            optimizer.step()
            # keep track of the loss (as a python float)
            loss_train += loss.item()

        if epoch == 1 or epoch % 10 == 0:
            print('{} Epoch {}, Training loss {}'.format(
                datetime.datetime.now(), epoch,
                loss_train / len(train_loader)))

Now, let's train this (small) convolutional neural network with this `training_loop` function for 100 epochs:

In [11]:
from torch.cuda import device_of
# shuffled mini-batches of 64 samples for training
train_loader = torch.utils.data.DataLoader(dataset=cifar2, batch_size=64, shuffle=True)

# create the network and move its parameters to the chosen device
model = Net().to(device)
# plain stochastic gradient descent on all model parameters
optimizer = optim.SGD(params=model.parameters(), lr=1e-2)
# cross-entropy loss for classification
loss_fn = nn.CrossEntropyLoss()

# train for 100 epochs
training_loop(n_epochs=100, optimizer=optimizer, model=model,
              loss_fn=loss_fn, train_loader=train_loader)

2022-11-14 06:36:46.548054 Epoch 1, Training loss 0.5481963028573686
2022-11-14 06:36:49.812214 Epoch 10, Training loss 0.3275259368738551
2022-11-14 06:36:53.340026 Epoch 20, Training loss 0.28943249118176234
2022-11-14 06:36:56.858684 Epoch 30, Training loss 0.2651279278716464
2022-11-14 06:37:00.347545 Epoch 40, Training loss 0.2503316754558284
2022-11-14 06:37:03.840861 Epoch 50, Training loss 0.23127564004841883
2022-11-14 06:37:07.359849 Epoch 60, Training loss 0.21492449327069482
2022-11-14 06:37:10.918820 Epoch 70, Training loss 0.20169869905254642
2022-11-14 06:37:14.432324 Epoch 80, Training loss 0.18960698309597696
2022-11-14 06:37:17.909665 Epoch 90, Training loss 0.17358152748672825
2022-11-14 06:37:21.409495 Epoch 100, Training loss 0.15804541708937117


Compared to our previous version this takes less time - the exact speed-up depends of course on your GPU/CPU combination. 

The validation also needs to change accordingly - here, we will also need to put the data onto the device. Note that the `predicted` tensor computed below also lives on the same device as the model outputs!

In [12]:
# loaders for evaluation: no shuffling for either split
train_loader = torch.utils.data.DataLoader(dataset=cifar2, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(dataset=cifar2_val, batch_size=64, shuffle=False)

def validate(model, train_loader, val_loader, device=None):
    """Print and return the accuracy of ``model`` on both data splits.

    Args:
        model: the network to evaluate.
        train_loader: iterable of ``(imgs, labels)`` batches of the training split.
        val_loader: iterable of ``(imgs, labels)`` batches of the validation split.
        device: device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a model without
            parameters).

    Returns:
        Tuple ``(train_accuracy, val_accuracy)`` with values in ``[0, 1]``.

    The model is evaluated in ``eval`` mode so that dropout is disabled and
    batch normalization uses its running statistics; the mode the model was
    in before the call is restored afterwards.
    """
    if device is None:
        # follow the model instead of relying on a notebook-level global
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    results = []
    try:
        # no gradients are needed for measuring accuracy
        with torch.no_grad():
            for name, loader in [("train", train_loader), ("val", val_loader)]:
                correct = 0
                total = 0
                for imgs, labels in loader:
                    imgs = imgs.to(device=device)
                    labels = labels.to(device=device)
                    outputs = model(imgs)
                    # index of the largest score = predicted class
                    _, predicted = torch.max(outputs, dim=1)
                    total += labels.shape[0]
                    correct += int((predicted == labels).sum())

                print("Accuracy {}: {:.2f}".format(name, correct / total))
                results.append(correct / total)
    finally:
        # hand the model back in the mode the caller left it in
        model.train(was_training)

    return (*results,)

# print the accuracy on both splits; the returned tuple of accuracies is the cell's value
validate(model, train_loader, val_loader)

Accuracy train: 0.94
Accuracy val: 0.88


## Optimizing CNNs

In the following, we will discuss a few basic ways how to advance the architecture of CNNs.

### Width

One of the easiest ways to enhance the capacity of a CNN is of course to change the "width" of the network. This means that you will add more filters to a layer. Changing the width can be done, for example, like so:

In [13]:
class NetWidth(nn.Module):
    """Two-stage CNN whose width is set by the number of first-layer filters.

    Args:
        n_ch1: number of filters of the first convolution; the second
            convolution uses ``n_ch1 // 2`` filters.

    The first linear layer is sized for 8 x 8 feature maps, i.e. for
    3 x 32 x 32 input images. The output has two scores per sample.
    """

    def __init__(self, n_ch1=32):
        super().__init__()
        self.n_ch1 = n_ch1
        n_ch2 = n_ch1 // 2
        self.conv1 = nn.Conv2d(3, n_ch1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_ch1, n_ch2, kernel_size=3, padding=1)
        # size the layer from the actual channel count of conv2:
        # 8 * 8 * n_ch1 // 2 is evaluated as (64 * n_ch1) // 2, which does not
        # match 64 * (n_ch1 // 2) when n_ch1 is odd
        self.fc1 = nn.Linear(8 * 8 * n_ch2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        # one row per sample; a wrongly sized input now fails at fc1 instead
        # of silently mixing samples
        out = out.flatten(start_dim=1)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

### Dropout

As we increase the number of parameters, however, we will quickly see that it becomes easy to overfit the NN on pretty much any data. The increased capacity of a wider network is something that we would like to keep nonetheless, so how can we do this?

One solution for this is the so-called **dropout** - a technique proposed by Srivastava and Hinton in 2014. 

The idea for this is actually very simple and implemented like a layer in Pytorch: in every iteration of the training, you zero out a random fraction of outputs of the preceding layer to reduce their influence. 

What this does is to train slightly different "models" in each iteration that try to solve your task, preventing the individual filters (neurons) from talking to each other too much and forming overfitting connections. 

This technique is not dependent on CNNs actually and can also be used with fully-connected neural networks, btw. 

In the case of Pytorch and CNNs we can specify 2D dropouts or 3D dropouts that zero entire channel outputs.

In order to make the network function properly, we have to be aware, however, whether we are training (dropout should be active) or evaluating (dropout should not be active or have probability zero). In Pytorch, you can control this via the methods `model.train()` and `model.eval()` for a `nn.Module` subclass like ours.

In [None]:
class NetDropout(nn.Module):
    """Two-stage CNN with channel-wise dropout after each pooling step.

    Args:
        n_ch1: number of filters of the first convolution; the second
            convolution uses ``n_ch1 // 2`` filters.

    Dropout is only active in training mode (``model.train()``); call
    ``model.eval()`` before evaluating. The first linear layer is sized for
    8 x 8 feature maps, i.e. for 3 x 32 x 32 input images.
    """

    def __init__(self, n_ch1=32):
        super().__init__()
        self.n_ch1 = n_ch1
        n_ch2 = n_ch1 // 2
        self.conv1 = nn.Conv2d(3, n_ch1, kernel_size=3, padding=1)
        # dropout layer that zeroes whole channels with probability 0.4
        self.conv1_dropout = nn.Dropout2d(p=0.4)
        self.conv2 = nn.Conv2d(n_ch1, n_ch2, kernel_size=3, padding=1)
        # dropout layer that zeroes whole channels with probability 0.4
        self.conv2_dropout = nn.Dropout2d(p=0.4)
        # size the layer from the actual channel count of conv2:
        # 8 * 8 * n_ch1 // 2 is evaluated as (64 * n_ch1) // 2, which does not
        # match 64 * (n_ch1 // 2) when n_ch1 is odd
        self.fc1 = nn.Linear(8 * 8 * n_ch2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = self.conv1_dropout(out)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        out = self.conv2_dropout(out)
        # one row per sample; a wrongly sized input now fails at fc1 instead
        # of silently mixing samples
        out = out.flatten(start_dim=1)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

### Batch normalization

Another idea, published in 2015 by Ioffe and Szegedy, is Batch Normalization - a technique that rescales the inputs to the activation functions of the networks so that each (mini)batch has a desired, nicely-behaving distribution. 

Why would we want to do this? Remember that activation functions have saturation points that may prevent efficient learning - we therefore would like to present the activation function with a range of values that make full use of the optimization gradients.

In practice, this normalization is done by shifting and scaling an intermediate input using mean and standard deviation across the samples of a given (mini)batch.

This results in a regularization of sorts as any individual sample and its following activations are viewed by the full model as shifted and scaled.

As such, the normalization was proposed by the authors to obviate the need for dropout as the regularization through shifting and scaling was supposed to help also with overfitting.

The place for the batch norm layer in Pytorch is directly before the activation function. Again, as the running statistics of these layers are automatically updated with each call, we have to be aware, however, whether we are training (batch norm should normalize with the statistics of the current mini-batch and update its running estimates) or evaluating (batch norm should not update anything and use the stored running estimates instead). In Pytorch, you can control this via the methods `model.train()` and `model.eval()` for a `nn.Module` subclass like ours.

In [None]:
class NetBatchNormalization(nn.Module):
    """Two-stage CNN with batch normalization before each activation.

    Args:
        n_ch1: number of filters of the first convolution; the second
            convolution uses ``n_ch1 // 2`` filters.

    Batch normalization behaves differently in training and evaluation
    mode; call ``model.train()`` / ``model.eval()`` accordingly. The first
    linear layer is sized for 8 x 8 feature maps, i.e. for 3 x 32 x 32
    input images.
    """

    def __init__(self, n_ch1=32):
        super().__init__()
        self.n_ch1 = n_ch1
        n_ch2 = n_ch1 // 2
        # the convolutions carry no bias: the batch normalization that
        # follows shifts its input anyway, which cancels the effect of a bias.
        # NOTE: this must be the boolean False - the string "False" is truthy
        # and would silently keep the bias switched on
        self.conv1 = nn.Conv2d(3, n_ch1, kernel_size=3, padding=1, bias=False)
        self.conv1_batchnorm = nn.BatchNorm2d(num_features=n_ch1)
        self.conv2 = nn.Conv2d(n_ch1, n_ch2, kernel_size=3, padding=1, bias=False)
        self.conv2_batchnorm = nn.BatchNorm2d(num_features=n_ch2)
        # size the layer from the actual channel count of conv2:
        # 8 * 8 * n_ch1 // 2 is evaluated as (64 * n_ch1) // 2, which does not
        # match 64 * (n_ch1 // 2) when n_ch1 is odd
        self.fc1 = nn.Linear(8 * 8 * n_ch2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        # batch normalization is applied **before** the activation function;
        # the first stage starts from the input x
        out = self.conv1_batchnorm(self.conv1(x))
        out = F.max_pool2d(torch.tanh(out), 2)
        out = self.conv2_batchnorm(self.conv2(out))
        out = F.max_pool2d(torch.tanh(out), 2)
        # one row per sample; a wrongly sized input now fails at fc1 instead
        # of silently mixing samples
        out = out.flatten(start_dim=1)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

### Depth

Maybe you have wondered why we did not talk about going deeper until now - after all, it's called "deep learning"... 

What are the advantages of going deep again? Depth allows you to discover hierarchies of features - although a shallow network is in principle - via the Universal Approximation Theorem - capable of solving any task for you, splitting the network into layers enables it to discover hierarchical structures in your data. 

Example: a shallow network will be able to tell apart birds from airplanes, but a "deep" network may be able to tell you that birds and airplanes consist of different parts (wings and bodies in both cases, eyes and legs for birds, windows and perhaps landing gears for airplanes, etc.), hence allowing you to describe the data with more structure. 

So why don't we simply go deep? The reason for that was already hinted at several times previously: each additional layer means that gradients in backpropagation will need to be multiplied, and if you are in the tails of the activation functions that will lead to saturation and to either exploding or vanishing gradients, which will make learning unstable or not moving forward.

We have already mentioned batch normalization as one possible way to overcome this problem, and, indeed, this will help us to construct deeper networks.

Another approach was presented in 2015 by He et al., with their publication of residual networks (ResNets), which uses the trick of adding a "skip connection" to a layer.

#### Skip connections

A skip connection is simply the addition of the input to the output of a layer. Like so:

In [None]:
class NetSkip(nn.Module):
    """Three-stage CNN whose third convolution is wrapped in a skip connection.

    Args:
        n_ch1: number of filters of the first convolution; the second and
            third convolutions use ``n_ch1 // 2`` filters.

    The first linear layer is sized for 4 x 4 feature maps (three 2x2
    poolings), i.e. for 3 x 32 x 32 input images. The output has two scores
    per sample.
    """

    def __init__(self, n_ch1=32):
        super().__init__()
        self.n_ch1 = n_ch1
        n_ch2 = n_ch1 // 2
        self.conv1 = nn.Conv2d(3, n_ch1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_ch1, n_ch2, kernel_size=3, padding=1)
        # go deeper! same channel count in and out, so that the input of
        # this layer can be added to its output
        self.conv3 = nn.Conv2d(n_ch2, n_ch2, kernel_size=3, padding=1)
        # a third pooling halves the resolution once more: 4 x 4 instead of
        # 8 x 8. The size uses the actual channel count of conv3, because
        # 4 * 4 * n_ch1 // 2 is evaluated as (16 * n_ch1) // 2 and is wrong
        # for odd n_ch1
        self.fc1 = nn.Linear(4 * 4 * n_ch2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = F.max_pool2d(torch.relu(self.conv2(out)), 2)
        # save the input to the third layer ...
        in_layer3 = out
        # ... and add it to that layer's activated output (the skip connection)
        out = F.max_pool2d(torch.relu(self.conv3(out)) + in_layer3, 2)
        # one row per sample; a wrongly sized input now fails at fc1 instead
        # of silently mixing samples
        out = out.flatten(start_dim=1)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out

Good, so now we have added another layer, and we have also added its input to its output. Because we have done this, the skip connection is now part of the computational graph and hence part of the loss gradient path.

Since they are more directly connected to the loss (note, they are outside the activation functions!), their addition to the network means that the optimizer can decide to bypass a layer. The effect of this is that gradients across multiple layers are less likely to vanish.

#### Blocks of layers

If we really want to go deep, we need a better way to initialize our networks, as the "manual" way of adding layers that we have done so far is not going to be feasible.

We do this by first defining a `ResBlock` class, which includes one block of convolutions, activations, and skip connections. In addition, we will add back batch normalization and also add a special type of weight initialization that aids optimization.

In [15]:
class ResBlock(nn.Module):
    """Residual block: 3x3 convolution -> batch norm -> ReLU, plus the input.

    Args:
        n_ch: number of channels; the block keeps the channel count (and,
            through ``padding=1``, the width and height) unchanged so that
            the input can be added to the output.
    """

    def __init__(self, n_ch):
        super().__init__()
        # no bias in the convolution, since batch normalization follows
        self.conv = nn.Conv2d(n_ch, n_ch, kernel_size=3, padding=1, bias=False)
        # Kaiming (He) normal initialization of the convolution weights,
        # which was found to help with optimization for ReLU networks
        nn.init.kaiming_normal_(self.conv.weight, nonlinearity='relu')

        self.batch_norm = nn.BatchNorm2d(num_features=n_ch)
        # start batch norm with a scale of 0.5 and a shift of 0
        # (original author's note: without this explicit step the library
        # default initialization is used - TODO confirm what that default is
        # for the installed PyTorch version)
        nn.init.constant_(self.batch_norm.weight, 0.5)
        nn.init.zeros_(self.batch_norm.bias)

    def forward(self, x):
        """Return ``relu(batch_norm(conv(x))) + x``."""
        activated = torch.relu(self.batch_norm(self.conv(x)))
        # skip connection: the block input is added back unchanged
        return activated + x

Now that we have one block, we will use this to create a deep architecture like so:

In [16]:
class NetResDeep(nn.Module):
    """Deep CNN: one embedding convolution followed by a stack of ResBlocks.

    Args:
        n_ch1: number of channels used by the first convolution and by
            every residual block.
        n_blocks: how many ResBlocks are stacked.

    The first linear layer is sized for 8 x 8 feature maps (two 2x2
    poolings), i.e. for 3 x 32 x 32 input images. The output has two scores
    per sample.
    """

    def __init__(self, n_ch1=32, n_blocks=10):
        super().__init__()
        self.n_ch1 = n_ch1
        # initial convolution for embedding
        self.conv1 = nn.Conv2d(3, n_ch1, kernel_size=3, padding=1)
        # Build n_blocks *separate* ResBlocks. Writing
        # n_blocks * [ResBlock(...)] would create a single block and repeat
        # the same object n_blocks times, so that all "layers" share one set
        # of weights and the network is not actually deeper in parameters.
        self.resblocks = nn.Sequential(
            *(ResBlock(n_ch=n_ch1) for _ in range(n_blocks)))
        # the blocks do not change the resolution or channel count of the
        # features - after the second pooling we have 8x8 "pixels" again
        self.fc1 = nn.Linear(8 * 8 * n_ch1, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = self.resblocks(out)
        out = F.max_pool2d(out, 2)
        out = out.view(-1, 8 * 8 * self.n_ch1)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out

In [None]:
# shuffled mini-batches of 64 samples for training
train_loader = torch.utils.data.DataLoader(dataset=cifar2, batch_size=64, shuffle=True)

# create the deep residual network and move its parameters to the chosen device
model = NetResDeep().to(device)
# plain stochastic gradient descent (with a smaller learning rate of 1e-3)
optimizer = optim.SGD(params=model.parameters(), lr=1e-3)
# cross-entropy loss for classification
loss_fn = nn.CrossEntropyLoss()

# train for 100 epochs
training_loop(n_epochs=100, optimizer=optimizer, model=model,
              loss_fn=loss_fn, train_loader=train_loader)

In [None]:
# print the accuracy on both splits; the returned tuple of accuracies is the cell's value
validate(model, train_loader, val_loader)

As we can see, however, although the last - and possibly most advanced design - is easily capable of achieving 100% training accuracy, its validation accuracy is still roughly the same. 

This is in part due to the fact that we've tried to go deep on images that only have 32x32 pixels - discovering hierarchies in such a low-dimensional input space may therefore be limited. 

## 1x1 convolutions

Another architecture element that is often used in CNNs is the 1x1 convolution layer. It is often called a projection or embedding layer as well.

Now, mathematically, a 1x1 convolution basically takes each input and weights it with a single number, producing another number. So, there is no influence of neighboring elements in this operation - it is purely **local**.

Note that if we treat a 1x1 convolution as a layer, however, its output can be fed into an activation function, which in turn means that the full layer can perform complex, non-linear operations on inputs.

In addition, note, that this layer can perform its operations along the channel dimension, generating again a single number as output across all input channels. This means that in a deep architecture, the 1x1 operation will basically "summarize" (other words that are often used are "project" / "embed" / "pool" in this context) the full set of channels into one feature map, preserving its width x height dimensions. 

The following is an example of a projection, in which the dimensionality is preserved:

In [None]:
class NetWidthProject(nn.Module):
    """Two-stage CNN followed by a 1x1 convolution that keeps the channel count.

    Args:
        n_ch1: number of filters of the first convolution; the second and
            the 1x1 convolution use ``n_ch1 // 2`` filters.

    The first linear layer is sized for 8 x 8 feature maps, i.e. for
    3 x 32 x 32 input images. The output has two scores per sample.
    """

    def __init__(self, n_ch1=32):
        super().__init__()
        self.n_ch1 = n_ch1
        n_ch2 = n_ch1 // 2
        self.conv1 = nn.Conv2d(3, n_ch1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_ch1, n_ch2, kernel_size=3, padding=1)
        # 1x1 convolution: mixes the channels at each pixel, same number of
        # filters in and out
        self.conv3 = nn.Conv2d(n_ch2, n_ch2, kernel_size=1)
        # size the layer from the actual channel count of conv3:
        # 8 * 8 * n_ch1 // 2 is evaluated as (64 * n_ch1) // 2, which does not
        # match 64 * (n_ch1 // 2) when n_ch1 is odd
        self.fc1 = nn.Linear(8 * 8 * n_ch2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        # 1x1 (two-dimensional) convolution followed by an activation;
        # the number of feature maps and their resolution stay the same
        out = torch.tanh(self.conv3(out))
        # one row per sample; a wrongly sized input now fails at fc1 instead
        # of silently mixing samples
        out = out.flatten(start_dim=1)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [None]:
# build the projection network on the chosen device and show its layers
model = NetWidthProject().to(device)
print(model)

# number of entries of every parameter tensor, shown together with their total
numel_list = [param.numel() for param in model.parameters()]
sum(numel_list), numel_list

The following is an example of how to use the 1x1 layer for embedding or dimensionality reduction:

In [None]:
class NetWidthEmbed(nn.Module):
    """Two-stage CNN followed by a 1x1 convolution that halves the channel count.

    Args:
        n_ch1: number of filters of the first convolution; the second
            convolution uses ``n_ch1 // 2`` and the 1x1 convolution
            ``n_ch1 // 4`` filters.

    The first linear layer is sized for 8 x 8 feature maps, i.e. for
    3 x 32 x 32 input images. The output has two scores per sample.
    """

    def __init__(self, n_ch1=32):
        super().__init__()
        self.n_ch1 = n_ch1
        n_ch2 = n_ch1 // 2
        n_ch3 = n_ch1 // 4
        self.conv1 = nn.Conv2d(3, n_ch1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_ch1, n_ch2, kernel_size=3, padding=1)
        # 1x1 convolution that reduces the number of filters by a factor of
        # 2: it embeds / summarizes the channels into a lower dimension
        self.conv3 = nn.Conv2d(n_ch2, n_ch3, kernel_size=1)
        # size the layer from the actual channel count of conv3:
        # 8 * 8 * n_ch1 // 4 is evaluated as (64 * n_ch1) // 4, which does not
        # match 64 * (n_ch1 // 4) unless n_ch1 is divisible by 4
        self.fc1 = nn.Linear(8 * 8 * n_ch3, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        # channel reduction at unchanged resolution, followed by an activation
        out = torch.tanh(self.conv3(out))
        # one row per sample; a wrongly sized input now fails at fc1 instead
        # of silently mixing samples
        out = out.flatten(start_dim=1)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [None]:
# build the embedding network on the chosen device and show its layers
model = NetWidthEmbed().to(device)
print(model)

# number of entries of every parameter tensor, shown together with their total
numel_list = [param.numel() for param in model.parameters()]
sum(numel_list), numel_list

In [None]:
# shuffled mini-batches of 64 samples for training
train_loader = torch.utils.data.DataLoader(dataset=cifar2, batch_size=64, shuffle=True)

# create the embedding network and move its parameters to the chosen device
model = NetWidthEmbed().to(device)
# plain stochastic gradient descent on all model parameters
optimizer = optim.SGD(params=model.parameters(), lr=1e-2)
# cross-entropy loss for classification
loss_fn = nn.CrossEntropyLoss()

# train for 1000 epochs (ten times longer than the earlier 100-epoch runs)
training_loop(n_epochs=1000, optimizer=optimizer, model=model,
              loss_fn=loss_fn, train_loader=train_loader)

Just in the same way as it can be used for dimensionality reduction, the 1x1 convolution can also be used for dimensionality increase! This can be useful for upscaling images or feature maps.

The idea of using these filters was first proposed in 2013 and put to full use in the 2014 Inception architecture.

In the original ResNet architecture, for example, these have also been used to introduce 3x3 "bottleneck" layers, in which a 1x1 convolution first decreased the dimensionality and then increased it again after the actual 3x3 convolutions like so:

In [None]:
# Fragment of a module's __init__ (not runnable on its own): a "bottleneck"
# of three convolutions acting on a feature map with n_ch1 channels.
# 1x1 convolution: reduces the number of filters from n_ch1 to n_ch1 // 4
self.convNNm1 = nn.Conv2d(n_ch1, n_ch1 // 4, kernel_size=1)
# 3x3 convolution on the reduced representation; its input is the output of
# the layer above, so it takes n_ch1 // 4 channels (not the 3 RGB channels)
self.convNN = nn.Conv2d(n_ch1 // 4, n_ch1 // 4, kernel_size=3, padding=1)
# 1x1 convolution: expands back to n_ch1 channels; a 1x1 kernel needs no
# padding - padding=1 would enlarge the feature map by 2 pixels per dimension
self.convNNp1 = nn.Conv2d(n_ch1 // 4, n_ch1, kernel_size=1)