In [None]:
!pip install torch torchvision

import torch
import torch.nn as nn
import torch.nn.functional as F

## CNN Structure
**How each layer works in a vanilla CNN**

<br>
The following cells contain code that implements Conv2d, MaxPool2d, Linear and BatchNorm2d from scratch.<br>
They show how each type of layer in a CNN works under the hood.

In [None]:
# Conv2d from scratch
class Conv2d(nn.Module):
    """2-D convolution written as "unfold the input, then matrix-multiply".

    The input holds N samples, each with C channels, height H and width W.
    Every one of the F filters spans all C channels and has spatial size
    (HH, WW).

    Parameters:
    - in_channels: Number of channels C in the input.
    - out_channels: Number of filters F, i.e. channels in the output.
    - kernel_size: Filter size; an int is expanded to (int, int).
    - stride: Step between adjacent receptive fields; int or tuple.
    - padding: Amount of zero-padding added to the input; int or tuple.
    - bias: When truthy, a learnable bias of shape (F,) is added.

    Attributes:
    - weight: Filter weights of shape (F, C, HH, WW).
    - bias: Bias parameter of shape (F,), or the falsy value passed in
        when no bias is used.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 bias=True):
        super(Conv2d, self).__init__()

        def as_pair(value):
            # the size arguments may arrive as an int or a tuple;
            # store them uniformly as tuples
            if isinstance(value, tuple):
                return value
            return (value, value)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = as_pair(kernel_size)
        self.stride = as_pair(stride)
        self.padding = as_pair(padding)

        # Kaiming-style normal initialisation of the filters; the fan used
        # here is out_channels * kernel height * kernel width
        kernel_h = self.kernel_size[0]
        kernel_w = self.kernel_size[1]
        std = (2/(out_channels*kernel_h*kernel_w))**0.5
        initial_weight = torch.Tensor(out_channels, in_channels, kernel_h, kernel_w)
        initial_weight = initial_weight.normal_(mean=0, std=std)
        # wrap in nn.Parameter so the filters are learnable; assign the raw
        # tensor instead if a parameter is not needed
        self.weight = nn.Parameter(initial_weight)

        # keep the caller's flag when no bias is requested, otherwise start
        # from a zero bias so the layer is initially unbiased
        self.bias = bias
        if bias:
            # wrap in nn.Parameter so the bias is learnable; assign the raw
            # tensor instead if a parameter is not needed
            self.bias = nn.Parameter(torch.zeros((out_channels,)))

    def forward(self, x):
        """
        Input:
        - x: Input data of shape (N, C, H, W)
        Output:
        - out: Output data, of shape (N, F, H', W').
        """
        # output height H' = (H + 2*pad - kernel) / stride + 1, truncated
        pad_h = self.padding[0]
        kernel_h = self.kernel_size[0]
        stride_h = self.stride[0]
        out_height = int((x.shape[2] + 2*pad_h - kernel_h)/stride_h + 1)

        # every column of `patches` is one flattened receptive field:
        # shape (N, C*HH*WW, L) with L = H' * W'
        patches = F.unfold(x,
                           kernel_size=self.kernel_size,
                           padding=self.padding,
                           stride=self.stride)

        # every row of `flat_filters` is one flattened filter: (F, C*HH*WW)
        flat_filters = self.weight.view(self.out_channels, -1)

        # the convolution is now a single (broadcast) matrix product
        response = flat_filters @ patches

        # fold the L positions back into the (H', W') grid
        out = response.view(response.shape[0], self.out_channels, out_height, -1)

        # the bias is a tensor only when the layer was built with a bias
        if torch.is_tensor(self.bias):
            out += self.bias.view(1, self.out_channels, 1, 1)

        return out

In [None]:
# Maxpool2d from scratch
class MaxPool2d(nn.Module):
    """Max-pooling over non-overlapping windows, built on F.unfold.

    The window is also used as the stride, so windows never overlap.

    Parameters:
    - kernel_size: Size of the window to take a max over; an int is
        expanded to (int, int).
    """

    def __init__(self, kernel_size):
        super(MaxPool2d, self).__init__()
        # kernel_size may be an int or a tuple; always store a tuple
        if isinstance(kernel_size, tuple):
            self.kernel_size = kernel_size
        else:
            self.kernel_size = (kernel_size, kernel_size)

    def forward(self, x):
        """
        Input:
        - x: Input data of shape (N, C, H, W)
        Output:
        - out: Output data, of shape (N, C, H', W').
        """
        batch_size = x.shape[0]
        channels = x.shape[1]
        window_h = self.kernel_size[0]

        # output height H' = (H - window) / window + 1, truncated
        out_height = int((x.shape[2] - window_h)/window_h + 1)

        # one column per window: (N, C*window_h*window_w, L)
        windows = F.unfold(x, kernel_size=self.kernel_size, stride=self.kernel_size)
        num_windows = windows.shape[2]

        # separate the channel axis from the within-window axis:
        # (N, C, window_h*window_w, L)
        windows = windows.view(batch_size, channels, -1, num_windows)

        # maximum over the elements of each window
        pooled, _ = torch.max(windows, dim=2, keepdim=True)

        # fold the L windows back into the (H', W') grid
        return pooled.view(batch_size, channels, out_height, -1)

In [None]:
# Linear layer from scratch
class Linear(nn.Module):
    """Fully connected layer computing ``x @ weight (+ bias)``.

    Parameters:
    - in_channels: Size of the last dimension of the input.
    - out_channels: Size of the last dimension of the output.
    - bias: When truthy, a learnable bias of shape (out_channels,) is added.

    Attributes:
    - weight: Learnable weights of shape (in_channels, out_channels).
    - bias: Learnable bias of shape (out_channels,), or the falsy value
        passed in when no bias is used.
    """

    def __init__(self, in_channels, out_channels, bias=True):
        super(Linear, self).__init__()
        self.input = in_channels
        self.output = out_channels

        # Xavier/Glorot uniform initialisation: draw from U(-limit, limit)
        # by stretching U(0, 1) to width 2*limit and shifting it down
        limit = (6/(in_channels + out_channels))**(0.5)
        initial_weight = limit*2*torch.rand(in_channels, out_channels) - limit
        # wrap in nn.Parameter so the weights are learnable; assign the raw
        # tensor instead if a parameter is not needed
        self.weight = nn.Parameter(initial_weight)

        # keep the caller's flag when no bias is requested, otherwise start
        # from a zero bias
        self.bias = bias
        if bias:
            # wrap in nn.Parameter so the bias is learnable; assign the raw
            # tensor instead if a parameter is not needed
            self.bias = nn.Parameter(torch.zeros(out_channels))

    def forward(self, x):
        """
        Input:
        - x: Input data of shape (N, *, H) where * means any number of additional
        dimensions and H = in_channels
        Output:
        - out: Output data of shape (N, *, H') where * means any number of additional
        dimensions and H' = out_channels
        """
        # project the last dimension from in_channels to out_channels
        projected = x @ self.weight

        # the bias is a tensor only when the layer was built with a bias
        if torch.is_tensor(self.bias):
            projected += self.bias

        return projected

In [None]:
# Batchnorm2d from scratch
class BatchNorm2d(nn.Module):
    """Batch Normalization over a mini-batch of 2D inputs.

    The mean and variance are computed per channel over the (N, H, W)
    dimensions of the mini-batch. gamma and beta are learnable vectors of
    size num_features that scale and shift the normalised result.

    During training the layer normalises with the batch statistics and keeps
    running estimates of them; during evaluation the running estimates are
    used instead. The mode follows the standard ``nn.Module`` switch
    (``module.train()`` / ``module.eval()``, i.e. ``self.training``).

    Parameters:
    - num_features: C from an expected input of size (N, C, H, W).
    - eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    - momentum: weight given to the *new* batch statistic when updating the
        running estimates:
        ``running = (1 - momentum) * running + momentum * batch``.
        Default: 0.1

    Attributes:
    - gamma: the learnable scale of shape (num_features).
    - beta: the learnable shift of shape (num_features).
    - running_mean: buffer of shape (num_features), initialised to zeros.
    - running_var: buffer of shape (num_features), initialised to ones.
    """

    def __init__(self, num_features, eps=1e-05, momentum=0.1):
        super(BatchNorm2d, self).__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # initialise gamma and beta with a standard normal distribution
        self.gamma = nn.Parameter(torch.randn(num_features))
        self.beta = nn.Parameter(torch.randn(num_features))

        # Running statistics are registered as buffers so that they follow
        # the module across devices/dtypes and are saved in state_dict().
        # The variance starts at one so that evaluating before any training
        # step does not divide by (almost) zero.
        self.register_buffer("running_mean", torch.zeros(num_features))
        self.register_buffer("running_var", torch.ones(num_features))

        # NOTE: no `self.train = True` flag here. Assigning to `train` would
        # shadow nn.Module.train() and make model.train()/model.eval() fail;
        # nn.Module already tracks the mode in `self.training`.

    def _is_training(self):
        """Return True when batch statistics should be used."""
        # Legacy support: callers may still switch mode by assigning a plain
        # bool to `.train`. Honour that; otherwise `train` is the usual bound
        # method and the standard nn.Module flag decides.
        legacy_flag = self.train
        if isinstance(legacy_flag, bool):
            return legacy_flag
        return self.training

    def forward(self, x):
        """
        During training this layer keeps running estimates of its computed mean and
        variance, which are then used for normalization during evaluation.
        Input:
        - x: Input data of shape (N, C, H, W)
        Output:
        - out: Output data of shape (N, C, H, W) (same shape as input)
        """
        per_channel = (1, self.num_features, 1, 1)

        if self._is_training():
            # statistics of the current mini-batch, one value per channel
            mean = torch.mean(x, dim=(0, 2, 3), keepdim=True)
            var = torch.var(x, dim=(0, 2, 3), keepdim=True, unbiased=False)

            # Update the running estimates outside of autograd: they are
            # bookkeeping, not part of the loss, and must not keep the
            # graph of every past batch alive.
            with torch.no_grad():
                batch_mean = mean.view(self.num_features)
                batch_var = var.view(self.num_features)
                self.running_mean.mul_(1 - self.momentum).add_(self.momentum*batch_mean)
                self.running_var.mul_(1 - self.momentum).add_(self.momentum*batch_var)
        else:
            # evaluation: normalise with the accumulated running estimates
            mean = self.running_mean.view(per_channel)
            var = self.running_var.view(per_channel)

        gamma = self.gamma.view(per_channel)
        beta = self.beta.view(per_channel)
        out = ((x - mean)/((var + self.eps)**0.5))*gamma + beta

        return out