In [1]:
import time 

import torch 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim 

* Even more important is the ability to design networks where adding layers makes networks strictly more expressive rather than just different.

* ```Residual_Blocks```
* ResNet follows VGG’s full 3 × 3 convolutional layer design. The residual block has two 3 × 3 convolutional layers with the same number of output channels.
* Each convolutional layer is followed by a batch normal- ization layer and a ReLU activation function. Then, we skip these two convolution operations and add the input directly before the final ReLU activation function.
* This kind of design requires that the output of the two convolutional layers be of the same shape as the input, so that they can be added together.
* If we want to change the number of channels or the the stride, we need to introduce an additional 1 × 1 convolutional layer to transform the input into the desired shape for the addition operation.

* This code generates two types of networks:
    * one where we add the input to the output before applying the ReLU nonlinearity
    * whenever ```use_1x1conv=True```, one where we adjust channels and resolution by means of a 1 × 1 convolution before adding.

In [2]:
class Residual(nn.Module):
  
  def __init__(self,input_channels, num_channels, use_1x1conv=False, strides=1, **kwargs):
    super(Residual, self).__init__(**kwargs)
    self.conv1 = nn.Conv2d(input_channels, num_channels,kernel_size=3, padding=1, stride=strides)
    self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
    if use_1x1conv:
        self.conv3 = nn.Conv2d(input_channels, num_channels, kernel_size=1, stride=strides)
    else:
        self.conv3 = None
    self.bn1 = nn.BatchNorm2d(num_channels)
    self.bn2 = nn.BatchNorm2d(num_channels)
    self.relu = nn.ReLU(inplace=True)
  
  def forward(self, X):
    
    Y = self.relu(self.bn1(self.conv1(X)))
    Y = self.bn2(self.conv2(Y))
    if self.conv3:
        X = self.conv3(X)
    Y += X
    Y =self.relu(Y)
    return Y

* let's look at a situation where the input and output are of the same shape.
    * Input  :(4,3,6,6) shape 
    * Output :(4,3,6,6) shape  

In [3]:
blk = Residual(3,3)
X = torch.rand(4, 3, 6, 6)

Y = blk(X)
Y.shape

torch.Size([4, 3, 6, 6])

* We also have the option to halve the output height and width while increasing the number of output channels.

In [4]:
blk = Residual(3,6, use_1x1conv=True, strides=2)
blk(X).shape

torch.Size([4, 6, 3, 3])

***

* ```ResNet Model``` 
* The first two layers of ResNet are the same as those of the GoogLeNet we described before: 
    * 7x7 Conv,s=2, p=3, #64 
    * BN 
    * ReLU
    * 3x3 MaxPool, s=2, p=1 

In [5]:
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                    nn.BatchNorm2d(64),
                    nn.ReLU(),
                    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

* ResNet uses four modules made up of residual blocks, each of which uses several residual blocks with the same number of output channels.

In [6]:
def resnet_block(input_channels, num_channels, num_residuals, first_block=False):
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(input_channels, num_channels, use_1x1conv=True, strides=2))
        else:
            blk.append(Residual(num_channels, num_channels))
    return blk

* Then, we add all the residual blocks to ResNet. Here, two residual blocks are used for each module.

In [7]:
class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)

* There are 4 convolutional layers in each module (excluding the 1 × 1 convolutional layer).
* Together with the first convolutional layer and the final fully connected layer, there are 18 layers in total.
    * this model is commonly known as ```ResNet-18```.

In [8]:
b2=nn.Sequential(*resnet_block(64,64,2,first_block=True))
b3=nn.Sequential(*resnet_block(64,128,2))
b4=nn.Sequential(*resnet_block(128,256,2))
b5=nn.Sequential(*resnet_block(256,512,2))

net=nn.Sequential(b1,b2,b3,b4,b5,nn.AdaptiveMaxPool2d((1,1)),Flatten(),nn.Linear(512, 10))

* By configuring different numbers of channels and residual blocks in the module, we can create different ResNet models, such as the deeper 152-layer ```ResNet-152```.
* Although the main architecture of ResNet is similar to that of GoogLeNet, ResNet’s structure is simpler and easier to modify.
    * All these factors have resulted in the rapid and widespread use of ResNet.

***
* Check the model design 
* To have a reasonable training time on Fashion-MNIST, we reduce the input height and width from 224 to 96

In [9]:
X = torch.rand(size=(1, 1, 96, 96))   # [B, C, H, W]

for layer in net:
    X = layer(X)
    print(layer.__class__.__name__,'output shape:\t', X.shape)

Sequential output shape:	 torch.Size([1, 64, 24, 24])
Sequential output shape:	 torch.Size([1, 64, 24, 24])
Sequential output shape:	 torch.Size([1, 128, 12, 12])
Sequential output shape:	 torch.Size([1, 256, 6, 6])
Sequential output shape:	 torch.Size([1, 512, 3, 3])
AdaptiveMaxPool2d output shape:	 torch.Size([1, 512, 1, 1])
Flatten output shape:	 torch.Size([1, 512])
Linear output shape:	 torch.Size([1, 10])


*** 
* Data Acquisition 
* Reading Data (Fashion-MNIST)
* Preprocess: Fashion-MNIST has 28x28 pixels -> upsample them to 96x96

In [10]:
import sys 
import os 

import torchvision 
from torchvision import transforms 
from torch.utils.data import DataLoader 

def load_data_fashion_mnist(batch_size, resize=None, root=os.path.join(os.getcwd(), 'datasets', 'fashion-mnist')):
    """Download the Fashion-MNIST dataset and then load into memory."""
    root = os.path.expanduser(root)
    transformer = []
    if resize:
        transformer += [transforms.Resize(resize)]
    transformer += [transforms.ToTensor()]
    transformer = transforms.Compose(transformer)

    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, transform=transformer, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, transform=transformer, download=True)
    num_workers = 0 if sys.platform.startswith('win32') else 4

    train_iter = DataLoader(mnist_train, batch_size, shuffle=True, num_workers=num_workers)
    test_iter = DataLoader(mnist_test, batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter

In [11]:
batch_size = 128

#Loading Fashion-MNIST Dataset
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)

***
* Model Training 

In [12]:
def try_gpu():
    """If GPU is available, return torch.device as cuda:0; else return torch.device as cpu."""
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    return device

In [13]:
def evaluate_accuracy(data_iter, net, device=torch.device('cpu')):
    """Evaluate accuracy of a model on the given data set."""
    net.eval()  # Switch to evaluation mode for Dropout, BatchNorm etc layers.
    acc_sum, n = torch.tensor([0], dtype=torch.float32, device=device), 0
    for X, y in data_iter:
        # Copy the data to device.
        X, y = X.to(device), y.to(device)
        with torch.no_grad():
            y = y.long()
            acc_sum += torch.sum((torch.argmax(net(X), dim=1) == y))
            n += y.shape[0]
    return acc_sum.item()/n

In [14]:
def train_ch5(net, train_iter, test_iter, criterion, num_epochs, batch_size, device, lr=None):
    """Train and evaluate a model with CPU or GPU."""
    print('training on', device)
    net.to(device)
    optimizer = optim.SGD(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        net.train() # Switch to training mode
        n, start = 0, time.time()
        train_l_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        train_acc_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        for X, y in train_iter:
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device) 
            y_hat = net(X)
            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()
            with torch.no_grad():
                y = y.long()
                train_l_sum += loss.float()
                train_acc_sum += (torch.sum((torch.argmax(y_hat, dim=1) == y))).float()
                n += y.shape[0]

        test_acc = evaluate_accuracy(test_iter, net, device) 
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'\
            % (epoch + 1, train_l_sum/n, train_acc_sum/n, test_acc, time.time() - start))

In [15]:
lr, num_epochs, batch_size, device =  0.05, 5, 256, try_gpu()

#Xavier initialization of weights
def init_weights(m):
    if type(m) == nn.Linear or type(m) == nn.Conv2d:
        torch.nn.init.xavier_uniform_(m.weight)

In [16]:
net.apply(init_weights)
net = net.to(device)

In [17]:
#Loss Function Criterion
criterion = nn.CrossEntropyLoss()

train_ch5(net, train_iter, test_iter, criterion, num_epochs, batch_size, device, lr)

training on cuda:0
epoch 1, loss 0.0084, train acc 0.765, test acc 0.862, time 25.2 sec
epoch 2, loss 0.0025, train acc 0.880, test acc 0.888, time 25.8 sec
epoch 3, loss 0.0020, train acc 0.907, test acc 0.892, time 26.0 sec
epoch 4, loss 0.0019, train acc 0.911, test acc 0.873, time 26.0 sec
epoch 5, loss 0.0015, train acc 0.930, test acc 0.910, time 25.6 sec
