In [1]:
import time 

import torch 
import torch.nn as nn
import torch.optim as optim 

* a careless use of dense(= Linear) layers might give up the spatial structure of the representation entirely
* Network in Network (NiN) blocks offer an alternative.

* ```NiN``` Blocks 
* The idea behind ```NiN``` is to apply a fully-connected layer at each pixel location (for each height and width).
* If we tie the weights across each spatial location, we could think of this as a $1\times 1$ convolutional layer, or as a fully-connected layer acting independently on each pixel location

***

* The ```NiN``` block consists of one convolutional layer followed by two $1\times 1$ convolutional layers that act as per-pixel fully-connected layers with ReLU activations.

* nxn Conv, #out_channels 
* ReLU
* 1x1 Conv, #out_channels
* ReLU
* 1x1 Conv, #out_channels
* ReLU

In [2]:
def nin_block(in_channels, out_channels, kernel_size, strides, padding):
    blk = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size, strides, padding),
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), 
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), 
        nn.ReLU(),   
        )
    return blk

***

* NiN Model 
* ```NiN``` avoids dense connections altogether to avoid overfitting
* Instead, ```NiN``` uses an ```NiN``` block with a number of output channels equal to the number of label classes, followed by a global average pooling layer, yielding a vector of logits.

***

* Advantage of NiN's design : it significantly reduces the number of required model parameters.
* Disadvantage :  in practice, this design sometimes requires increased model training time.

In [3]:
class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)

* NiN_block, 
* 3x3 MaxPool, s=2 
* NiN_block, 
* 3x3 MaxPool, s=2 
* NiN_block, 
* 3x3 MaxPool, s=2 
* Dropout, 50% 
* NiN_block, 
* AdaptiveMaxPool 
* Flatten 

In [4]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.n1 = nin_block(1,out_channels=96, kernel_size=11, strides=4, padding=0)
        self.m1 = nn.MaxPool2d(3,stride=2)
        self.n2 = nin_block(96,out_channels=256, kernel_size=5, strides=1, padding=2)
        self.m2 = nn.MaxPool2d(3,stride=2)
        self.n3 = nin_block(256,out_channels=384, kernel_size=3, strides=1, padding=1)
        self.m3 = nn.MaxPool2d(3,stride=2)
        self.dropout1 = nn.Dropout2d(0.5)
        self.n4 = nin_block(384,out_channels=10, kernel_size=3, strides=1, padding=1)
        #Global Average Pooling can be achieved by AdaptiveMaxPool2d with output size = (1,1)
        self.avg1 = nn.AdaptiveMaxPool2d((1,1))
        self.flat = Flatten()
        
    def forward(self, x): 
        x = self.m1(self.n1(x))
        x = self.m2(self.n2(x))
        x = self.dropout1(self.m3(self.n3(x)))
        x = self.n4(x)
        x = self.avg1(x)
        x = self.flat(x) 
        return x

In [5]:
net = Net()

***
* Create a data example to see the output shape of each block.

In [6]:
X = torch.rand(size=(1,1,224,224))  # [B, C, H, W]

for layer in net.children():
    X = layer(X)
    print(layer.__class__.__name__,'output shape:\t', X.shape)

Sequential output shape:	 torch.Size([1, 96, 54, 54])
MaxPool2d output shape:	 torch.Size([1, 96, 26, 26])
Sequential output shape:	 torch.Size([1, 256, 26, 26])
MaxPool2d output shape:	 torch.Size([1, 256, 12, 12])
Sequential output shape:	 torch.Size([1, 384, 12, 12])
MaxPool2d output shape:	 torch.Size([1, 384, 5, 5])
Dropout2d output shape:	 torch.Size([1, 384, 5, 5])
Sequential output shape:	 torch.Size([1, 10, 5, 5])
AdaptiveMaxPool2d output shape:	 torch.Size([1, 10, 1, 1])
Flatten output shape:	 torch.Size([1, 10])


***

* Data Acquisition 
* Reading Data (Fashion-MNIST)
* Preprocess: Fashion-MNIST has 28x28 pixels -> upsample them to 244x244

In [7]:
import sys 
import os 

import torchvision 
from torchvision import transforms 
from torch.utils.data import DataLoader 

def load_data_fashion_mnist(batch_size, resize=None, root=os.path.join(os.getcwd(), 'datasets', 'fashion-mnist')):
    """Download the Fashion-MNIST dataset and then load into memory."""
    root = os.path.expanduser(root)
    transformer = []
    if resize:
        transformer += [transforms.Resize(resize)]
    transformer += [transforms.ToTensor()]
    transformer = transforms.Compose(transformer)

    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, transform=transformer, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, transform=transformer, download=True)
    num_workers = 0 if sys.platform.startswith('win32') else 4

    train_iter = DataLoader(mnist_train, batch_size, shuffle=True, num_workers=num_workers)
    test_iter = DataLoader(mnist_test, batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter

In [8]:
batch_size = 128
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224)

***

* Model Training 

In [9]:
def try_gpu():
    """If GPU is available, return torch.device as cuda:0; else return torch.device as cpu."""
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    return device

In [10]:
def evaluate_accuracy(data_iter, net, device=torch.device('cpu')):
    """Evaluate accuracy of a model on the given data set."""
    net.eval()  # Switch to evaluation mode for Dropout, BatchNorm etc layers.
    acc_sum, n = torch.tensor([0], dtype=torch.float32, device=device), 0
    for X, y in data_iter:
        # Copy the data to device.
        X, y = X.to(device), y.to(device)
        with torch.no_grad():
            y = y.long()
            acc_sum += torch.sum((torch.argmax(net(X), dim=1) == y))
            n += y.shape[0]
    return acc_sum.item()/n

In [11]:
def train_ch5(net, train_iter, test_iter, criterion, num_epochs, batch_size, device, lr=None):
    """Train and evaluate a model with CPU or GPU."""
    print('training on', device)
    net.to(device)
    optimizer = optim.SGD(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        net.train() # Switch to training mode
        n, start = 0, time.time()
        train_l_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        train_acc_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        for X, y in train_iter:
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device) 
            y_hat = net(X)
            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()
            with torch.no_grad():
                y = y.long()
                train_l_sum += loss.float()
                train_acc_sum += (torch.sum((torch.argmax(y_hat, dim=1) == y))).float()
                n += y.shape[0]

        test_acc = evaluate_accuracy(test_iter, net, device) 
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'\
            % (epoch + 1, train_l_sum/n, train_acc_sum/n, test_acc, time.time() - start))

In [12]:
lr, num_epochs, batch_size, device =  0.1, 5, 128, try_gpu()

#Xavier initialization of weights
def init_weights(m):
    if type(m) == nn.Linear or type(m) == nn.Conv2d:
        torch.nn.init.xavier_uniform_(m.weight)

In [13]:
net.apply(init_weights)
net = net.to(device)

In [14]:
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224)

criterion = nn.CrossEntropyLoss()

train_ch5(net, train_iter, test_iter, criterion, num_epochs, batch_size, device, lr)

training on cuda:0
epoch 1, loss 0.0166, train acc 0.222, test acc 0.460, time 41.5 sec
epoch 2, loss 0.0078, train acc 0.640, test acc 0.750, time 41.6 sec
epoch 3, loss 0.0048, train acc 0.765, test acc 0.765, time 41.9 sec
epoch 4, loss 0.0041, train acc 0.802, test acc 0.824, time 42.1 sec
epoch 5, loss 0.0037, train acc 0.821, test acc 0.843, time 42.1 sec
