In [1]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms


Part a) ReLU reaches higher accuracy than sigmoid on both the training and the test set (99% vs. 85% after 5 epochs).<br>
ReLU gradients do not saturate for positive inputs, whereas sigmoid gradients saturate for large-magnitude inputs, which slows learning. ReLU also produces sparse activations, which has an induced regularizing effect. ReLU runs faster than sigmoid because it is simpler to compute.

In [2]:
class Net(nn.Module):
    """Baseline ReLU CNN: two conv/pool stages followed by two linear layers.

    The flatten size 4*4*50 used in ``forward`` corresponds to 1-channel
    28x28 inputs (28 -> 24 -> 12 after the first conv/pool stage,
    12 -> 8 -> 4 after the second).
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)

    def forward(self, x):
        """Return log-probabilities over the 10 outputs of ``fc2`` (softmax over dim 1)."""
        # Stage 1 and 2: convolution -> ReLU -> 2x2 max-pool with stride 2.
        stage1 = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2, stride=2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)
        # Collapse the 50 feature maps of size 4x4 into one row per sample.
        flat = stage2.view(-1, 4 * 4 * 50)
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

In [3]:
def train(model, device, train_loader, optimizer, epoch, log_interval=10):
    """Train ``model`` for one pass over ``train_loader`` using NLL loss.

    Args:
        model: module whose output is passed straight to ``F.nll_loss``
            (i.e. it is expected to produce log-probabilities).
        device: device each batch is moved to before the forward pass.
        train_loader: iterable of ``(data, target)`` batches. When progress
            is printed it must also support ``len()`` and expose
            ``.dataset``.
        optimizer: optimizer whose gradients are zeroed and which is stepped
            once per batch.
        epoch: epoch number; only used in the progress message.
        log_interval: print a progress line every ``log_interval`` batches.
            Defaults to 10, the value that used to be hard-coded. ``0`` or
            ``None`` disables the progress output.

    Returns:
        None. Progress is written to stdout.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        # A falsy interval (0 / None) switches logging off and also avoids
        # a modulo-by-zero.
        if log_interval and batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test( model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nAverage loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [4]:
# Seed the global PyTorch RNG and pick the GPU when one is available.
use_cuda = torch.cuda.is_available()
torch.manual_seed(1)

device = torch.device("cuda" if use_cuda else "cpu")

# The extra DataLoader options are only passed when CUDA is available.
kwargs = {}
if use_cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}

# Same preprocessing for both splits: convert to a tensor, then normalise
# with mean 0.1307 and std 0.3081.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Only the training split triggers a download; the test split reuses the
# files already placed under '../data'.
train_set = datasets.MNIST('../data', train=True, download=True,
                           transform=mnist_transform)
test_set = datasets.MNIST('../data', train=False, transform=mnist_transform)

train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=64, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=1000, shuffle=True, **kwargs)

In [5]:
# Number of training epochs; every experiment loop in this notebook iterates
# over range(1, epochs + 1).
epochs = 5

In [6]:
# Baseline experiment: ReLU network trained with SGD (lr 0.01, momentum 0.5).
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    # After each epoch, report loss/accuracy on the training and test splits.
    for split_label, loader in (("Train set", train_loader),
                                ("Test set", test_loader)):
        print(split_label)
        test(model, device, loader)

Train set

Average loss: 0.1108, Accuracy: 57968/60000 (97%)

Test set

Average loss: 0.1012, Accuracy: 9670/10000 (97%)



Train set

Average loss: 0.0641, Accuracy: 58844/60000 (98%)

Test set

Average loss: 0.0568, Accuracy: 9835/10000 (98%)

Train set

Average loss: 0.0526, Accuracy: 59041/60000 (98%)

Test set

Average loss: 0.0492, Accuracy: 9851/10000 (99%)



Train set

Average loss: 0.0375, Accuracy: 59349/60000 (99%)

Test set

Average loss: 0.0378, Accuracy: 9879/10000 (99%)

Train set

Average loss: 0.0382, Accuracy: 59315/60000 (99%)

Test set

Average loss: 0.0408, Accuracy: 9867/10000 (99%)



4a) sigmoid

In [7]:
class SigmoidNet(nn.Module):
    """Sigmoid-activated variant of the baseline CNN.

    Layout: conv(1->20, 5x5) -> sigmoid -> 2x2 max-pool ->
    conv(20->50, 5x5) -> sigmoid -> 2x2 max-pool -> flatten to 4*4*50 ->
    linear(800->500) -> sigmoid -> linear(500->10) -> log-softmax.
    """

    def __init__(self):
        super(SigmoidNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        """Return log-probabilities over the 10 outputs of ``fc2`` (softmax over dim 1)."""
        # torch.sigmoid replaces F.sigmoid: the functional alias was flagged
        # as deprecated by PyTorch 0.4.1-1.x releases and printed a warning on
        # every call. Both compute the same element-wise sigmoid.
        x = torch.sigmoid(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = torch.sigmoid(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        # Collapse the 50 feature maps of size 4x4 into one row per sample.
        x = x.view(-1, 4*4*50)
        x = torch.sigmoid(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

# Sigmoid experiment: same optimiser settings as the ReLU baseline.
model = SigmoidNet().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    # After each epoch, report loss/accuracy on the training and test splits.
    for split_label, loader in (("Train set", train_loader),
                                ("Test set", test_loader)):
        print(split_label)
        test(model, device, loader)





Train set

Average loss: 2.2983, Accuracy: 6265/60000 (10%)

Test set

Average loss: 2.2980, Accuracy: 1028/10000 (10%)



Train set

Average loss: 2.2663, Accuracy: 5974/60000 (10%)

Test set

Average loss: 2.2636, Accuracy: 989/10000 (10%)

Train set

Average loss: 1.6059, Accuracy: 33256/60000 (55%)

Test set

Average loss: 1.5928, Accuracy: 5615/10000 (56%)



Train set

Average loss: 0.7353, Accuracy: 47584/60000 (79%)

Test set

Average loss: 0.7196, Accuracy: 7974/10000 (80%)

Train set

Average loss: 0.5156, Accuracy: 50985/60000 (85%)

Test set

Average loss: 0.5028, Accuracy: 8529/10000 (85%)



part b)<br>
0.25:<br>
Train set<br>

Average loss: 0.0426, Accuracy: 59453/60000 (99%)<br>

Test set<br>

Average loss: 0.0416, Accuracy: 9914/10000 (99%)<br>

0.5<br>
Train set<br>

Average loss: 0.1161, Accuracy: 59428/60000 (99%)<br>

Test set<br>

Average loss: 0.1120, Accuracy: 9895/10000 (99%)<br>

0.75<br>
Train set<br>

Average loss: 0.4457, Accuracy: 59102/60000 (99%)<br>

Test set<br>

Average loss: 0.4329, Accuracy: 9869/10000 (99%)<br>

1<br>
Train set<br>

Average loss: 2.3013, Accuracy: 6661/60000 (11%)<br>

Test set<br>

Average loss: 2.3018, Accuracy: 1128/10000 (11%)<br>

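The error is highest at dropout probability 1. This is expected: with p = 1 every activation (not weight) is dropped during training, so the network cannot learn and stays at chance level (about 11%). Dropout adds noise (variance) to training; if the dropout probability is increased too far, this variance grows a lot, the model effectively becomes less complex and it is no longer able to learn properly. This breakdown only happens beyond a threshold: the average loss rises steadily with the dropout probability (0.04 → 0.11 → 0.43), while test accuracy decreases only slightly (99.14% → 98.95% → 98.69%) until it collapses at p = 1. The best dropout setting is 0.25.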

In [10]:
class DropoutReluNet(nn.Module):
    """ReLU CNN with a single dropout module reused after every hidden activation.

    Args:
        P: dropout probability handed to ``nn.Dropout``.
    """

    def __init__(self, P):
        super(DropoutReluNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        self.dropout = nn.Dropout(p=P)

    def forward(self, x):
        """Return log-probabilities over the 10 outputs of ``fc2`` (softmax over dim 1)."""
        # Both conv stages share the same shape: conv -> ReLU -> dropout ->
        # 2x2 max-pool. Dropout is applied before pooling.
        for conv in (self.conv1, self.conv2):
            activated = F.relu(conv(x))
            x = F.max_pool2d(self.dropout(activated), 2, 2)
        # Collapse the 50 feature maps of size 4x4 into one row per sample.
        flat = x.view(-1, 4 * 4 * 50)
        hidden = self.dropout(F.relu(self.fc1(flat)))
        return F.log_softmax(self.fc2(hidden), dim=1)

In [11]:
# Dropout sweep: train a fresh DropoutReluNet for each probability.
for P in (0.25, 0.5, 0.75, 1):
    print("dropout probability = ", P)
    model = DropoutReluNet(P).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer, epoch)
        # After each epoch, report loss/accuracy on both splits.
        for split_label, loader in (("Train set", train_loader),
                                    ("Test set", test_loader)):
            print(split_label)
            test(model, device, loader)

dropout probability =  0.25
Train set

Average loss: 0.1384, Accuracy: 58394/60000 (97%)

Test set

Average loss: 0.1297, Accuracy: 9764/10000 (98%)



Train set

Average loss: 0.0946, Accuracy: 58902/60000 (98%)

Test set

Average loss: 0.0903, Accuracy: 9817/10000 (98%)

Train set

Average loss: 0.0648, Accuracy: 59160/60000 (99%)

Test set

Average loss: 0.0608, Accuracy: 9875/10000 (99%)



Train set

Average loss: 0.0532, Accuracy: 59271/60000 (99%)

Test set

Average loss: 0.0502, Accuracy: 9881/10000 (99%)

Train set

Average loss: 0.0426, Accuracy: 59453/60000 (99%)

Test set

Average loss: 0.0416, Accuracy: 9914/10000 (99%)

dropout probability =  0.5


Train set

Average loss: 0.2951, Accuracy: 58471/60000 (97%)

Test set

Average loss: 0.2795, Accuracy: 9752/10000 (98%)



Train set

Average loss: 0.1958, Accuracy: 58978/60000 (98%)

Test set

Average loss: 0.1851, Accuracy: 9837/10000 (98%)

Train set

Average loss: 0.1699, Accuracy: 59160/60000 (99%)

Test set

Average loss: 0.1620, Accuracy: 9872/10000 (99%)



Train set

Average loss: 0.1553, Accuracy: 59278/60000 (99%)

Test set

Average loss: 0.1493, Accuracy: 9888/10000 (99%)

Train set

Average loss: 0.1161, Accuracy: 59428/60000 (99%)

Test set

Average loss: 0.1120, Accuracy: 9895/10000 (99%)

dropout probability =  0.75


Train set

Average loss: 0.7089, Accuracy: 57875/60000 (96%)

Test set

Average loss: 0.6897, Accuracy: 9677/10000 (97%)

Train set

Average loss: 0.5911, Accuracy: 58482/60000 (97%)

Test set

Average loss: 0.5743, Accuracy: 9773/10000 (98%)



Train set

Average loss: 0.5076, Accuracy: 58824/60000 (98%)

Test set

Average loss: 0.4915, Accuracy: 9824/10000 (98%)



Train set

Average loss: 0.4931, Accuracy: 59013/60000 (98%)

Test set

Average loss: 0.4802, Accuracy: 9850/10000 (98%)

Train set

Average loss: 0.4457, Accuracy: 59102/60000 (99%)

Test set

Average loss: 0.4329, Accuracy: 9869/10000 (99%)

dropout probability =  1


Train set

Average loss: 2.3013, Accuracy: 7329/60000 (12%)

Test set

Average loss: 2.3017, Accuracy: 1234/10000 (12%)

Train set

Average loss: 2.3009, Accuracy: 8051/60000 (13%)

Test set

Average loss: 2.3015, Accuracy: 1363/10000 (14%)



Train set

Average loss: 2.3011, Accuracy: 7627/60000 (13%)

Test set

Average loss: 2.3017, Accuracy: 1280/10000 (13%)



Train set

Average loss: 2.3010, Accuracy: 7902/60000 (13%)

Test set

Average loss: 2.3014, Accuracy: 1328/10000 (13%)

Train set

Average loss: 2.3013, Accuracy: 6661/60000 (11%)

Test set

Average loss: 2.3018, Accuracy: 1128/10000 (11%)



partc)Batch norm + dropout(0.25):<br>
Train set<br>

Average loss: 0.0279, Accuracy: 59598/60000 (99%)<br>

Test set<br>

Average loss: 0.0318, Accuracy: 9915/10000 (99%)<br>

Only Batch norm:<br>
Train set:<br>
Average loss: 0.0146, Accuracy: 59806/60000 (100%)<br>

Test set<br>

Average loss: 0.0287, Accuracy: 9912/10000 (99%)<br>

Both configurations perform nearly the same, because batch normalization and dropout are both regularization methods and partly do each other's job. Many papers
claim that one can stand in for the other, but a lot of researchers still use both because the combination tends to give slightly
better performance. Both act as regularizers; when they are used together, slightly stronger regularization
occurs because the noise/randomness is introduced from two different sources.



In [16]:
class BNDropoutReluNet(nn.Module):
    """ReLU CNN with batch normalisation and a shared dropout module.

    Each hidden layer is followed by ReLU, then batch norm, then dropout
    (batch norm is applied after the activation here).

    Args:
        P: dropout probability handed to ``nn.Dropout``.
    """

    def __init__(self, P):
        super(BNDropoutReluNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv1_bn = nn.BatchNorm2d(num_features=20)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.conv2_bn = nn.BatchNorm2d(num_features=50)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc1_bn = nn.BatchNorm1d(num_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        self.dropout = nn.Dropout(p=P)

    def forward(self, x):
        """Return log-probabilities over the 10 outputs of ``fc2`` (softmax over dim 1)."""
        # Stage 1: conv -> ReLU -> batch norm -> dropout -> 2x2 max-pool.
        stage1 = self.conv1_bn(F.relu(self.conv1(x)))
        stage1 = F.max_pool2d(self.dropout(stage1), 2, 2)
        # Stage 2: same pattern with the second conv / batch-norm pair.
        stage2 = self.conv2_bn(F.relu(self.conv2(stage1)))
        stage2 = F.max_pool2d(self.dropout(stage2), 2, 2)
        # Collapse the 50 feature maps of size 4x4 into one row per sample.
        flat = stage2.view(-1, 4 * 4 * 50)
        hidden = self.dropout(self.fc1_bn(F.relu(self.fc1(flat))))
        return F.log_softmax(self.fc2(hidden), dim=1)

In [17]:
# Batch norm + dropout (p = 0.25) experiment.
model = BNDropoutReluNet(0.25).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    # After each epoch, report loss/accuracy on the training and test splits.
    for split_label, loader in (("Train set", train_loader),
                                ("Test set", test_loader)):
        print(split_label)
        test(model, device, loader)

Train set

Average loss: 0.0812, Accuracy: 58899/60000 (98%)

Test set

Average loss: 0.0760, Accuracy: 9809/10000 (98%)



Train set

Average loss: 0.0487, Accuracy: 59331/60000 (99%)

Test set

Average loss: 0.0467, Accuracy: 9890/10000 (99%)

Train set

Average loss: 0.0382, Accuracy: 59443/60000 (99%)

Test set

Average loss: 0.0389, Accuracy: 9894/10000 (99%)



Train set

Average loss: 0.0330, Accuracy: 59492/60000 (99%)

Test set

Average loss: 0.0363, Accuracy: 9898/10000 (99%)

Train set

Average loss: 0.0279, Accuracy: 59598/60000 (99%)

Test set

Average loss: 0.0318, Accuracy: 9915/10000 (99%)



In [18]:
class BNNet(nn.Module):
    """ReLU CNN with batch normalisation after every hidden activation (no dropout)."""

    def __init__(self):
        super(BNNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv1_bn = nn.BatchNorm2d(num_features=20)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.conv2_bn = nn.BatchNorm2d(num_features=50)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc1_bn = nn.BatchNorm1d(num_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)

    def forward(self, x):
        """Return log-probabilities over the 10 outputs of ``fc2`` (softmax over dim 1)."""
        # Each conv stage: conv -> ReLU -> batch norm -> 2x2 max-pool.
        stages = ((self.conv1, self.conv1_bn), (self.conv2, self.conv2_bn))
        for conv, norm in stages:
            x = F.max_pool2d(norm(F.relu(conv(x))), 2, 2)
        # Collapse the 50 feature maps of size 4x4 into one row per sample.
        flat = x.view(-1, 4 * 4 * 50)
        hidden = self.fc1_bn(F.relu(self.fc1(flat)))
        return F.log_softmax(self.fc2(hidden), dim=1)

In [19]:
# Batch-norm-only experiment.
model = BNNet().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    # After each epoch, report loss/accuracy on the training and test splits.
    for split_label, loader in (("Train set", train_loader),
                                ("Test set", test_loader)):
        print(split_label)
        test(model, device, loader)

Train set

Average loss: 0.0538, Accuracy: 59215/60000 (99%)

Test set

Average loss: 0.0525, Accuracy: 9859/10000 (99%)



Train set

Average loss: 0.0312, Accuracy: 59564/60000 (99%)

Test set

Average loss: 0.0349, Accuracy: 9897/10000 (99%)

Train set

Average loss: 0.0234, Accuracy: 59681/60000 (99%)

Test set

Average loss: 0.0322, Accuracy: 9899/10000 (99%)



Train set

Average loss: 0.0170, Accuracy: 59773/60000 (100%)

Test set

Average loss: 0.0276, Accuracy: 9917/10000 (99%)

Train set

Average loss: 0.0146, Accuracy: 59806/60000 (100%)

Test set

Average loss: 0.0287, Accuracy: 9912/10000 (99%)



Part 4d)<br>
Batch normalization does help in the case of Xavier initialization: the final test accuracy is 98% (9817/10000) without batch normalization and 99% (9891/10000) with it. Batch
normalization acts as a regularizer, which improves accuracy. With batch normalization, Xavier and Kaiming initialization give nearly the same test error; Kaiming gives very slightly better accuracy (9897/10000 vs. 9891/10000). Kaiming initialization takes the activation function into account, while Xavier does not.


In [20]:
class BNXavierNet(nn.Module):
    """Batch-norm ReLU CNN whose conv/linear weights use Xavier-normal initialisation.

    Only the ``weight`` tensors are re-initialised; biases keep the values
    given by the layer constructors.
    """

    def __init__(self):
        super(BNXavierNet, self).__init__()
        # Every layer is re-initialised right after it is built, before the
        # next layer is constructed, so random numbers are drawn in a fixed
        # order for a given seed.
        conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        nn.init.xavier_normal_(conv1.weight)
        self.conv1 = conv1
        self.conv1_bn = nn.BatchNorm2d(num_features=20)

        conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        nn.init.xavier_normal_(conv2.weight)
        self.conv2 = conv2
        self.conv2_bn = nn.BatchNorm2d(num_features=50)

        fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        nn.init.xavier_normal_(fc1.weight)
        self.fc1 = fc1
        self.fc1_bn = nn.BatchNorm1d(num_features=500)

        fc2 = nn.Linear(in_features=500, out_features=10)
        nn.init.xavier_normal_(fc2.weight)
        self.fc2 = fc2

    def forward(self, x):
        """Return log-probabilities over the 10 outputs of ``fc2`` (softmax over dim 1)."""
        # Each conv stage: conv -> ReLU -> batch norm -> 2x2 max-pool.
        stage1 = F.max_pool2d(self.conv1_bn(F.relu(self.conv1(x))), 2, 2)
        stage2 = F.max_pool2d(self.conv2_bn(F.relu(self.conv2(stage1))), 2, 2)
        # Collapse the 50 feature maps of size 4x4 into one row per sample.
        flat = stage2.view(-1, 4 * 4 * 50)
        hidden = self.fc1_bn(F.relu(self.fc1(flat)))
        return F.log_softmax(self.fc2(hidden), dim=1)

In [21]:
# Batch norm + Xavier initialisation experiment.
model = BNXavierNet().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    # After each epoch, report loss/accuracy on the training and test splits.
    for split_label, loader in (("Train set", train_loader),
                                ("Test set", test_loader)):
        print(split_label)
        test(model, device, loader)

Train set

Average loss: 0.0538, Accuracy: 59163/60000 (99%)

Test set

Average loss: 0.0554, Accuracy: 9854/10000 (99%)



Train set

Average loss: 0.0341, Accuracy: 59468/60000 (99%)

Test set

Average loss: 0.0401, Accuracy: 9881/10000 (99%)

Train set

Average loss: 0.0272, Accuracy: 59622/60000 (99%)

Test set

Average loss: 0.0355, Accuracy: 9898/10000 (99%)



Train set

Average loss: 0.0217, Accuracy: 59679/60000 (99%)

Test set

Average loss: 0.0329, Accuracy: 9901/10000 (99%)

Train set

Average loss: 0.0228, Accuracy: 59666/60000 (99%)

Test set

Average loss: 0.0362, Accuracy: 9891/10000 (99%)



In [22]:
class BNkaimingNet(nn.Module):
    """Batch-norm ReLU CNN whose conv/linear weights use Kaiming-normal initialisation.

    ``nn.init.kaiming_normal_`` is called with its default arguments and only
    on the ``weight`` tensors; biases keep the values given by the layer
    constructors.
    """

    def __init__(self):
        super(BNkaimingNet, self).__init__()
        # Every layer is re-initialised right after it is built, before the
        # next layer is constructed, so random numbers are drawn in a fixed
        # order for a given seed.
        conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        nn.init.kaiming_normal_(conv1.weight)
        self.conv1 = conv1
        self.conv1_bn = nn.BatchNorm2d(num_features=20)

        conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        nn.init.kaiming_normal_(conv2.weight)
        self.conv2 = conv2
        self.conv2_bn = nn.BatchNorm2d(num_features=50)

        fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        nn.init.kaiming_normal_(fc1.weight)
        self.fc1 = fc1
        self.fc1_bn = nn.BatchNorm1d(num_features=500)

        fc2 = nn.Linear(in_features=500, out_features=10)
        nn.init.kaiming_normal_(fc2.weight)
        self.fc2 = fc2

    def forward(self, x):
        """Return log-probabilities over the 10 outputs of ``fc2`` (softmax over dim 1)."""
        # Each conv stage: conv -> ReLU -> batch norm -> 2x2 max-pool.
        stages = ((self.conv1, self.conv1_bn), (self.conv2, self.conv2_bn))
        for conv, norm in stages:
            x = F.max_pool2d(norm(F.relu(conv(x))), 2, 2)
        # Collapse the 50 feature maps of size 4x4 into one row per sample.
        flat = x.view(-1, 4 * 4 * 50)
        hidden = self.fc1_bn(F.relu(self.fc1(flat)))
        return F.log_softmax(self.fc2(hidden), dim=1)

In [23]:
# Batch norm + Kaiming initialisation experiment.
model = BNkaimingNet().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    # After each epoch, report loss/accuracy on the training and test splits.
    for split_label, loader in (("Train set", train_loader),
                                ("Test set", test_loader)):
        print(split_label)
        test(model, device, loader)

Train set

Average loss: 0.0835, Accuracy: 58680/60000 (98%)

Test set

Average loss: 0.0821, Accuracy: 9783/10000 (98%)



Train set

Average loss: 0.0533, Accuracy: 59165/60000 (99%)

Test set

Average loss: 0.0558, Accuracy: 9835/10000 (98%)

Train set

Average loss: 0.0405, Accuracy: 59363/60000 (99%)

Test set

Average loss: 0.0467, Accuracy: 9865/10000 (99%)



Train set

Average loss: 0.0322, Accuracy: 59518/60000 (99%)

Test set

Average loss: 0.0412, Accuracy: 9887/10000 (99%)

Train set

Average loss: 0.0264, Accuracy: 59616/60000 (99%)

Test set

Average loss: 0.0374, Accuracy: 9897/10000 (99%)



In [25]:
class XavierNet(nn.Module):
    """ReLU CNN with Xavier-normal weight initialisation and no batch normalisation.

    Only the ``weight`` tensors are re-initialised; biases keep the values
    given by the layer constructors.
    """

    def __init__(self):
        super(XavierNet, self).__init__()
        # Every layer is re-initialised right after it is built, before the
        # next layer is constructed, so random numbers are drawn in a fixed
        # order for a given seed.
        conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        nn.init.xavier_normal_(conv1.weight)
        self.conv1 = conv1

        conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        nn.init.xavier_normal_(conv2.weight)
        self.conv2 = conv2

        fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        nn.init.xavier_normal_(fc1.weight)
        self.fc1 = fc1

        fc2 = nn.Linear(in_features=500, out_features=10)
        nn.init.xavier_normal_(fc2.weight)
        self.fc2 = fc2

    def forward(self, x):
        """Return log-probabilities over the 10 outputs of ``fc2`` (softmax over dim 1)."""
        # Each conv stage: conv -> ReLU -> 2x2 max-pool with stride 2.
        stage1 = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2, stride=2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)
        # Collapse the 50 feature maps of size 4x4 into one row per sample.
        flat = stage2.view(-1, 4 * 4 * 50)
        hidden = F.relu(self.fc1(flat))
        return F.log_softmax(self.fc2(hidden), dim=1)

In [26]:
# Xavier initialisation without batch norm.
model = XavierNet().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    # After each epoch, report loss/accuracy on the training and test splits.
    for split_label, loader in (("Train set", train_loader),
                                ("Test set", test_loader)):
        print(split_label)
        test(model, device, loader)

Train set

Average loss: 0.1251, Accuracy: 57549/60000 (96%)

Test set

Average loss: 0.1159, Accuracy: 9607/10000 (96%)



Train set

Average loss: 0.0579, Accuracy: 58919/60000 (98%)

Test set

Average loss: 0.0557, Accuracy: 9815/10000 (98%)

Train set

Average loss: 0.0429, Accuracy: 59241/60000 (99%)

Test set

Average loss: 0.0475, Accuracy: 9845/10000 (98%)



Train set

Average loss: 0.0340, Accuracy: 59385/60000 (99%)

Test set

Average loss: 0.0365, Accuracy: 9884/10000 (99%)

Train set

Average loss: 0.0402, Accuracy: 59249/60000 (99%)

Test set

Average loss: 0.0557, Accuracy: 9817/10000 (98%)

