# Multi-Layer Perceptron

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torchvision as tv
import numpy as np
import pandas as pd
import time

## Data

In [3]:
# Mini-batch size shared by the train and test DataLoaders.
BATCH_SIZE = 256

In [4]:
# Fetch both MNIST splits into the current directory (downloaded when missing).
# ToTensor turns every image into a tensor.
train_dataset = tv.datasets.MNIST(
    root='.', train=True, download=True, transform=tv.transforms.ToTensor()
)
test_dataset = tv.datasets.MNIST(
    root='.', train=False, download=True, transform=tv.transforms.ToTensor()
)

In [5]:
# Display the summary of the training split.
train_dataset

Dataset MNIST
    Number of datapoints: 60000
    Root location: .
    Split: Train
    StandardTransform
Transform: ToTensor()

In [6]:
# Display the summary of the test split.
test_dataset

Dataset MNIST
    Number of datapoints: 10000
    Root location: .
    Split: Test
    StandardTransform
Transform: ToTensor()

In [7]:
# Wrap the datasets in mini-batch iterators of BATCH_SIZE samples.
# The training loader reshuffles on every epoch so the optimizer does not see
# the exact same batches in the exact same order each time; the test loader
# keeps dataset order because evaluation does not need a random order.
train = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [8]:
# Display the training DataLoader object.
train

<torch.utils.data.dataloader.DataLoader at 0x7ffa59d5bbe0>

In [9]:
# Display the test DataLoader object.
test

<torch.utils.data.dataloader.DataLoader at 0x7ffa08e9bd60>

In [10]:
# Shape of the first training image tensor (element 0 of the first sample).
train_dataset[0][0].shape

torch.Size([1, 28, 28])

## Model SGD

In [11]:
# Single-hidden-layer MLP: flatten each image to 784 features,
# project to 256 hidden units, then to 10 class scores.
model = torch.nn.Sequential(
    torch.nn.Flatten(start_dim=1, end_dim=-1),
    torch.nn.Linear(in_features=784, out_features=256),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=256, out_features=10),
)

In [12]:
# Display the layer stack of the model.
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=256, bias=True)
  (2): ReLU()
  (3): Linear(in_features=256, out_features=10, bias=True)
)

In [13]:
# Training configuration for the first experiment.
num_epochs = 10  # number of passes over the training loader
loss = torch.nn.CrossEntropyLoss()  # criterion applied to the model outputs
trainer = torch.optim.SGD(params=model.parameters(), lr=0.01)  # SGD optimizer

## Train Model

In [14]:
def train_model():
    """Train the notebook's current ``model`` and print per-epoch metrics.

    Reads these notebook-level globals at call time: ``model``, ``trainer``
    (the optimizer), ``loss`` (the criterion), ``train`` and ``test`` (the
    batch iterables) and ``num_epochs``.

    For every epoch it runs one optimisation pass over ``train`` and one
    evaluation pass over ``test``, then prints the elapsed time (covering
    both passes), the mean per-batch loss and the accuracy of each split.
    Returns ``None``.
    """
    for ep in range(num_epochs):
        start = time.time()

        # ---- optimisation pass ----
        train_iters, train_passed = 0, 0
        train_loss, train_acc = 0., 0.
        model.train()
        for X, y in train:
            trainer.zero_grad()
            y_pred = model(X)
            batch_loss = loss(y_pred, y)
            batch_loss.backward()
            trainer.step()
            train_loss += batch_loss.item()
            train_acc += (y_pred.argmax(dim=1) == y).sum().item()
            train_iters += 1
            train_passed += len(X)

        # ---- evaluation pass ----
        test_iters, test_passed = 0, 0
        test_loss, test_acc = 0., 0.
        model.eval()
        # No gradients are needed here; disabling autograd avoids building a
        # graph for every test batch (less memory, faster forward passes).
        with torch.no_grad():
            for X, y in test:
                y_pred = model(X)
                batch_loss = loss(y_pred, y)
                test_loss += batch_loss.item()
                test_acc += (y_pred.argmax(dim=1) == y).sum().item()
                test_iters += 1
                test_passed += len(X)

        # Losses are averaged over batches, accuracies over samples.
        print("ep: {}, taked: {:.3f}, train_loss: {}, train_acc: {}, test_loss: {}, test_acc: {}".format(
            ep, time.time() - start, train_loss / train_iters, train_acc / train_passed,
            test_loss / test_iters, test_acc / test_passed
        ))

In [15]:
# Train the current model with the SGD optimizer configured earlier.
train_model()

ep: 0, taked: 14.707, train_loss: 2.0181229170332564, train_acc: 0.5749166666666666, test_loss: 1.6504723697900772, test_acc: 0.7273
ep: 1, taked: 13.311, train_loss: 1.304376049751931, train_acc: 0.7740166666666667, test_loss: 0.9878777593374253, test_acc: 0.8205
ep: 2, taked: 12.455, train_loss: 0.8506758371566204, train_acc: 0.8306333333333333, test_loss: 0.7042370028793812, test_acc: 0.8493
ep: 3, taked: 12.572, train_loss: 0.6561351518681708, train_acc: 0.8526333333333334, test_loss: 0.5734108697623015, test_acc: 0.8681
ep: 4, taked: 12.199, train_loss: 0.5573848199337087, train_acc: 0.8661833333333333, test_loss: 0.500043573230505, test_acc: 0.879
ep: 5, taked: 12.251, train_loss: 0.4983596504368681, train_acc: 0.8749166666666667, test_loss: 0.45347047634422777, test_acc: 0.8866
ep: 6, taked: 12.923, train_loss: 0.4592119977829304, train_acc: 0.8811666666666667, test_loss: 0.42140906769782305, test_acc: 0.8905
ep: 7, taked: 13.306, train_loss: 0.43133462242623594, train_acc: 0.88

## Model Adam

In [16]:
# Rebuild the single-hidden-layer MLP so the Adam run starts from newly
# initialised weights.
model = torch.nn.Sequential(
    torch.nn.Flatten(start_dim=1, end_dim=-1),
    torch.nn.Linear(in_features=784, out_features=256),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=256, out_features=10),
)

In [17]:
# Bind a new Adam optimizer to the rebuilt model's parameters, then train.
trainer = torch.optim.Adam(params=model.parameters(), lr=0.01)
train_model()

ep: 0, taked: 15.809, train_loss: 0.25115921987181017, train_acc: 0.9245166666666667, test_loss: 0.153419107478112, test_acc: 0.9504
ep: 1, taked: 15.216, train_loss: 0.10508307482294263, train_acc: 0.9677166666666667, test_loss: 0.12418330510845407, test_acc: 0.9619
ep: 2, taked: 15.581, train_loss: 0.07366696313776551, train_acc: 0.9767333333333333, test_loss: 0.1205400757418829, test_acc: 0.963
ep: 3, taked: 15.796, train_loss: 0.05851514028147497, train_acc: 0.9814666666666667, test_loss: 0.11481115016940749, test_acc: 0.9701
ep: 4, taked: 15.415, train_loss: 0.04928975769397902, train_acc: 0.9838, test_loss: 0.10899295777126099, test_acc: 0.9719
ep: 5, taked: 15.571, train_loss: 0.052350953335259506, train_acc: 0.9838666666666667, test_loss: 0.1586362271278631, test_acc: 0.9648
ep: 6, taked: 17.223, train_loss: 0.05359427840696608, train_acc: 0.9839666666666667, test_loss: 0.12840138812025542, test_acc: 0.9706
ep: 7, taked: 15.887, train_loss: 0.047459622297777816, train_acc: 0.98

## Adding more layers

In [18]:
# Deeper MLP: three hidden layers narrowing 512 -> 256 -> 128 before the
# 10-way output layer.
model = torch.nn.Sequential(
    torch.nn.Flatten(start_dim=1, end_dim=-1),
    torch.nn.Linear(in_features=784, out_features=512),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=512, out_features=256),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=256, out_features=128),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=128, out_features=10),
)

In [19]:
# Bind a new Adam optimizer to the deeper model's parameters, then train.
trainer = torch.optim.Adam(params=model.parameters(), lr=0.01)
train_model()

ep: 0, taked: 18.982, train_loss: 0.32617686742797813, train_acc: 0.89895, test_loss: 0.17016005455516278, test_acc: 0.9499
ep: 1, taked: 17.734, train_loss: 0.1267260323278606, train_acc: 0.9630666666666666, test_loss: 0.13147492075513584, test_acc: 0.9636
ep: 2, taked: 18.193, train_loss: 0.09873220189771753, train_acc: 0.9714666666666667, test_loss: 0.13430968879838473, test_acc: 0.9652
ep: 3, taked: 17.910, train_loss: 0.08459107848875067, train_acc: 0.9754333333333334, test_loss: 0.14651140395071707, test_acc: 0.9601
ep: 4, taked: 16.558, train_loss: 0.07534584165748923, train_acc: 0.9785, test_loss: 0.3627998813055456, test_acc: 0.9442
ep: 5, taked: 16.561, train_loss: 0.08313365718151661, train_acc: 0.9775166666666667, test_loss: 0.1357090719859116, test_acc: 0.9678
ep: 6, taked: 19.245, train_loss: 0.06181825435660938, train_acc: 0.9823833333333334, test_loss: 0.14611405374016612, test_acc: 0.9709
ep: 7, taked: 18.888, train_loss: 0.05798360816539919, train_acc: 0.9837666666666

## Batch Normalisation

In [20]:
# Same three-hidden-layer MLP, with a BatchNorm1d layer placed after each
# hidden ReLU activation.
model = torch.nn.Sequential(
    torch.nn.Flatten(start_dim=1, end_dim=-1),
    torch.nn.Linear(in_features=784, out_features=512),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(num_features=512),
    torch.nn.Linear(in_features=512, out_features=256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(num_features=256),
    torch.nn.Linear(in_features=256, out_features=128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(num_features=128),
    torch.nn.Linear(in_features=128, out_features=10),
)

In [21]:
# Bind a new Adam optimizer to the batch-normalised model's parameters, then train.
trainer = torch.optim.Adam(params=model.parameters(), lr=0.01)
train_model()

ep: 0, taked: 19.253, train_loss: 0.19999790472236084, train_acc: 0.9384333333333333, test_loss: 0.14562059161253274, test_acc: 0.9531
ep: 1, taked: 18.767, train_loss: 0.09916448968046523, train_acc: 0.9702, test_loss: 0.13282584360567853, test_acc: 0.9605
ep: 2, taked: 19.840, train_loss: 0.070722065521206, train_acc: 0.9774, test_loss: 0.10620860806229757, test_acc: 0.9674
ep: 3, taked: 19.221, train_loss: 0.05435452413685778, train_acc: 0.98255, test_loss: 0.10927800853096414, test_acc: 0.9668
ep: 4, taked: 18.571, train_loss: 0.04388375098995389, train_acc: 0.9855333333333334, test_loss: 0.09133320614055265, test_acc: 0.9746
ep: 5, taked: 19.482, train_loss: 0.034685879340890714, train_acc: 0.9885333333333334, test_loss: 0.09763758220651653, test_acc: 0.972
ep: 6, taked: 18.830, train_loss: 0.03061423451798235, train_acc: 0.9898333333333333, test_loss: 0.11142587983704288, test_acc: 0.972
ep: 7, taked: 17.356, train_loss: 0.029813203354678217, train_acc: 0.9896666666666667, test_l

## Dropout

In [22]:
# Much wider MLP (2560 -> 1280 -> 640 hidden units), with Dropout(p=0.5)
# after each hidden ReLU activation.
model = torch.nn.Sequential(
    torch.nn.Flatten(start_dim=1, end_dim=-1),
    torch.nn.Linear(in_features=784, out_features=2560),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.5),
    torch.nn.Linear(in_features=2560, out_features=1280),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.5),
    torch.nn.Linear(in_features=1280, out_features=640),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.5),
    torch.nn.Linear(in_features=640, out_features=10),
)

In [23]:
# Bind a new Adam optimizer to the dropout model's parameters, then train.
trainer = torch.optim.Adam(params=model.parameters(), lr=0.01)
train_model()

ep: 0, taked: 60.655, train_loss: 0.7759189718581261, train_acc: 0.78635, test_loss: 0.30003036027774216, test_acc: 0.9209
ep: 1, taked: 63.286, train_loss: 0.6266954021250948, train_acc: 0.8402333333333334, test_loss: 0.3368370419368148, test_acc: 0.9245
ep: 2, taked: 61.792, train_loss: 0.6503023340346965, train_acc: 0.8453833333333334, test_loss: 0.32245812825858594, test_acc: 0.9261
ep: 3, taked: 64.860, train_loss: 0.6883126719200865, train_acc: 0.8360833333333333, test_loss: 0.3558458704035729, test_acc: 0.928
ep: 4, taked: 56.949, train_loss: 0.6989276050253117, train_acc: 0.8302333333333334, test_loss: 0.33052830332890154, test_acc: 0.9181
ep: 5, taked: 54.878, train_loss: 0.7217641126602254, train_acc: 0.82115, test_loss: 0.39036161210387943, test_acc: 0.9209
ep: 6, taked: 53.778, train_loss: 0.7429932450994532, train_acc: 0.81405, test_loss: 0.36661897506564856, test_acc: 0.9185
ep: 7, taked: 53.646, train_loss: 0.7445424404550106, train_acc: 0.81235, test_loss: 0.37000387199