# Multi-Layer Perceptron

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import torch
import torchvision as tv
import numpy as np
import pandas as pd
import time

## Data

In [8]:
# Mini-batch size used by both the training and the test DataLoader.
BATCH_SIZE = 256

# Seed PyTorch's global RNG so that weight initialisation and dropout masks
# start from the same random state after a fresh Restart Kernel -> Run All.
SEED = 0
torch.manual_seed(SEED);  # trailing ';' keeps the returned Generator out of the cell output

In [5]:
# Download MNIST into the current directory and expose both splits as datasets
# of (image tensor, label) pairs; ToTensor converts each image to a tensor.
train_dataset = tv.datasets.MNIST(
    root='.', train=True, download=True, transform=tv.transforms.ToTensor()
)
test_dataset = tv.datasets.MNIST(
    root='.', train=False, download=True, transform=tv.transforms.ToTensor()
)

In [6]:
# Show the training split's summary (number of samples, root, transform).
train_dataset

Dataset MNIST
    Number of datapoints: 60000
    Root location: .
    Split: Train
    StandardTransform
Transform: ToTensor()

In [7]:
# Show the test split's summary (number of samples, root, transform).
test_dataset

Dataset MNIST
    Number of datapoints: 10000
    Root location: .
    Split: Test
    StandardTransform
Transform: ToTensor()

In [9]:
# Mini-batch iterators over the two MNIST splits.
# The training loader reshuffles the samples at every epoch so the optimiser
# does not see the batches in the same fixed order each time. The test loader
# keeps dataset order, since the aggregated evaluation metrics do not depend on it.
train = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [10]:
# Display the training loader object (its repr only shows the type and address).
train

<torch.utils.data.dataloader.DataLoader at 0x7f2a57de3850>

In [11]:
# Display the test loader object (its repr only shows the type and address).
test

<torch.utils.data.dataloader.DataLoader at 0x7f2a57de19c0>

In [13]:
# Shape of the first training image: 1 channel of 28x28 pixels, i.e. 784 values
# once flattened, which is the input width of the first Linear layer below.
train_dataset[0][0].shape

torch.Size([1, 28, 28])

## Model SGD

In [14]:
# Single-hidden-layer MLP: flattened image (784) -> 256 hidden units -> 10 class scores.
model = torch.nn.Sequential(
    torch.nn.Flatten(),  # collapse every dimension after the batch dimension
    torch.nn.Linear(in_features=784, out_features=256),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=256, out_features=10),  # raw class scores (no softmax layer here)
)

In [15]:
# Print the layer stack to confirm the architecture.
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=256, bias=True)
  (2): ReLU()
  (3): Linear(in_features=256, out_features=10, bias=True)
)

In [16]:
# Optimisation setup for the first experiment: cross-entropy objective minimised
# with plain SGD. These three names are read as globals by train_model().
num_epochs = 10
loss = torch.nn.CrossEntropyLoss()
trainer = torch.optim.SGD(params=model.parameters(), lr=0.01)

## Train Model

In [17]:
def train_model():
    """Train and evaluate the notebook-level ``model`` for ``num_epochs`` epochs.

    Uses the globals ``model``, ``trainer``, ``loss``, ``train``, ``test`` and
    ``num_epochs`` as they are bound at call time, so a later cell can rebind
    ``model``/``trainer`` and call this function again to run a new experiment.

    Each epoch makes one optimisation pass over ``train`` followed by one
    evaluation pass over ``test``, then prints a single line with the epoch
    index, the elapsed wall-clock seconds for both passes, and the loss
    (mean of the per-batch losses) and accuracy (correct predictions divided
    by samples seen) on each loader.

    Returns:
        None. The metrics are only printed.
    """
    for ep in range(num_epochs):
        train_iters, train_passed = 0, 0
        train_loss, train_acc = 0., 0.
        start = time.time()

        # Training pass.
        model.train()
        for X, y in train:
            trainer.zero_grad()  # clear the gradients left over from the previous batch
            y_pred = model(X)
            l = loss(y_pred, y)
            l.backward()
            trainer.step()
            train_loss += l.item()
            # Counts correct predictions; turned into a ratio when printed.
            train_acc += (y_pred.argmax(dim=1) == y).sum().item()
            train_iters += 1
            train_passed += len(X)

        # Evaluation pass: no parameter update happens here, so gradient
        # tracking is disabled to avoid building an autograd graph per batch.
        test_iters, test_passed = 0, 0
        test_loss, test_acc = 0., 0.
        model.eval()
        with torch.no_grad():
            for X, y in test:
                y_pred = model(X)
                l = loss(y_pred, y)
                test_loss += l.item()
                test_acc += (y_pred.argmax(dim=1) == y).sum().item()
                test_iters += 1
                test_passed += len(X)

        print("ep: {}, taked: {:.3f}, train_loss: {}, train_acc: {}, test_loss: {}, test_acc: {}".format(
            ep, time.time() - start, train_loss / train_iters, train_acc / train_passed,
            test_loss / test_iters, test_acc / test_passed
        ))

In [18]:
# Baseline run: the 784-256-10 MLP trained with SGD (lr=.01) for num_epochs epochs.
train_model()

ep: 0, taked: 14.711, train_loss: 2.0447713253345894, train_acc: 0.5564666666666667, test_loss: 1.6875344544649125, test_acc: 0.7272
ep: 1, taked: 14.259, train_loss: 1.3392624091594778, train_acc: 0.7661166666666667, test_loss: 1.0099389925599098, test_acc: 0.815
ep: 2, taked: 15.268, train_loss: 0.8684946063985216, train_acc: 0.8232666666666667, test_loss: 0.7149699538946152, test_acc: 0.8473
ep: 3, taked: 14.432, train_loss: 0.6667184118260728, train_acc: 0.8484333333333334, test_loss: 0.5808485291898251, test_acc: 0.8671
ep: 4, taked: 14.437, train_loss: 0.5652414571097557, train_acc: 0.8633, test_loss: 0.5059726864099503, test_acc: 0.8768
ep: 5, taked: 14.125, train_loss: 0.5046328397507364, train_acc: 0.87315, test_loss: 0.45837222747504713, test_acc: 0.8837
ep: 6, taked: 14.071, train_loss: 0.4643702208361727, train_acc: 0.8806833333333334, test_loss: 0.42556196376681327, test_acc: 0.8889
ep: 7, taked: 14.642, train_loss: 0.43568150629388525, train_acc: 0.8850666666666667, test_

## Model Adam

In [19]:
# Rebuild the same 784-256-10 network so the Adam run starts from freshly
# initialised weights rather than the ones already trained with SGD.
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(in_features=784, out_features=256),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=256, out_features=10),
)

In [20]:
# Same architecture as the SGD baseline, now optimised with Adam at the same
# learning rate. Rebinding the global `trainer` is what train_model() picks up.
trainer = torch.optim.Adam(params=model.parameters(), lr=0.01)
train_model()

ep: 0, taked: 9.516, train_loss: 0.25391602437705435, train_acc: 0.9229166666666667, test_loss: 0.15667263865470887, test_acc: 0.9482
ep: 1, taked: 9.098, train_loss: 0.10370491938269202, train_acc: 0.9677833333333333, test_loss: 0.12810333910747432, test_acc: 0.9632
ep: 2, taked: 9.704, train_loss: 0.0734646969316329, train_acc: 0.97665, test_loss: 0.10285195446922443, test_acc: 0.9688
ep: 3, taked: 9.026, train_loss: 0.05870656998867684, train_acc: 0.9810833333333333, test_loss: 0.1163265920818958, test_acc: 0.9685
ep: 4, taked: 9.872, train_loss: 0.05028912071138621, train_acc: 0.9841, test_loss: 0.11802914545114618, test_acc: 0.9715
ep: 5, taked: 9.977, train_loss: 0.04836315082465398, train_acc: 0.9844, test_loss: 0.16499243976468278, test_acc: 0.9653
ep: 6, taked: 10.043, train_loss: 0.04699395415224531, train_acc: 0.9848333333333333, test_loss: 0.12980500765770558, test_acc: 0.9693
ep: 7, taked: 10.206, train_loss: 0.04187850180636853, train_acc: 0.9866833333333334, test_loss: 0

## Adding more layers

In [21]:
# Deeper MLP: three hidden Linear+ReLU stages (784 -> 512 -> 256 -> 128)
# followed by a 10-way output layer. Layers are created in the listed order.
layer_widths = (784, 512, 256, 128)
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    *[
        layer
        for fan_in, fan_out in zip(layer_widths, layer_widths[1:])
        for layer in (torch.nn.Linear(fan_in, fan_out), torch.nn.ReLU())
    ],
    torch.nn.Linear(layer_widths[-1], 10),
)

In [22]:
# Train the deeper network with a fresh Adam optimiser bound to its parameters.
trainer = torch.optim.Adam(params=model.parameters(), lr=0.01)
train_model()

ep: 0, taked: 13.081, train_loss: 0.3180074578111476, train_acc: 0.9014666666666666, test_loss: 0.223648487101309, test_acc: 0.9338
ep: 1, taked: 13.382, train_loss: 0.12756827155801845, train_acc: 0.9618, test_loss: 0.15645248654764146, test_acc: 0.9563
ep: 2, taked: 11.633, train_loss: 0.10261478880420327, train_acc: 0.9704333333333334, test_loss: 0.13927648604731074, test_acc: 0.9628
ep: 3, taked: 11.983, train_loss: 0.08527089793710632, train_acc: 0.9751666666666666, test_loss: 0.16636404730124923, test_acc: 0.9622
ep: 4, taked: 12.557, train_loss: 0.0720702709749024, train_acc: 0.9792333333333333, test_loss: 0.14123587989452063, test_acc: 0.9684
ep: 5, taked: 12.738, train_loss: 0.06669331000761149, train_acc: 0.9816, test_loss: 0.11939454521984771, test_acc: 0.9703
ep: 6, taked: 13.085, train_loss: 0.06311873052606082, train_acc: 0.9821833333333333, test_loss: 0.1559115111949268, test_acc: 0.9689
ep: 7, taked: 13.213, train_loss: 0.07111477443373743, train_acc: 0.9811333333333333

## Batch Normalisation

In [23]:
# Same three-hidden-layer MLP, with a BatchNorm1d placed after each hidden
# ReLU (i.e. the normalisation acts on the activated features).
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    # hidden stage 1
    torch.nn.Linear(in_features=784, out_features=512),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(num_features=512),
    # hidden stage 2
    torch.nn.Linear(in_features=512, out_features=256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(num_features=256),
    # hidden stage 3
    torch.nn.Linear(in_features=256, out_features=128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(num_features=128),
    # output layer
    torch.nn.Linear(in_features=128, out_features=10),
)

In [24]:
# Train the batch-normalised network with a fresh Adam optimiser.
trainer = torch.optim.Adam(params=model.parameters(), lr=0.01)
train_model()

ep: 0, taked: 11.705, train_loss: 0.20078014101674582, train_acc: 0.9384166666666667, test_loss: 0.15581530092749746, test_acc: 0.9511
ep: 1, taked: 13.880, train_loss: 0.10084698551948718, train_acc: 0.9689333333333333, test_loss: 0.1177217838994693, test_acc: 0.9637
ep: 2, taked: 12.637, train_loss: 0.06977319945085873, train_acc: 0.9782666666666666, test_loss: 0.11669842848787085, test_acc: 0.9652
ep: 3, taked: 12.383, train_loss: 0.0513533908318966, train_acc: 0.98375, test_loss: 0.0968886579736136, test_acc: 0.9713
ep: 4, taked: 12.645, train_loss: 0.04382366273908856, train_acc: 0.9857, test_loss: 0.10862712002708577, test_acc: 0.9706
ep: 5, taked: 12.978, train_loss: 0.0368586354527345, train_acc: 0.9878833333333333, test_loss: 0.15484228722343688, test_acc: 0.9615
ep: 6, taked: 13.075, train_loss: 0.031262484824958636, train_acc: 0.9900333333333333, test_loss: 0.10331450342437165, test_acc: 0.9714
ep: 7, taked: 13.187, train_loss: 0.028511889515187037, train_acc: 0.990066666666

## Dropout

In [29]:
# Much wider MLP (2560 -> 1280 -> 640 hidden units) regularised with 50%
# dropout after each hidden ReLU.
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    # hidden stage 1
    torch.nn.Linear(in_features=784, out_features=2560),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.5),
    # hidden stage 2
    torch.nn.Linear(in_features=2560, out_features=1280),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.5),
    # hidden stage 3
    torch.nn.Linear(in_features=1280, out_features=640),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.5),
    # output layer
    torch.nn.Linear(in_features=640, out_features=10),
)

In [30]:
# Train the dropout network with a fresh Adam optimiser.
# NOTE(review): in the recorded run the training loss climbs from ~0.69 (ep 1)
# to ~0.82 (ep 7); lr=0.01 may be too high for this configuration - worth
# confirming with a smaller learning rate.
trainer = torch.optim.Adam(params=model.parameters(), lr=0.01)
train_model()

ep: 0, taked: 53.263, train_loss: 0.9490649714114818, train_acc: 0.7283666666666667, test_loss: 0.3883506150916219, test_acc: 0.9078
ep: 1, taked: 52.852, train_loss: 0.6872186461661725, train_acc: 0.8168166666666666, test_loss: 0.3824665263295174, test_acc: 0.9185
ep: 2, taked: 52.955, train_loss: 0.70492971095633, train_acc: 0.8150666666666667, test_loss: 0.36543120313435795, test_acc: 0.9201
ep: 3, taked: 54.947, train_loss: 0.7051444132277306, train_acc: 0.81755, test_loss: 0.37834984697401525, test_acc: 0.9165
ep: 4, taked: 54.543, train_loss: 0.7429995694059006, train_acc: 0.8094, test_loss: 0.3896180786192417, test_acc: 0.9192
ep: 5, taked: 50.177, train_loss: 0.7311791691374272, train_acc: 0.8089, test_loss: 0.3854216694831848, test_acc: 0.9158
ep: 6, taked: 47.639, train_loss: 0.7727091439226841, train_acc: 0.7993833333333333, test_loss: 0.40638332851231096, test_acc: 0.9132
ep: 7, taked: 44.999, train_loss: 0.8170584922141217, train_acc: 0.7877166666666666, test_loss: 0.44148