## **Imports**

In [10]:
import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import random

# ignore warning
import warnings
warnings.filterwarnings('ignore')

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
random.seed(777)
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [3]:
# parameters
learning_rate = 0.001
training_epochs = 15
batch_size = 100

## **MNIST Data**

In [4]:
# MNIST dataset
mnist_train = dsets.MNIST(root='MNIST_data/',
                          train=True,
                          transform=transforms.ToTensor(),
                          download=True)

mnist_test = dsets.MNIST(root='MNIST_data/',
                         train=False,
                         transform=transforms.ToTensor(),
                         download=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to MNIST_data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting MNIST_data/MNIST/raw/train-images-idx3-ubyte.gz to MNIST_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to MNIST_data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting MNIST_data/MNIST/raw/train-labels-idx1-ubyte.gz to MNIST_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to MNIST_data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting MNIST_data/MNIST/raw/t10k-images-idx3-ubyte.gz to MNIST_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to MNIST_data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting MNIST_data/MNIST/raw/t10k-labels-idx1-ubyte.gz to MNIST_data/MNIST/raw



In [5]:
# dataset loader
data_loader = torch.utils.data.DataLoader(dataset=mnist_train,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          drop_last=True)

## **Define Train & Test function**

In [38]:
def train(model, optimizer, criterion):
    total_batch = len(data_loader)
    for epoch in range(training_epochs):
        avg_cost = 0

        for X, Y in data_loader:
            # reshape input image into [batch_size by 784]
            # label is not one-hot encoded
            X = X.view(-1, 28 * 28).to(device)
            Y = Y.to(device)

            optimizer.zero_grad()
            hypothesis = model(X)
            cost = criterion(hypothesis, Y)
            cost.backward()
            optimizer.step()

            avg_cost += cost / total_batch

        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

    print('Learning finished')

def test(model, test_data):
    # Test the model using test sets
    with torch.no_grad():
        X_test = mnist_test.test_data.view(-1, 28 * 28).float().to(device)
        Y_test = mnist_test.test_labels.to(device)

        prediction = model(X_test)
        correct_prediction = torch.argmax(prediction, 1) == Y_test
        accuracy = correct_prediction.float().mean()
        print('Accuracy:', accuracy.item())

        # Get one and predict
        r = random.randint(0, len(mnist_test) - 1)
        X_single_data = mnist_test.test_data[r:r + 1].view(-1, 28 * 28).float().to(device)
        Y_single_data = mnist_test.test_labels[r:r + 1].to(device)

        print('Label: ', Y_single_data.item())
        single_prediction = linear(X_single_data)
        print('Prediction: ', torch.argmax(single_prediction, 1).item())

## **Model 1 - One Linear Layer (with Adam optimizer)**

In [24]:
# MNIST data image of shape 28 * 28 = 784
linear = torch.nn.Linear(784, 10, bias=True).to(device)

# Initialization
torch.nn.init.normal_(linear.weight)

Parameter containing:
tensor([[-0.6218,  1.0672,  1.9105,  ..., -0.6703,  1.2395, -0.0997],
        [-1.6817, -0.4872,  0.0357,  ...,  0.2073, -0.2950,  0.0056],
        [-0.7940,  0.4732, -0.2595,  ...,  1.6281,  0.5415, -1.8691],
        ...,
        [ 1.7984,  0.0532,  0.1820,  ...,  0.0388, -1.3177, -0.2541],
        [-0.1714, -0.4625, -1.1287,  ..., -0.5395, -2.3290,  0.0188],
        [-1.7208, -1.5439,  0.1333,  ...,  1.4105, -0.8442, -1.7256]],
       requires_grad=True)

In [25]:
# define cost/loss & optimizer
criterion = torch.nn.CrossEntropyLoss().to(device)    # Softmax is internally computed.
optimizer = torch.optim.Adam(linear.parameters(), lr=learning_rate)

In [26]:
train(linear, optimizer, criterion)

Epoch: 0001 cost = 5.817247868
Epoch: 0002 cost = 1.632607937
Epoch: 0003 cost = 1.093905926
Epoch: 0004 cost = 0.868305087
Epoch: 0005 cost = 0.739665151
Epoch: 0006 cost = 0.656013787
Epoch: 0007 cost = 0.595838964
Epoch: 0008 cost = 0.551452458
Epoch: 0009 cost = 0.516630530
Epoch: 0010 cost = 0.488287657
Epoch: 0011 cost = 0.465269953
Epoch: 0012 cost = 0.444853485
Epoch: 0013 cost = 0.428867012
Epoch: 0014 cost = 0.413870811
Epoch: 0015 cost = 0.400662214
Learning finished


In [27]:
test(linear, mnist_test)

Accuracy: 0.8837000131607056
Label:  9
Prediction:  9


## **Model 2 - Multi Layer (3 linear, ReLU activation function)**

In [41]:
# nn layers
linear1 = torch.nn.Linear(784, 256, bias=True)
linear2 = torch.nn.Linear(256, 256, bias=True)
linear3 = torch.nn.Linear(256, 10, bias=True)
relu = torch.nn.ReLU()

In [42]:
# Initialization
torch.nn.init.normal_(linear1.weight)
torch.nn.init.normal_(linear2.weight)
torch.nn.init.normal_(linear3.weight)

Parameter containing:
tensor([[ 0.5062, -0.3522,  0.7147,  ...,  0.0053,  0.5866,  1.4561],
        [ 0.4197,  0.2506,  0.6990,  ..., -0.7644, -2.0132,  0.0166],
        [ 0.6500,  0.1878, -0.9271,  ..., -0.3992,  1.5599,  0.5787],
        ...,
        [-0.6328, -0.6657, -0.5207,  ..., -1.4250,  0.8357,  1.0101],
        [ 1.5887,  1.8519, -0.8953,  ..., -0.3045,  1.2169, -1.7931],
        [-0.3066,  0.0921, -0.9875,  ...,  0.3064,  1.5533, -0.0460]],
       requires_grad=True)

In [43]:
# model
model = torch.nn.Sequential(linear1, relu, linear2, relu, linear3).to(device)

# define cost/loss & optimizer
criterion = torch.nn.CrossEntropyLoss().to(device)    # Softmax is internally computed.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [44]:
train(model, optimizer, criterion)

Epoch: 0001 cost = 137.270019531
Epoch: 0002 cost = 37.353683472
Epoch: 0003 cost = 23.393787384
Epoch: 0004 cost = 16.017894745
Epoch: 0005 cost = 11.591639519
Epoch: 0006 cost = 8.599291801
Epoch: 0007 cost = 6.442617416
Epoch: 0008 cost = 4.797049999
Epoch: 0009 cost = 3.581562281
Epoch: 0010 cost = 2.781718969
Epoch: 0011 cost = 1.992229342
Epoch: 0012 cost = 1.584466219
Epoch: 0013 cost = 1.195554495
Epoch: 0014 cost = 0.979234576
Epoch: 0015 cost = 0.867656291
Learning finished


In [45]:
test(model, mnist_test)

Accuracy: 0.9466000199317932
Label:  5
Prediction:  5


## **Model 3 : Weight Initialization - Xavier (VS. normal)**

In [33]:
# nn layers
linear1 = torch.nn.Linear(784, 256, bias=True)
linear2 = torch.nn.Linear(256, 256, bias=True)
linear3 = torch.nn.Linear(256, 10, bias=True)
relu = torch.nn.ReLU()

In [34]:
# xavier initialization
# torch.nn.init.normal_(linear1.weight)
torch.nn.init.xavier_uniform_(linear1.weight)
torch.nn.init.xavier_uniform_(linear2.weight)
torch.nn.init.xavier_uniform_(linear3.weight)

Parameter containing:
tensor([[-0.1238,  0.0384, -0.1431,  ..., -0.0493,  0.0786, -0.0188],
        [ 0.0042, -0.0895,  0.0886,  ..., -0.0219, -0.0061,  0.1270],
        [ 0.1007, -0.0355,  0.0205,  ..., -0.0528, -0.0563,  0.0999],
        ...,
        [ 0.0454, -0.1021, -0.1085,  ...,  0.0185,  0.0683,  0.1073],
        [ 0.0954, -0.1459, -0.1154,  ..., -0.0130, -0.0462,  0.1128],
        [ 0.1300,  0.0556,  0.1432,  ..., -0.0296,  0.0325,  0.0676]],
       requires_grad=True)

In [35]:
# model
model = torch.nn.Sequential(linear1, relu, linear2, relu, linear3).to(device)

# define cost/loss & optimizer
criterion = torch.nn.CrossEntropyLoss().to(device)    # Softmax is internally computed.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [36]:
train(model, optimizer, criterion)

Epoch: 0001 cost = 0.243659705
Epoch: 0002 cost = 0.091648906
Epoch: 0003 cost = 0.058327612
Epoch: 0004 cost = 0.042778496
Epoch: 0005 cost = 0.031895939
Epoch: 0006 cost = 0.023814850
Epoch: 0007 cost = 0.022118004
Epoch: 0008 cost = 0.018783642
Epoch: 0009 cost = 0.014914327
Epoch: 0010 cost = 0.012655593
Epoch: 0011 cost = 0.015068561
Epoch: 0012 cost = 0.010977863
Epoch: 0013 cost = 0.010484022
Epoch: 0014 cost = 0.010740603
Epoch: 0015 cost = 0.010132166
Learning finished


In [40]:
test(model, mnist_test)

Accuracy: 0.9794999957084656
Label:  1
Prediction:  1


## **Model 4 - Deeper Model (5 linear)**

In [46]:
# nn layers
linear1 = torch.nn.Linear(784, 512, bias=True)
linear2 = torch.nn.Linear(512, 512, bias=True)
linear3 = torch.nn.Linear(512, 512, bias=True)
linear4 = torch.nn.Linear(512, 512, bias=True)
linear5 = torch.nn.Linear(512, 10, bias=True)
relu = torch.nn.ReLU()

In [47]:
# xavier initialization
torch.nn.init.xavier_uniform_(linear1.weight)
torch.nn.init.xavier_uniform_(linear2.weight)
torch.nn.init.xavier_uniform_(linear3.weight)
torch.nn.init.xavier_uniform_(linear4.weight)
torch.nn.init.xavier_uniform_(linear5.weight)

Parameter containing:
tensor([[-0.1071,  0.0856,  0.0767,  ..., -0.0279,  0.0979,  0.0002],
        [-0.0300, -0.0025,  0.0971,  ...,  0.0918,  0.1060,  0.0443],
        [-0.0868,  0.0477, -0.0066,  ..., -0.0556, -0.0345,  0.0071],
        ...,
        [-0.0901, -0.0643, -0.0142,  ..., -0.0442,  0.0378,  0.0358],
        [ 0.0952, -0.0082, -0.1013,  ...,  0.0596, -0.0298, -0.0299],
        [ 0.0609, -0.1015, -0.0833,  ...,  0.0510,  0.0917,  0.0871]],
       requires_grad=True)

In [48]:
# model
model = torch.nn.Sequential(linear1, relu, linear2, relu, linear3).to(device)

# define cost/loss & optimizer
criterion = torch.nn.CrossEntropyLoss().to(device)    # Softmax is internally computed.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [49]:
train(model, optimizer, criterion)

Epoch: 0001 cost = 0.279326618
Epoch: 0002 cost = 0.088882513
Epoch: 0003 cost = 0.057267334
Epoch: 0004 cost = 0.040889185
Epoch: 0005 cost = 0.031377584
Epoch: 0006 cost = 0.025283277
Epoch: 0007 cost = 0.021344168
Epoch: 0008 cost = 0.018074280
Epoch: 0009 cost = 0.015873857
Epoch: 0010 cost = 0.013780947
Epoch: 0011 cost = 0.012722512
Epoch: 0012 cost = 0.012164704
Epoch: 0013 cost = 0.011764123
Epoch: 0014 cost = 0.010936635
Epoch: 0015 cost = 0.006708647
Learning finished


In [50]:
test(model, mnist_test)

Accuracy: 0.9800999760627747
Label:  8
Prediction:  8
