In [1]:
from __future__ import print_function
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
# ---- Hyper-parameters -------------------------------------------------------
batch_size = 64          # mini-batch size for the training loader
test_batch_size = 128    # mini-batch size for the test loader
epochs = 30              # number of passes over the training subset
lr = 0.01                # learning rate handed to the optimizer
optimizer_type = 'SGD'   # alternatives: Adam, RMSprop
momentum = 0.0           # only forwarded to SGD
weight_decay = 1e-4      # weight decay handed to the optimizer
nsamples = 1000          # number of training samples to use

# ---- Runtime switches -------------------------------------------------------
cuda = False             # request the GPU; honoured only if one is available
seed = 42                # single seed shared by numpy and torch

use_cuda = cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# ---- Reproducibility --------------------------------------------------------
# Seed every random number generator this notebook touches.
torch.manual_seed(seed)
np.random.seed(seed)
if use_cuda:
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Handle GPU stochasticity: cuDNN is enabled only when the GPU is used, and
# its auto-tuner is switched off in favour of deterministic kernels.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = use_cuda

In [3]:
class Net(nn.Module):
    """Fully connected classifier with two hidden layers.

    Layer stack:
        Linear(image_size, 100) -> BatchNorm1d(100) -> ReLU
        Linear(100, 100)        -> Tanh             -> Dropout(p=0.5)
        Linear(100, 10)

    Args:
        image_size: number of input features per example (default 784).

    ``forward`` returns the output of the last linear layer as-is; no
    softmax is applied inside the model.
    """

    def __init__(self, image_size=784):
        super(Net, self).__init__()
        hidden_units, num_outputs = 100, 10

        # Layer 1: affine map followed by batch normalisation and ReLU.
        self.fc1 = nn.Linear(image_size, hidden_units, bias=True)
        self.BN = nn.BatchNorm1d(hidden_units)
        self.ReLU = nn.ReLU()

        # Layer 2: affine map followed by tanh and dropout.
        self.fc2 = nn.Linear(hidden_units, hidden_units, bias=True)
        self.Tanh = nn.Tanh()
        self.Dropout = nn.Dropout(p=0.5)

        # Layer 3: affine map producing one score per output unit.
        self.fc3 = nn.Linear(hidden_units, num_outputs, bias=True)

    def forward(self, x):
        """Map a 2-D batch of flattened inputs to a batch of 10 logits."""
        hidden1 = self.ReLU(self.BN(self.fc1(x)))
        hidden2 = self.Dropout(self.Tanh(self.fc2(hidden1)))
        return self.fc3(hidden2)

In [4]:
# Input pipelines. Both splits are converted to tensors and normalised with
# mean 0.1307 and std 0.3081; only the training split is randomly jittered.
train_transform = transforms.Compose([
    # Randomly perturb brightness and contrast of each image. More transforms:
    # https://pytorch.org/docs/stable/torchvision/transforms.html
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    # PIL Image / numpy.ndarray -> tensor
    transforms.ToTensor(),
    # subtract 0.1307, then divide by 0.3081
    transforms.Normalize((0.1307,), (0.3081,)),
])

test_transform = transforms.Compose([
    # PIL Image / numpy.ndarray -> tensor
    transforms.ToTensor(),
    # subtract 0.1307, then divide by 0.3081
    transforms.Normalize((0.1307,), (0.3081,)),
])

# torchvision ships ready-made dataset classes; only the training split is
# asked to download the files into ./data.
train = datasets.MNIST(root='./data', train=True, transform=train_transform, download=True)
test = datasets.MNIST(root='./data', train=False, transform=test_transform)

# Keep a random subset of `nsamples` training examples and drop the remainder.
split_sizes = [nsamples, len(train) - nsamples]
part_train = torch.utils.data.random_split(train, split_sizes)[0]

# A DataLoader wraps a dataset and yields mini-batches from it. num_workers is
# 1 here; raise it (e.g. to 4) on machines with more compute available.
train_loader = torch.utils.data.DataLoader(
    dataset=part_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=test_batch_size,
    num_workers=1,
)

# Create an instance of our model and move its parameters to the chosen device.
model = Net().to(device)
# Loss criterion: cross-entropy computed from the model's raw logits.
criterion = nn.CrossEntropyLoss()

# Build the optimizer named by `optimizer_type`.
if optimizer_type == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
elif optimizer_type == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay)
elif optimizer_type in ('RMSprop', 'RMSProp'):
    # The torch class is spelled `RMSprop`; both spellings of the option are
    # accepted so that the historical 'RMSProp' value keeps working.
    optimizer = optim.RMSprop(model.parameters(), lr=lr, alpha=0.99, eps=1e-08, weight_decay=weight_decay)
else:
    # Fail loudly here instead of leaving `optimizer` undefined for later cells.
    raise NotImplementedError("optimizer not implemented")

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 12747162.33it/s]


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<?, ?it/s]

Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 11802950.10it/s]


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<?, ?it/s]

Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw






In [5]:
def test():
    """Evaluate the global ``model`` on ``test_loader`` and print a summary.

    Relies on the module-level ``model``, ``test_loader``, ``criterion`` and
    ``device``. Prints the per-example average loss and the accuracy over the
    whole test set; returns nothing. The model is switched to eval mode and is
    left in that mode afterwards.
    """
    # Eval mode makes batch-norm and dropout layers behave as at inference time.
    model.eval()

    loss_sum = 0
    hits = 0
    n_examples = len(test_loader.dataset)

    # Autograd is not needed for evaluation; disabling it reduces memory usage
    # and speeds up the computation.
    with torch.no_grad():
        for data, target in test_loader:
            # flatten each image to a 784-vector and move the batch to the device
            data = data.to(device).view(data.shape[0], 28*28)
            target = target.to(device)

            output = model(data)
            # criterion yields the batch mean, so weight it by the batch size
            loss_sum += criterion(output, target).item() * target.shape[0]

            # predicted class = index of the largest logit
            pred = output.argmax(dim=1)
            hits += pred.eq(target.view_as(pred)).sum().item()

    print('\nTest set loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        loss_sum / n_examples, hits, n_examples,
        100. * hits / n_examples))

In [6]:
# Put the model in training mode before optimising.
model.train()
for epoch in range(1, epochs + 1):
    # running sum of the per-batch mean losses of this epoch
    cumm_loss = 0
    for data, target in train_loader:
        # Flatten every image to a 784-vector; move inputs and labels to the
        # device the model lives on (the GPU when one is used).
        data = data.to(device).view(data.shape[0], 28 * 28)
        target = target.to(device)

        # Forward pass and loss for this mini-batch.
        logits = model(data)
        loss = criterion(logits, target)
        cumm_loss += loss.item()

        # Gradients accumulate across .backward() calls by default, so they are
        # reset before back-propagating; backward() then fills in the gradient
        # of the loss w.r.t. every tensor with requires_grad=True, and step()
        # applies the parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Report the mean of the per-batch losses for the epoch.
    print('Train Epoch: {}\tLoss: {:.6f}'.format(epoch, cumm_loss / len(train_loader)))

# Evaluate once on the test set after the final epoch.
test()

Train Epoch: 1	Loss: 2.255307
Train Epoch: 2	Loss: 2.045234
Train Epoch: 3	Loss: 1.880360
Train Epoch: 4	Loss: 1.743898
Train Epoch: 5	Loss: 1.610281
Train Epoch: 6	Loss: 1.501461
Train Epoch: 7	Loss: 1.400937
Train Epoch: 8	Loss: 1.306891
Train Epoch: 9	Loss: 1.233127
Train Epoch: 10	Loss: 1.158161
Train Epoch: 11	Loss: 1.078592
Train Epoch: 12	Loss: 1.015084
Train Epoch: 13	Loss: 0.952537
Train Epoch: 14	Loss: 0.895892
Train Epoch: 15	Loss: 0.862030
Train Epoch: 16	Loss: 0.799765
Train Epoch: 17	Loss: 0.756405
Train Epoch: 18	Loss: 0.721687
Train Epoch: 19	Loss: 0.686366
Train Epoch: 20	Loss: 0.637711
Train Epoch: 21	Loss: 0.630083
Train Epoch: 22	Loss: 0.606033
Train Epoch: 23	Loss: 0.560369
Train Epoch: 24	Loss: 0.536563
Train Epoch: 25	Loss: 0.530064
Train Epoch: 26	Loss: 0.491440
Train Epoch: 27	Loss: 0.479517
Train Epoch: 28	Loss: 0.458539
Train Epoch: 29	Loss: 0.446485
Train Epoch: 30	Loss: 0.438679

Test set loss: 0.4877, Accuracy: 8866/10000 (89%)

