In [14]:
from pathlib import Path
import requests

In [15]:
dataPath = Path('data')
Path = dataPath/'mnist'

Path.mkdir(parents=True, exist_ok=True)

In [16]:
url = "http://deeplearning.net/data/mnist/"
filename = "mnist.pkl.gz"

In [17]:
if not (Path/filename).exists():
    content = requests.get(url+filename).content
    (Path / filename).open("wb").write(content)

This dataset is in numpy array format, and has been stored using pickle, a python-specific format for serializing data.

In [18]:
import pickle
import gzip

In [19]:
with gzip.open((Path/filename).as_posix(), 'rb') as file:
    ((xTrain, yTrain), (xValid, yValid), _) = pickle.load(file, encoding="latin-1")

Each image is 28 x 28, and is being stored as a flattened row of length 784 (=28x28). Let’s take a look at one; we need to reshape it to 2d first.

In [None]:
from matplotlib import pyplot
import numpy as np

pyplot.imshow(xTrain[0].reshape((28, 28)), cmap='gray')
print(xTrain.shape)

In [7]:
import torch

In [20]:
xTrain, yTrain, xValid, yValid = map(torch.tensor, (xTrain, yTrain, xValid, yValid))

In [None]:
n, _ = xTrain.shape

## Neural net from scratch (no torch.nn)

We are initializing the weights here with Xavier initialisation (by multiplying with 1/sqrt(n))

In [9]:
import math

In [None]:
weights = torch.randn(784, 10) / math.sqrt(784)
weights.requires_grad_()
bias = torch.zeros(10, requires_grad=True)

In [None]:
def logSoftmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)

def model(xb):
    return logSoftmax(xb @ weights + bias)

In [21]:
bs = 64
xb = xTrain[0:bs]
# preds = model(xb)
# preds[0], preds.shape

In [None]:
def negativeLogLikelihood(input, target):
    return -input[range(target.shape[0]), target].mean()

In [None]:
lossFunc = negativeLogLikelihood

In [22]:
yb = yTrain[0:bs]
# print(lossFunc(preds, yb))

In [None]:
def accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()

In [None]:
print(accuracy(preds, yb))

We can now run a training loop. For each iteration, we will:

· select a mini-batch of data (of size bs)
· use the model to make predictions
· calculate the loss
· loss.backward() updates the gradients of the model, in this case, weights and bias.

In [None]:
from IPython.core.debugger import set_trace

In [None]:
lr = 0.5
epochs = 2

In [None]:
for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
#         set_trace()
        start = i * bs
        end = start + bs
        xb = xTrain[start:end]
        yb = yTrain[start:end]
        pred = model(xb)
        loss = lossFunc(pred, yb)

        loss.backward()
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()

In [None]:
print(lossFunc(model(xb), yb), accuracy(model(xb), yb))

## Using torch.nn.functional

In [2]:
import torch.nn.functional as F

In [3]:
lossFunc = F.cross_entropy

In [None]:
def model(xb):
    return xb @ weights + bias

In [None]:
print(lossFunc(model(xb), yb), accuracy(model(xb), yb))

## Refactor using nn.Module

In [4]:
from torch import nn

In [5]:
class MnistLogistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(784, 10) / math.sqrt(784))
        self.bias = nn.Parameter(torch.zeros(10))

    def forward(self, xb):
        print(self.weights)
        print(self.bias)
        return xb @ self.weights + self.bias

In [10]:
model = MnistLogistic()

In [23]:
print(lossFunc(model(xb), yb))

Parameter containing:
tensor([[ 0.0311,  0.0381,  0.0401,  ..., -0.0010,  0.0288,  0.0207],
        [-0.0186,  0.0598, -0.0148,  ..., -0.0340,  0.0136,  0.0754],
        [-0.0870, -0.0383, -0.0123,  ...,  0.1089,  0.0056, -0.1010],
        ...,
        [-0.0177,  0.0675,  0.0111,  ..., -0.0042,  0.0289, -0.0057],
        [-0.0138, -0.0395,  0.0328,  ..., -0.0005,  0.0409,  0.0103],
        [-0.0352,  0.0182,  0.0482,  ..., -0.0102,  0.0357,  0.0436]],
       requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
tensor(2.4524, grad_fn=<NllLossBackward>)


In [24]:
with torch.no_grad():
    for param in model.parameters():
        print(param)
        print(param.grad)
        param -= param.grad * lr
    model.zero_grad()

Parameter containing:
tensor([[ 0.0311,  0.0381,  0.0401,  ..., -0.0010,  0.0288,  0.0207],
        [-0.0186,  0.0598, -0.0148,  ..., -0.0340,  0.0136,  0.0754],
        [-0.0870, -0.0383, -0.0123,  ...,  0.1089,  0.0056, -0.1010],
        ...,
        [-0.0177,  0.0675,  0.0111,  ..., -0.0042,  0.0289, -0.0057],
        [-0.0138, -0.0395,  0.0328,  ..., -0.0005,  0.0409,  0.0103],
        [-0.0352,  0.0182,  0.0482,  ..., -0.0102,  0.0357,  0.0436]],
       requires_grad=True)
None


NameError: name 'lr' is not defined