In [6]:
from pathlib import Path
import requests

In [7]:
# The name `Path` is rebound below to the MNIST directory (later cells rely on
# that), which hides the `pathlib.Path` class. Reaching the class through the
# module keeps this cell safe to run more than once.
import pathlib

dataPath = pathlib.Path('data')
Path = dataPath/'mnist'

# Create data/mnist (and any missing parents); a no-op when it already exists.
Path.mkdir(parents=True, exist_ok=True)

In [8]:
# NOTE(review): the original host (http://deeplearning.net/data/mnist/) is
# believed to be offline; the PyTorch tutorials repository mirrors the same
# archive over HTTPS. Confirm the mirror is still reachable before relying on it.
url = "https://github.com/pytorch/tutorials/raw/main/_static/"
filename = "mnist.pkl.gz"

In [9]:
# Download the archive only when it is not already cached on disk.
if not (Path/filename).exists():
    # A timeout keeps the cell from hanging forever on an unresponsive host.
    response = requests.get(url+filename, timeout=60)
    # Fail loudly on an HTTP error instead of caching an error page as the dataset.
    response.raise_for_status()
    # write_bytes opens, writes and closes the file, so no handle is left open.
    (Path / filename).write_bytes(response.content)

This dataset is in numpy array format, and has been stored using pickle, a python-specific format for serializing data.

In [10]:
import pickle
import gzip

In [11]:
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# run this on an archive obtained from a trusted source.
# encoding="latin-1" controls how 8-bit strings stored in the pickle are decoded.
with gzip.open(Path/filename, mode='rb') as file:
    splits = pickle.load(file, encoding="latin-1")

# The pickle holds three (inputs, labels) pairs; the third one is not used here.
(xTrain, yTrain), (xValid, yValid), _ = splits

Each image is 28 x 28, and is being stored as a flattened row of length 784 (=28x28). Let’s take a look at one; we need to reshape it to 2d first.

In [12]:
from matplotlib import pyplot
import numpy as np

# Turn the first flattened row back into a 28 x 28 grid so it can be drawn.
firstImage = xTrain[0].reshape(28, 28)
pyplot.imshow(firstImage, cmap='gray')
print(xTrain.shape)

In [13]:
import torch

In [14]:
# Convert the four loaded arrays to torch tensors, keeping the same names.
xTrain, yTrain, xValid, yValid = (
    torch.tensor(array) for array in (xTrain, yTrain, xValid, yValid)
)

In [15]:
# n is the size of xTrain's first dimension (its number of rows); the second
# dimension is unpacked into the throwaway name `_`.
n, _ = xTrain.shape

## Neural net from scratch (no torch.nn)

We initialize the weights here with Xavier initialisation: standard-normal values scaled by 1/sqrt(784), where 784 is the number of input features per image.

In [16]:
import math

In [17]:
# Standard-normal weights scaled by 1/sqrt(784); gradient tracking is switched
# on only after the scaling so that the scaling itself is not recorded.
weights = (torch.randn(784, 10) / math.sqrt(784)).requires_grad_()
# The bias starts at zero and is tracked by autograd from creation.
bias = torch.zeros(10, requires_grad=True)

In [18]:
def logSoftmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)

def model(xb):
    """Apply the linear map (``weights``, ``bias``) to ``xb`` and return its log-softmax."""
    logits = torch.matmul(xb, weights) + bias
    return logSoftmax(logits)

In [19]:
bs = 64  # mini-batch size
xb = xTrain[:bs]  # the first bs rows of the training inputs
preds = model(xb)
# Show the first row of predictions together with the overall output shape.
preds[0], preds.shape

(tensor([-2.3360, -2.4965, -2.3913, -2.2426, -2.0036, -2.0727, -2.4725, -2.3349,
         -2.8022, -2.1149], grad_fn=<SelectBackward>), torch.Size([64, 10]))

In [20]:
def negativeLogLikelihood(input, target):
    """Return the negated mean of ``input[i, target[i]]`` over all rows ``i``.

    ``input`` is indexed with one row index per entry of ``target`` and the
    column given by that entry; the selected values are averaged and negated.
    """
    rowCount = target.shape[0]
    picked = input[range(rowCount), target]
    return picked.mean().neg()

In [21]:
# Bind the loss under a generic name so later cells can call lossFunc without
# naming the concrete implementation.
lossFunc = negativeLogLikelihood

In [22]:
yb = yTrain[:bs]  # labels for the first bs training rows
# Loss of the predictions computed earlier against those labels.
print(lossFunc(preds, yb))

tensor(2.2730, grad_fn=<NegBackward>)


In [23]:
def accuracy(out, yb):
    """Return the fraction of rows of ``out`` whose arg-max along dim 1 equals ``yb``."""
    predicted = out.argmax(dim=1)
    hits = predicted.eq(yb)
    # Cast the boolean matches to float so that their mean is a ratio.
    return hits.float().mean()

In [24]:
# Accuracy of the current predictions (preds) against the labels in yb.
print(accuracy(preds, yb))

tensor(0.1875)


We can now run a training loop. For each iteration, we will:

· select a mini-batch of data (of size bs)
· use the model to make predictions
· calculate the loss
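· call loss.backward() to compute the gradients of the loss with respect to the model parameters, in this case, weights and bias
· use those gradients to update weights and bias, then reset the gradients to zero for the next iteration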

In [25]:
from IPython.core.debugger import set_trace

In [26]:
lr = 0.5  # learning rate: multiplier applied to each gradient in the update step
epochs = 2  # number of full passes over the training set

In [27]:
for epoch in range(epochs):
    # Walk the training set in consecutive slices of at most bs rows.
    for start in range(0, n, bs):
        end = start + bs
        xb = xTrain[start:end]
        yb = yTrain[start:end]
        pred = model(xb)
        loss = lossFunc(pred, yb)

        # Populate .grad on weights and bias for this mini-batch.
        loss.backward()
        # The in-place updates must not be recorded by autograd.
        with torch.no_grad():
            for param in (weights, bias):
                param -= param.grad * lr
                # Gradients accumulate across backward() calls, so clear them.
                param.grad.zero_()

In [28]:
# Loss and accuracy on the mini-batch currently held in xb / yb.
lastOut = model(xb)
print(lossFunc(lastOut, yb), accuracy(lastOut, yb))

tensor(0.0821, grad_fn=<NegBackward>) tensor(1.)


## Using torch.nn.functional

In [29]:
import torch.nn.functional as F

In [30]:
# Rebind lossFunc to PyTorch's built-in cross-entropy, which applies
# log-softmax and negative log-likelihood in a single call.
lossFunc = F.cross_entropy

In [31]:
def model(xb):
    """Return ``xb @ weights + bias`` as-is; no log-softmax is applied here."""
    return torch.matmul(xb, weights) + bias

In [32]:
# Loss and accuracy on the mini-batch currently held in xb / yb.
batchOut = model(xb)
print(lossFunc(batchOut, yb), accuracy(batchOut, yb))

tensor(0.0821, grad_fn=<NllLossBackward>) tensor(1.)
