# Backprop from scratch
In this notebook, we will be implementing backpropagation from scratch.

Backprop is one of the well-known ingredients of why neural networks work as well as they do. At its core, backprop is simply the chain rule of derivatives, applied one layer at a time.

Let's dive right in.

In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close
# Seed the RNG so the random weights created further down are reproducible.
torch.manual_seed(42)

# Display settings: grayscale colormap and compact tensor/array printing.
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

# Load the train/validation splits from the gzipped pickle; the third entry
# of the pickled tuple is discarded.
# NOTE: pickle.load can execute arbitrary code -- only use a trusted file.
path_data = Path('data')
path_gz = path_data / 'mnist.pkl.gz'
with gzip.open(path_gz, 'rb') as f:
    (x_train, y_train), (x_valid, y_valid), _ = pickle.load(f, encoding='latin-1')
x_train, y_train, x_valid, y_valid = (
    tensor(arr) for arr in (x_train, y_train, x_valid, y_valid)
)

## Foundations version
### Basic architecture
Let's start by setting up our data.

In [2]:
# n = number of training rows, m = number of input features per row
n, m = x_train.shape
c = y_train.max() + 1 # number of classes (assumes labels start at 0); a 0-d tensor, not a Python int
n, m, c

(50000, 784, tensor(10))

In [3]:
# number of units in the hidden layer
nh = 50

In [4]:
# hidden layer parameters: map the m input features to nh hidden units
w1 = torch.randn(m, nh)
b1 = torch.randn(nh)
# for the output, we use only 1 value because we will be using MSE
# and not cross entropy yet,
# so think of it as trying to predict an integer from 0 to c-1
w2 = torch.randn(nh, 1)
b2 = torch.randn(1)

In [5]:
def lin(x, w, b):
    """Affine transform: matrix-multiply `x` by `w`, then add `b`."""
    projected = x @ w
    return projected + b

Let's verify that our linear function is working correctly.

In [6]:
# sanity check: the first layer should yield nh values per training row
t = lin(x_train, w1, b1)
t.shape

torch.Size([50000, 50])

In [7]:
def relu(x):
    """Rectified linear unit: clamp every entry of `x` to be at least 0."""
    return x.clamp(min=0.)

In [8]:
# apply the non-linearity to the linear output; negative entries become 0
t = relu(t)
t

tensor([[11.65, 11.50,  0.00,  ...,  6.04,  1.36,  9.80],
        [27.46,  7.34,  5.09,  ...,  0.00,  0.00, 15.98],
        [ 4.80,  0.00,  1.47,  ...,  0.00,  5.63,  3.14],
        ...,
        [ 0.26,  3.48,  0.00,  ...,  1.26,  5.81, 17.91],
        [ 0.00,  5.41,  0.00,  ...,  2.79,  0.07, 10.75],
        [ 7.63, 15.03,  0.00,  ..., 14.50,  2.60, 16.91]])

Now let's define a simple model with 1 non-linear hidden layer, followed by the output layer.

In [9]:
def model(X):
    """Forward pass: linear layer -> ReLU -> linear output layer.

    Uses the module-level parameters `w1`, `b1`, `w2` and `b2`.
    """
    hidden = relu(lin(X, w1, b1))
    return lin(hidden, w2, b2)

In [10]:
# one prediction per training row, since w2 has a single output column
res = model(x_train)
res.shape

torch.Size([50000, 1])

### Loss function
As mentioned earlier, we will use MSE as our loss function. This does not make much sense (since we are doing categorical classification and not regression), but we are doing this to simplify the initial backprop code.

In [11]:
# the predictions carry a trailing size-1 axis that the targets do not have
res.shape, y_train.shape

(torch.Size([50000, 1]), torch.Size([50000]))

In [12]:
# squeeze() drops the size-1 axis so the shape matches y_train
res.squeeze().shape

torch.Size([50000])

In [13]:
# convert the targets to float tensors for use with the MSE loss below
y_train, y_valid = y_train.float(), y_valid.float()

In [14]:
# predictions for the whole training set, kept for the loss computation below
preds = model(x_train)
preds.shape

torch.Size([50000, 1])

In [15]:
def mse(pred, actual):
    """Mean squared error between `pred` and `actual`."""
    err = pred - actual
    squared = err ** 2
    return squared.mean()

In [16]:
# loss of the randomly initialised model; squeeze so preds and targets align
mse(preds.squeeze(), y_train)

tensor(1966.65)

### Gradients and backward pass

In [36]:
def lin_grad(inp, out, w, b):
    """Backward step for `out = inp @ w + b`, given the upstream gradient `out.g`.

    Stores the computed gradients as `.g` attributes on `inp`, `w` and `b`.
    """
    upstream = out.g
    # the bias is added to every row, so its gradient sums over rows
    b.g = upstream.sum(dim=0)
    w.g = inp.t() @ upstream
    inp.g = upstream @ w.t()

In [37]:
def relu_grad(inp, out):
    """Backward step for `out = relu(inp)`: zero the upstream gradient where `inp <= 0`.

    Stores the result as `inp.g`.
    """
    # d(loss)/d(inp) = d(loss)/d(out) * d(out)/d(inp); the local slope is 0 or 1
    blocked = inp <= 0
    zero, one = torch.tensor(0.), torch.tensor(1.)
    local_slope = torch.where(blocked, zero, one)
    inp.g = local_slope * out.g

In [38]:
def forward_and_backward(input, target):
    """Run one forward pass and a manual backward pass through the two-layer net.

    Uses the module-level parameters `w1`, `b1`, `w2`, `b2`. After the call,
    gradients are stored as `.g` attributes on those parameters and on `input`.

    Args:
        input: batch of inputs, one row per sample.
        target: one target value per sample.

    Returns:
        The mean squared error loss of the forward pass.
    """
    # forward pass
    lout1 = lin(input, w1, b1)
    rout1 = relu(lout1)
    lout2 = lin(rout1, w2, b2)
    # squeeze only the trailing axis: a bare squeeze() would also drop the
    # batch axis for a single-row batch and break diff[:, None] below
    diff = lout2.squeeze(-1) - target
    loss = (diff ** 2).mean()

    # backward pass
    # derivative of mean((pred - target)^2) wrt pred, restored to column shape
    lout2.g = (2 * diff[:, None] / input.shape[0])
    # get gradients for rout1, w2, b2
    lin_grad(rout1, lout2, w2, b2)
    # get gradients for lout1
    relu_grad(lout1, rout1)
    # get gradients for input, w1, b1
    lin_grad(input, lout1, w1, b1)
    return loss

In [39]:
# populates .g on w1, b1, w2, b2 and x_train
forward_and_backward(x_train, y_train)

In [40]:
# Save for testing against later
def get_grad(x):
    """Return a copy of the manually computed gradient stored on `x.g`."""
    stored = x.g
    return stored.clone()
# tensors whose hand-computed gradients are snapshotted, in this fixed order
chks = w1,w2,b1,b2,x_train
grads = w1g,w2g,b1g,b2g,ig = tuple(map(get_grad, chks))

We use PyTorch's `autograd` to check our results.

In [41]:
def mkgrad(x):
    """Return a copy of `x` that autograd will track; `x` itself is untouched."""
    tracked = x.clone()
    tracked.requires_grad_(True)
    return tracked
# autograd-tracked copies, in the same order as chks
ptgrads = w12,w22,b12,b22,xt2 = tuple(map(mkgrad, chks))

In [42]:
def forward(inp, tgt):
    """Forward pass and MSE loss computed on the autograd-tracked copies.

    Uses the module-level tensors `w12`, `b12`, `w22` and `b22`.
    """
    hidden = relu(lin(inp, w12, b12))
    pred_col = lin(hidden, w22, b22)
    return mse(pred_col.squeeze(), tgt)

In [43]:
# let autograd fill in .grad on the tracked copies
loss = forward(xt2, y_train)
loss.backward()

In [44]:
# a is a hand-computed gradient, b the matching autograd-tracked tensor;
# the two gradients must agree within eps
for a,b in zip(grads, ptgrads): 
    print(a.shape, b.shape)
    test_close(a, b.grad, eps=0.01)

torch.Size([784, 50]) torch.Size([784, 50])
torch.Size([50, 1]) torch.Size([50, 1])
torch.Size([50]) torch.Size([50])
torch.Size([1]) torch.Size([1])
torch.Size([50000, 784]) torch.Size([50000, 784])


Our implementation is working well! Now we want to refactor the model.

## Refactor model

### Layers as classes

In [98]:
class Module():
    """Minimal layer base class that caches the forward call for the backward pass.

    Calling an instance stores the positional arguments in `self.args` and the
    result in `self.out`; `backward()` then hands both to `bwd`.
    Subclasses implement `forward(*args)` and `bwd(out, *args)`.
    """
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    # The base hooks accept any arguments so that a subclass which forgot to
    # override them fails with "not implemented" rather than an arity TypeError.
    def forward(self, *args): raise NotImplementedError('not implemented')
    def backward(self): self.bwd(self.out, *self.args)
    def bwd(self, *args): raise NotImplementedError('not implemented')

In [99]:
class ReLu(Module):
    """ReLU layer with a manual backward pass."""
    def forward(self, inp):
        return inp.clamp(min=0.)

    def bwd(self, out, inp):
        # local slope: 0 where inp <= 0, 1 elsewhere
        slope = torch.where(inp <= 0, torch.tensor(0.), torch.tensor(1.))
        inp.g = slope * out.g

In [100]:
class Linear(Module):
    """Affine layer `inp @ w + b` with a manual backward pass.

    Args:
        w: weight tensor.
        b: bias tensor.
    """
    def __init__(self, w, b):
        self.w, self.b = w, b

    def forward(self, inp):
        return inp@self.w + self.b

    def bwd(self, out, inp):
        # Read the upstream gradient from the `out` argument throughout,
        # rather than mixing it with `self.out`.
        inp.g = out.g @ self.w.t()
        self.w.g = inp.t() @ out.g
        # the bias is added to every row, so its gradient sums over rows
        self.b.g = torch.sum(out.g, dim=0)

In [101]:
class MSE(Module):
    """Mean squared error loss with a manual backward pass.

    `forward` expects predictions with a trailing size-1 axis and one target
    per row; `bwd` stores the gradient of the loss on the predictions as `.g`.
    """
    def forward(self, inp, tgt):
        self.inp, self.tgt = inp, tgt
        # Squeeze only the trailing axis: a bare squeeze() would also drop the
        # batch axis for a single-row batch and break diff[:, None] in bwd.
        self.diff = inp.squeeze(-1) - tgt
        return (self.diff ** 2).mean()

    def bwd(self, out, inp, tgt):
        # derivative of mean(diff^2) wrt the predictions, restored to column shape
        inp.g = (2. * self.diff[:, None]) / tgt.shape[0]

In [102]:
class Model():
    """Two-layer network plus MSE loss, built from the hand-written layers."""
    def __init__(self, w1, b1, w2, b2):
        self.loss = MSE()
        self.layers = [Linear(w1, b1), ReLu(), Linear(w2, b2)]

    def __call__(self, x, tgt):
        # feed the input through each layer in turn, then compute the loss
        acts = x
        for layer in self.layers:
            acts = layer(acts)
        return self.loss(acts, tgt)

    def backward(self):
        # start at the loss, then walk the layers from last to first
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [103]:
# NOTE: this rebinds `model`, replacing the function defined earlier.
# The same w1/b1/w2/b2 tensors are reused, so backward() overwrites their .g
model = Model(w1, b1, w2, b2)
loss = model(x_train, y_train)
model.backward()

In [104]:
# the class-based backward pass must reproduce the gradients saved earlier
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

## Autograd
Now that we have implemented our own version of `Module`, we can use PyTorch's version.

In [105]:
from torch import nn
import torch.nn.functional as F

In [106]:
class Linear(nn.Module):
    """Affine layer `inp @ w + b` whose gradients are computed by autograd.

    Args:
        n_in: number of input features.
        n_out: number of output features.
    """
    def __init__(self, n_in, n_out):
        super().__init__()
        # nn.Parameter registers the tensors with the module, so they show up
        # in parameters() and state_dict(); it also sets requires_grad=True.
        self.w = nn.Parameter(torch.randn(n_in, n_out))
        self.b = nn.Parameter(torch.zeros(n_out))

    def forward(self, inp):
        return inp@self.w + self.b

In [108]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear network that returns the MSE loss directly.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs (the target is reshaped to a single column,
            so this presumably should be 1 -- confirm before using other values).
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList registers the layers as submodules; a plain Python
        # list would hide them from the parent module.
        self.layers = nn.ModuleList([Linear(n_in, nh), nn.ReLU(), Linear(nh, n_out)])

    # Define forward() rather than overriding __call__, so that
    # nn.Module.__call__ (and the hooks it runs) stays in the call path.
    def forward(self, x, tgt):
        for l in self.layers: x = l(x)
        return F.mse_loss(x, tgt[:, None])

In [109]:
# build a freshly initialised model with a single output and let autograd
# compute the gradients of the loss
model = Model(m, nh, 1)
loss = model(x_train, y_train)
loss.backward()

In [110]:
# inspect the bias gradient of the first linear layer, filled in by loss.backward()
l0 = model.layers[0]
l0.b.grad

tensor([-14.69,   7.04, -16.46,  39.75,  96.35,  18.34,  64.73, -52.65,  77.14,   6.30,  28.86, -25.50,  -0.42,   2.91,
         32.97,  -3.70,   5.94,   6.37, -39.08,  -2.62, -42.50,  16.05, -15.62,  39.80,   5.66, -22.96,  -0.89, -39.27,
          9.37, -12.24,  14.09,  50.00,  -0.96,  -1.72,   5.20,  14.88,   4.63,   4.50,   1.00,  -7.72, -19.31, -23.15,
         -2.87,  -5.87,  37.45, 146.43,  24.27, -82.26,  -5.01,  -2.53])