## The forward and backward pass

In [75]:
import pickle, gzip, math, os, time, shutil, torch, matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close
torch.manual_seed(42)

mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

from urllib.request import urlretrieve
if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)

with gzip.open(path_gz, 'rb') as f:
  ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## Foundations version

### Basic architecture

In [8]:
n, m = x_train.shape
c = y_train.max() + 1
n,m,c

(50000, 784, tensor(10))

In [9]:
# num hidden
nh = 50

In [11]:
w1 = torch.randn(m,nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [12]:
def lin(x,w,b):
  return x@w + b

In [13]:
t = lin(x_valid, w1, b1)
t.shape

torch.Size([10000, 50])

In [14]:
def relu(x):
  return x.clamp_min(0.)

In [15]:
t = relu(t)
t

tensor([[ 0.00,  0.00,  0.00,  ..., 13.72,  3.09,  0.00],
        [ 8.38,  0.00,  0.00,  ...,  4.98,  0.00,  0.00],
        [ 1.60,  0.00,  0.00,  ..., 12.74,  0.00,  0.00],
        ...,
        [ 0.00,  0.00,  0.00,  ...,  0.96,  4.75,  0.00],
        [ 0.24,  4.24,  0.00,  ..., 17.95,  1.31,  0.00],
        [ 0.00,  0.00,  0.00,  ...,  6.28,  0.80,  4.29]])

In [16]:
def model(xb):
  l1 = lin(xb, w1, b1)
  l2 = relu(l1)
  out = lin(l2, w2, b2)
  return out

In [17]:
res = model(x_valid)
res.shape

torch.Size([10000, 1])

### Loss function: MSE

In [22]:
res.shape, y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [29]:
(res.squeeze()-y_valid).shape

torch.Size([10000])

In [30]:
y_train, y_valid = y_train.float(), y_valid.float()

preds = model(x_train)
preds.shape

torch.Size([50000, 1])

In [63]:
def mse(output, target):
  return (output[:,0]-target).pow(2).mean()

In [35]:
mse(preds, y_train)

tensor(1879.49)

### Gradients and backward pass

In [38]:
from sympy import symbols, diff
x,y = symbols('x y')
diff(x**2, x)

2*x

In [46]:
def lin_grad(inp, out, w, b):
  # grad of matmul with respect to input
  inp.g = out.g @ w.t()
  w.g = inp.T @ out.g
  b.g = out.g.sum(0)

In [41]:
def forward_and_backward(inp, targ):
  # forward pass
  l1 = lin(inp, w1, b1)
  l2 = relu(l1)
  out = lin(l2, w2, b2)
  diff = out[:,0] - targ
  loss = diff.pow(2).mean()

  # backward pass
  out.g = 2.*diff[:,None] / inp.shape[0]
  lin_grad(l2, out, w2, b2)
  l1.g = (l1>0).float() * l2.g
  lin_grad(inp,l1,w1,b1)

In [44]:
forward_and_backward(x_train, y_train)

In [45]:
x_train.g

tensor([[-0.00,  0.00, -0.00,  ..., -0.00, -0.00, -0.00],
        [-0.00, -0.00,  0.00,  ...,  0.00,  0.00,  0.00],
        [-0.01, -0.00, -0.01,  ..., -0.00, -0.00,  0.00],
        ...,
        [ 0.00,  0.01, -0.01,  ...,  0.00, -0.01, -0.00],
        [-0.01, -0.00, -0.02,  ...,  0.00, -0.02, -0.01],
        [-0.00,  0.00, -0.01,  ...,  0.01, -0.03, -0.01]])

In [47]:
def get_grad(x):
  return x.g.clone()

chks = w1,w2,b1,b2,x_train
grads = w1g,w2g,b1g,b2g,ig = map(get_grad,chks)

In [48]:
def mkgrad(x):
  return x.clone().requires_grad_(True)
ptgrads = w12, w22, b12, b22, xt2 = map(mkgrad, chks)

In [49]:
def forward(inp, targ):
  l1 = lin(inp, w12, b12)
  l2 = relu(l1)
  out = lin(l2,w22,b22)
  return mse(out, targ)

In [50]:
loss = forward(xt2, y_train)
loss.backward()

In [51]:
for a, b in zip(grads, ptgrads):
  test_close(a.grad, b, eps=0.01)

## Refactor model

### Layers as classes

In [92]:
class ReLU():
  def __call__(self, inp):
    self.inp = inp
    self.out = inp.clamp_min(0.)
    return self.out
  
  def backward(self):
    self.inp.g = (self.inp>0).float() * self.out.g

In [93]:
class Lin():
  def __init__(self, w, b):
    self.w = w
    self.b = b
  
  def __call__(self, inp):
    self.inp = inp
    self.out = lin(inp, self.w, self.b)
    return self.out

  def backward(self):
    self.inp.g = self.out.g @ self.w.t()
    self.w.g = self.inp.t() @ self.out.g
    self.b.g = self.out.g.sum(0)

In [94]:
class Mse():
  def __call__(self, inp, targ):
    self.inp = inp
    self.targ = targ
    self.out = mse(inp, targ)
    return self.out
  
  def backward(self):
    self.inp.g = 2 * (self.inp.squeeze() - self.targ).unsqueeze(-1) / self.targ.shape[0]

In [95]:
class Model():
  def __init__(self, w1, b1, w2, b2):
    self.layers = [Lin(w1,b1), ReLU(), Lin(w2,b2)]
    self.loss = Mse()
  
  def __call__(self, x, targ):
    for l in self.layers:
      x = l(x)
      print(x.shape, targ.shape)
      return self.loss(x, targ)
    
  def backward(self):
    self.loss.backward()
    for l in reversed(self.layers):
      l.backward()

In [96]:
model = Model(w1, b1, w2, b2)

In [97]:
loss = model(x_train, y_train)

torch.Size([50000, 50]) torch.Size([50000])


### Module.forward()

In [119]:
class Module():
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self): raise Exception('not implemented')
    def backward(self): self.bwd(self.out, *self.args)
    def bwd(self): raise Exception('not implemented')


In [120]:
class Relu(Module):
    def forward(self, inp): return inp.clamp_min(0.)
    def bwd(self, out, inp): inp.g = (inp>0).float() * out.g

In [121]:

class Lin(Module):
    def __init__(self, w, b): self.w,self.b = w,b
    def forward(self, inp): return inp@self.w + self.b
    def bwd(self, out, inp):
        inp.g = self.out.g @ self.w.t()
        self.w.g = inp.t() @ self.out.g
        self.b.g = self.out.g.sum(0)


In [122]:
class Mse(Module):
    def forward (self, inp, targ): return (inp.squeeze() - targ).pow(2).mean()
    def bwd(self, out, inp, targ): inp.g = 2*(inp.squeeze()-targ).unsqueeze(-1) / targ.shape[0]

In [123]:
model = Model(w1, b1, w2, b2)

In [124]:
loss = model(x_train, y_train)

torch.Size([50000, 50]) torch.Size([50000])


RuntimeError: ignored

In [118]:
model.backward()

AttributeError: ignored

### Autograd

In [125]:
from torch import nn
import torch.nn.functional as F

In [132]:
class Linear(nn.Module):
  def __init__(self, n_in, n_out):
    super().__init__()
    self.w = torch.randn(n_in, n_out).requires_grad_()
    self.b = torch.zeros(n_out).requires_grad_()

  def forward(self, inp):
    return inp @ self.w + self.b

In [133]:
class Model(nn.Module):
  def __init__(self, n_in,nh, n_out):
    super().__init__()
    self.layers = [Linear(n_in, n_out), nn.ReLU(), Linear(nh, n_out)]
  
  def __call__(self, x, targ):
    for l in self.layers:
      x = l(x)
    return F.mse_loss(x, targ[:,None])

In [138]:
model = Model(m,nh, 10)

In [139]:
loss = model(x_train, y_train)

RuntimeError: ignored

In [None]:
x_train.shape

In [137]:
c

tensor(10)