In [1]:
from torch import tensor

In [2]:
t1 = tensor([[1], [2], [3]])
t1, t1.shape

(tensor([[1],
         [2],
         [3]]),
 torch.Size([3, 1]))

In [3]:
t2 = tensor([1,2,3])
t2, t2.shape

(tensor([1, 2, 3]), torch.Size([3]))

In [4]:
t1-t2

tensor([[ 0, -1, -2],
        [ 1,  0, -1],
        [ 2,  1,  0]])

what actually is happening is t1 and t2 both are getting broadcasted as the axis matching happens from right to left

We either have to select all the elements in the coloumn or have to `unsqueeze` the tensor

In [5]:
t1[:, 0].shape, t1[:, 0]

(torch.Size([3]), tensor([1, 2, 3]))

In [6]:
t1.squeeze()

tensor([1, 2, 3])

`.squeeze` method removes unit dimmensions.

In [7]:
t1[None,:, None].shape, t1[None,:, None].squeeze().shape

(torch.Size([1, 3, 1, 1]), torch.Size([3]))

Understanding unsqueeze

In [8]:
x = tensor([1, 2, 3, 4])

In [9]:
x.unsqueeze(1)

tensor([[1],
        [2],
        [3],
        [4]])

here `.unsqueeze` method can take dimm -2, -1, 0, 1 which is current dimm_size + 1

# The forward and backward pass

In [10]:
def relu(x): return x.clamp_min(0.)

In [11]:
# Linear unit
def lin(x, w, b): return x@w + b

Gradient for linear output wrt to W: $$\frac{\partial\text{y}} {\partial\text{weight}} = \text{input}^T$$
Gradient for linear output wrt to B: $$\frac{\partial\text{y}} {\partial\text{bias}} = \text{1}$$

In [12]:
def lin_grad(inp, out, w, b):
    # grad of matmul with respect to input
    inp.g = out.g @ w.t()
    w.g = (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)
    b.g = out.g.sum(0)

- inp.g -> how does loss is affected with change in input
$$ \frac{\partial\text{loss}}{\partial\text{input}} = \frac{\partial\text{loss}} {\partial\text{output}} * \frac{\partial\text{output}}{\partial\text{input}}$$
we know that $$\frac{\partial\text{output}}{\partial\text{input}} = \text{weight}^T$$
so, $$ \frac{\partial\text{loss}}{\partial\text{input}} = \frac{\partial\text{loss}} {\partial\text{output}} * \text{weight}^T$$
- w.g -> how does loss is affected with change in weight
$$ \frac{\partial\text{loss}}{\partial\text{weight}} = \frac{\partial\text{loss}} {\partial\text{output}} * \frac{\partial\text{output}}{\partial\text{weight}}$$
we know that $$\frac{\partial\text{output}}{\partial\text{weight}} = \text{input}^T$$
so, $$ \frac{\partial\text{loss}}{\partial\text{input}} = \frac{\partial\text{loss}} {\partial\text{output}} * \text{input}^T$$
- b.g -> how does loss is affected with change in bias
$$ \frac{\partial\text{loss}}{\partial\text{bias}} = \frac{\partial\text{loss}} {\partial\text{output}} * \frac{\partial\text{output}}{\partial\text{bias}}$$
we know that $$\frac{\partial\text{output}}{\partial\text{bias}} = 1$$
so, $$ \frac{\partial\text{loss}}{\partial\text{bias}} = \sum\frac{\partial\text{loss}} {\partial\text{output}}$$

In [13]:
def forward_and_backward(inp, targ):
    # forward pass:
    l1 = lin(inp, w1, b1)
    l2 = relu(l1)
    out = lin(l2, w2, b2)
    diff = out[:,0]-targ    #Difference between the prediction and label|| Diff=output-target
    loss = diff.pow(2).mean()   #MSE = diff^2/N
    
    # backward pass:
    out.g = 2.*diff[:,None] / inp.shape[0]   #out.g = 2Diff/N
    lin_grad(l2, out, w2, b2)
    l1.g = (l1>0).float() * l2.g
    lin_grad(inp, l1, w1, b1)

| Forward Pass | |
|---|---|
| Layer 1: $$\text{l1}={input}⋅{w1}+{b1}$$  | w1 and b1 contains gradients |
| ReLU: $$ReLU(l1)=max(0,l1)$$ | l1 contains grads because |
| Layer 2: $$\text{out}={l2}⋅{w2}+{b2}$$  |
| MSE: $$\text{loss}=\frac{1}{N}\sum_{i=1}^{N}(\text{out}_i-\text{targ}_i)^2$$ |

| Backward Pass |
|---|
| Gradient of the Loss with respect to the Output: $$\frac{\partial_{\text{loss}}} {\partial_{\text{out}}} = \frac{2}{N} (\text{out} - \text{targ})$$ |
| Gradient of the Second Linear Layer: Weights$$\frac{\partial_{\text{loss}}} {\partial_{\text{w2}}} = \text{l2}^\text{T}$$ Bias $$\frac{\partial_{\text{loss}}} {\partial_{\text{b2}}} = \sum\frac{\partial_{\text{loss}}} {\partial_{\text{out}}}$$|
| Gradient Through ReLU Activation: $$\frac{\partial_{\text{loss}}} {\partial_{\text{l1}}} = \frac{\partial_{\text{loss}}} {\partial_{\text{l1}}} $$|
| Gradient of First Linear Layer: $$\frac{\partial_{\text{loss}}} {\partial_{\text{w1}}} = \text{inp}^T \frac{\partial_{\text{loss}}} {\partial_{\text{l1}}}$$ $$\frac{\partial_{\text{loss}}} {\partial_{\text{b1}}} = \sum\frac{\partial_{\text{loss}}} {\partial_{\text{l1}}}$$|

In [14]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F

In [15]:
from fastcore.test import test_close

torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
with gzip.open(path_gz, 'rb') as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding = 'latin-1')
    x_train, y_train, x_valid, y_valid = map(tensor,[x_train, y_train, x_valid, y_valid])

In [16]:
x_train.shape, y_train.shape

(torch.Size([50000, 784]), torch.Size([50000]))

In [17]:
x_valid.shape, y_valid.shape

(torch.Size([10000, 784]), torch.Size([10000]))

In [18]:
n, m = x_train.shape
c = y_train.max() + 1
n, m, c

(50000, 784, tensor(10))

In [19]:
# number of hidden activations
nh = 50

In [20]:
w1 = torch.randn(m, nh)
b1 = torch.randn(nh)
w2 = torch.randn(50, 1)
b2 = torch.randn(1)

#### Passing Validation set through model

In [21]:
# Passing validation set through layer1
t = lin(x_valid, w1, b1)
t.shape

torch.Size([10000, 50])

In [22]:
t = relu(t)
t

tensor([[11.71,  3.73,  0.00,  ...,  0.00,  0.00,  0.23],
        [ 7.35, 12.49,  1.27,  ...,  3.99,  0.00,  0.00],
        [ 9.48,  1.74,  0.00,  ...,  0.00,  0.38,  5.12],
        ...,
        [10.52, 10.22,  0.00,  ...,  0.00,  0.00,  0.42],
        [ 9.36,  0.61,  0.00,  ...,  0.00,  0.00,  0.00],
        [10.39, 14.15,  0.00,  ...,  0.00,  0.00,  0.00]])

In [23]:
def model(xb):
    l1 = lin(xb, w1, b1)
    l2 = relu(l1)
    return lin(l2, w2, b2)

In [24]:
res = model(x_valid)
res.shape

torch.Size([10000, 1])

## Loss Function: MSE
of course, `mse` is not suitable for multi-class classification; we'll use a better loss function soon.
we'll use mse for now to keep things simple

In [25]:
res.shape, y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [26]:
# getting rid of the unit axis
(res.squeeze() - y_valid).shape

torch.Size([10000])

In [27]:
y_train, y_valid = y_train.float(), y_valid.float()

In [28]:
y_train.shape

torch.Size([50000])

In [29]:
preds = model(x_train)
preds.shape

torch.Size([50000, 1])

In [30]:
def mse(output, target): return (output[:, 0]-target).pow(2).mean()

In [31]:
mse(preds, y_train)

tensor(1099.57)

### Gradient & Backward Pass

In [32]:
from sympy import symbols, diff
x, y = symbols('x y')
diff(x**2, x)

2*x

In [33]:
diff(3*x**2 + 9, x)

6*x

In [34]:
def lin_grad(inp, out, w, b):
    # grad of matmul with respect to input
    inp.g = out.g @ w.t()
    w.g = (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)
    b.g = out.g.sum(0)

In [35]:
def forward_and_backward(inp, target):
    # forward pass
    l1 = lin(inp, w1, b1)     # l1.shape (input.row, 50)
    l2 = relu(l1)             
    out = lin(l2, w2, b2)      # l2.shape (input.row, 1)
    diff = out[:, 0] - target
    loss = diff.pow(2).mean()  # MSE

    # backward pass
    out.g = 2. * diff[:, None] / inp.shape[0]
    lin_grad(l2, out, w2, b2)
    l1.g = (l1>0).float() * l2.g
    lin_grad(inp, l1, w1, b1)

In [36]:
forward_and_backward(x_train, y_train)

In [37]:
x_train.g.shape

torch.Size([50000, 784])

In [38]:
# Save for testing later
def get_grad(x): return x.g.clone()
chks = w1, w2, b1, b2, x_train
grads = w1g, w2g, b1g, b2g, ig = tuple(map(get_grad, chks))

We cheat a little bit and use PyTorch autograd to check our results.

In [39]:
def mkgrad(x):
    return x.clone().requires_grad_()
ptgrads = w12,w22,b12,b22,xt2 = tuple(map(mkgrad, chks))

In [40]:
def forward(inp, targ):
    l1 = lin(inp, w1, b1)
    l2 = relu(l1)
    out = lin(l2, w2, b2)
    return mse(out, targ)

In [41]:
loss = forward(xt2, y_train)
loss.backward()

In [42]:
xt2.grad.shape

torch.Size([50000, 784])

What is the use of the input gradients? 
- It doesn't take part in the model training process.

In [43]:
for a,b in zip(grads, ptgrads): test_close(a, b.grad, eps=0.01)

TypeError: unsupported operand type(s) for -: 'Tensor' and 'NoneType'

## Refactor Model

### Layer as classes

In [44]:
class Relu():
    def __call__(self, inp):
        self.inp = inp
        self.out = inp.clamp_min(0.)
        return self.out
    def backward(self): self.inp.g = (self.inp>0).float() * self.out.g

In [45]:
class Lin():
    def __init__(self, w, b): self.w, self.b = w, b

    def __call__(self, inp):
        self.inp = inp
        self.out = lin(self.inp, self.w, self.b)
        return self.out

    def backward(self):
        self.inp.g = self.out.g @ self.w.t()
        self.w.g = self.inp.t() @ self.out.g
        self.b.g = self.out.g.sum(0)

In [46]:
class MSE():
    def __call__(self, inp, target):
        self.inp, self.target = inp, target
        self.out = mse(inp, target)
        return self.out

    def backward(self):
        self.inp.g = 2* (self.inp.squeeze() -self.target).unsqueeze(-1) / self.target.shape[0]

In [47]:
class Model():
    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = MSE()

    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x, targ)

    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers): l.backward()

In [48]:
model = Model(w1, b1, w2, b2)

In [49]:
loss = model(x_train, y_train)

In [50]:
model.backward()

In [51]:
loss

tensor(1099.57)

In [52]:
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

## Module.forward()
Module is a parent/core class which would be inherited and modified by child classes

In [53]:
class Module():
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self): raise Exception('not implemented')
    def backward(self): self.bwd(self.out, *self.args)
    def bwd(self): raise Exception('not implemented')

In [54]:
class Relu(Module):
    def forward(self, inp): return inp.clamp_min(0.)
    def bwd(self, out, inp): inp.g = (inp>0).float() * out.g

In [55]:
class Lin(Module):
    def __init__(self, w, b): self.w,self.b = w,b
    def forward(self, inp): return inp@self.w + self.b
    def bwd(self, out, inp):
        inp.g = self.out.g @ self.w.t()
        self.w.g = inp.t() @ self.out.g
        self.b.g = self.out.g.sum(0)

In [56]:
class Mse(Module):
    def forward (self, inp, targ): return (inp.squeeze() - targ).pow(2).mean()
    def bwd(self, out, inp, targ): inp.g = 2*(inp.squeeze()-targ).unsqueeze(-1) / targ.shape[0]

In [57]:
model = Model(w1, b1, w2, b2)

In [58]:
loss = model(x_train, y_train)

In [59]:
model.backward()

In [60]:
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

## Autograd

In [61]:
from torch import nn
import torch.nn.functional as F

In [62]:
class Linear(nn.Module):
    def __init__(self, n_in, n_out):
        super().__init__()
        self.w = torch.randn(n_in, n_out).requires_grad_()
        self.b = torch.zeros(n_out).requires_grad_()
    def forward(self, inp):  return inp@self.w + self.b

In [63]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [Linear(n_in, nh), nn.ReLU(), Linear(nh, n_out)]

    def __call__(self, x, targ):
        for l in self.layers: x=l(x)
        return F.mse_loss(x, targ[:, None])

In [64]:
model = Model(m, nh, 1)
loss = model(x_train, y_train)
loss.backward()
    

In [65]:
l0 = model.layers[0]
l0.b.grad

tensor([-0.05,  4.98, -3.67,  1.19,  3.03, -0.28,  0.27, 13.75,  3.20,  8.55,  1.98,  0.80,  0.65,  0.25, -4.20, -1.09, -3.15,  0.18, -1.19,
        -5.05, -0.81, -2.78,  7.54, -0.73,  3.71,  0.25,  0.70, -7.06,  3.32, -1.14, -3.12,  1.51,  5.81,  1.22, 15.94,  0.14,  1.82, 14.89,
         3.91, -0.21,  1.98,  2.24,  0.63, 16.29, -0.50,  0.53, 21.31, -9.06, -2.52,  0.05])