In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F

In [2]:
from fastcore.test import test_close

In [3]:
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

In [4]:
path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## Basic training loop
Basically the training loop repeats over the following steps:
- get the output of the model on a batch of inputs
- compare the output to the labels we have and compute a loss
- calculate the gradients of the loss with respect to every parameter of the model
- update said parameters with those gradients to make them a little bit better

In [5]:
loss_func = F.cross_entropy

In [6]:
n, m = x_train.shape
c = y_train.max() + 1
nh=50
n,m,c, nh

(50000, 784, tensor(10), 50)

In [7]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]

    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x

In [8]:
model = Model(m, nh, 10)
pred = model(x_train)
pred

tensor([[-0.09, -0.21, -0.08,  ..., -0.03,  0.01,  0.06],
        [-0.07, -0.14, -0.14,  ...,  0.03,  0.04,  0.14],
        [-0.19, -0.04,  0.02,  ..., -0.01, -0.00,  0.02],
        ...,
        [-0.03, -0.22, -0.04,  ..., -0.01,  0.09,  0.14],
        [-0.10, -0.09, -0.05,  ..., -0.01,  0.02,  0.11],
        [-0.03, -0.25, -0.06,  ...,  0.00,  0.03,  0.14]], grad_fn=<AddmmBackward0>)

In [9]:
# bs
bs = 50
xb = x_train[0: bs]
preds = model(xb)

In [10]:
yb = y_train[0:bs]
loss_func(preds, yb)

tensor(2.30, grad_fn=<NllLossBackward0>)

In [11]:
preds.argmax(dim=1)

tensor([3, 9, 3, 8, 5, 9, 3, 9, 3, 9, 5, 3, 9, 9, 3, 9, 9, 5, 8, 7, 9, 5, 3, 8, 9, 5, 9, 5, 5, 9, 3, 5, 9, 7, 5, 7, 9, 9, 3, 9, 3, 5, 3, 8,
        3, 5, 9, 5, 9, 5])

In [15]:
def accuracy(out,yb): return (out.argmax(dim=1)==yb).float().mean()

In [16]:
accuracy(preds, yb)

tensor(0.08)

In [17]:
lr = 0.5
epochs = 3

In [18]:
def report(loss, preds, yb):
    print(f'{loss:.2f}, {accuracy(preds, yb):.2f}')

In [21]:
for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n, i+bs))
        xb, yb = x_train[s], y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias -= l.bias.grad * lr
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()
    report(loss, preds, yb)

0.11, 0.96
0.06, 0.98
0.05, 1.00


In [27]:
## Using parameters and Optimizer
# Parameters

m1 = nn.Module()
m1.foo = nn.Linear(3, 4)
m1.bar = nn.Linear(4, 1)
m1

Module(
  (foo): Linear(in_features=3, out_features=4, bias=True)
  (bar): Linear(in_features=4, out_features=1, bias=True)
)

In [28]:
m1.named_children(), list(m1.named_children())

(<generator object Module.named_children at 0x7f6da0437a70>,
 [('foo', Linear(in_features=3, out_features=4, bias=True)),
  ('bar', Linear(in_features=4, out_features=1, bias=True))])

In [29]:
list(m1.parameters())

[Parameter containing:
 tensor([[ 0.21, -0.44, -0.17],
         [-0.40, -0.56,  0.52],
         [-0.24,  0.34,  0.08],
         [-0.47, -0.47,  0.14]], requires_grad=True),
 Parameter containing:
 tensor([ 0.33,  0.02,  0.03, -0.13], requires_grad=True),
 Parameter containing:
 tensor([[ 0.33, -0.33,  0.38, -0.08]], requires_grad=True),
 Parameter containing:
 tensor([-0.45], requires_grad=True)]

In [30]:
class MLP(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)
        self.relu = nn.ReLU()
    def forward(self, x):
        return self.l2(self.relu(self.l1(x)))

In [32]:
model = MLP(m, nh, 10)
model

MLP(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
  (relu): ReLU()
)

In [33]:
for name, l in model.named_children():
    print(f'{name}: {l}')

l1: Linear(in_features=784, out_features=50, bias=True)
l2: Linear(in_features=50, out_features=10, bias=True)
relu: ReLU()


In [34]:
for p in model.parameters(): print(p.shape)

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


In [37]:
def fit():
    for epoch in range(epochs):
        for i in range(0, n, bs):
            s = slice(i, min(n, i+bs))
            xb, yb = x_train[s], y_train[s]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()
        report(loss, preds, yb)

In [38]:
fit()

0.17, 0.94
0.12, 0.98
0.09, 0.96


Behind the scenes, PyTorch overides the `__setattr__` function in `nn.Module` so that the submodules you define are properly registered as parameters of the model.

In [39]:
class MyModule:
    def __init__(self, n_in, nh, n_out):
        self._modules = {}
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)
    def __setattr__(self, k, v):
        if not k.startswith("_"): self._modules[k] = v
        super().__setattr__(k,v)
    def __repr__(self): return f'{self._modules}'
    def parameters(self):
        for l in self._modules.values():
            yield from l.parameters()

In [40]:
mdl = MyModule(m, nh, 10)
mdl

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [41]:
for p in model.parameters(): print(p.shape)

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


## Registering modules

In [42]:
from functools import reduce

we can use the orignal `layers` approach, but we have to register the modules

In [45]:
layers = [nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10)]

In [46]:
class Model(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        for i, l in enumerate(self.layers):
            # add_module method is pytorch method 
            # to acknowledge the layers
            self.add_module(f'layer_{i}', l) 
    def forward(self, x):
        return reduce(lambda val, layer: layer(val), self.layers, x)

In [47]:
model = Model(layers)
model

Model(
  (layer_0): Linear(in_features=784, out_features=50, bias=True)
  (layer_1): ReLU()
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

In [48]:
model(xb).shape

torch.Size([50, 10])

In [53]:
## nn.ModuleLIST
# It does the same thing as above 
# It takes a list of layers
class Sequential(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)
    def forward(self, x):
        for l in self.layers: x = l(x)
        return x

In [54]:
model = Sequential(layers)
model

Sequential(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [55]:
fit()

0.12, 0.96
0.05, 0.98
0.03, 1.00


In [58]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.02, grad_fn=<NllLossBackward0>), tensor(1.))

In [59]:
## Optim
class Optimizer():
    def __init__(self, params, lr=0.5):
        self.params, self.lr = list(params), lr
    def step(self):
        with torch.no_grad():
            for p in self.params:
                p -= p.grad * lr
    def zero_grad(self):
        for p in self.params: p.grad.data.zero_()

In [63]:
model = nn.Sequential(nn.Linear(m, nh),
                      nn.ReLU(),
                      nn.Linear(nh, 10))
model

Sequential(
  (0): Linear(in_features=784, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=10, bias=True)
)

In [64]:
opt = Optimizer(model.parameters())

In [66]:
for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n, i+bs))
        xb, yb = x_train[s], y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    report(loss, preds, yb)

0.03, 1.00
0.04, 0.98
0.03, 1.00


Pytorch Provides the same functionality with SGD, ADAM

In [71]:
from torch import optim

In [72]:
def get_model():
    model = nn.Sequential(nn.Linear(m, nh), 
                          nn.ReLU(),
                          nn.Linear(nh, n))
    return model, optim.SGD(model.parameters(), lr=lr)

In [74]:
model, opt = get_model()
loss_func(model(xb), yb)

tensor(10.83, grad_fn=<NllLossBackward0>)

In [76]:
for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n, i+bs))
        xb, yb = x_train[s], y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    report(loss, preds, yb)

0.16, 0.96
0.16, 0.94
0.11, 0.98
