## Loss Functions, Optimizers, & The Training Loop

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [71]:
import math
import gzip
import pickle
import numpy as np
import pandas as pd

In [60]:
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.nn import init
from torch import tensor

In [55]:
from fastai import datasets

In [31]:
def get_data(MNIST_URL = 'http://deeplearning.net/data/mnist/mnist.pkl'):
    """Fetch the gzipped MNIST pickle and return its first two splits as tensors.

    The archive path is obtained from `datasets.download_data(MNIST_URL, ext='.gz')`.
    The pickle is expected to hold three items; the first two are `(X, y)` pairs
    (used as training and validation data) and the third is discarded.

    Returns a lazy `map` yielding, in order, `X_train, y_train, X_val, y_val`
    converted with `torch.tensor`. The iterator can be consumed only once.

    NOTE: `pickle.load` can execute arbitrary code; only use this with a
    trusted `MNIST_URL`.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(archive, 'rb') as fh:
        train_split, val_split, _ = pickle.load(fh, encoding='latin-1')
    X_train, y_train = train_split
    X_val, y_val = val_split
    return map(tensor, (X_train, y_train, X_val, y_val))

def normalize(x, m, s):
    """Standardise `x` by subtracting `m` and then dividing by `s`."""
    centered = x - m
    return centered / s

In [32]:
torch.nn.modules.conv._ConvNd.reset_parameters??

[0;31mSignature:[0m [0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mmodules[0m[0;34m.[0m[0mconv[0m[0;34m.[0m[0m_ConvNd[0m[0;34m.[0m[0mreset_parameters[0m[0;34m([0m[0mself[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
    [0;32mdef[0m [0mreset_parameters[0m[0;34m([0m[0mself[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0minit[0m[0;34m.[0m[0mkaiming_uniform_[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mweight[0m[0;34m,[0m [0ma[0m[0;34m=[0m[0mmath[0m[0;34m.[0m[0msqrt[0m[0;34m([0m[0;36m5[0m[0;34m)[0m[0;34m)[0m[0;34m[0m
[0;34m[0m        [0;32mif[0m [0mself[0m[0;34m.[0m[0mbias[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m
[0;34m[0m            [0mfan_in[0m[0;34m,[0m [0m_[0m [0;34m=[0m [0minit[0m[0;34m.[0m[0m_calculate_fan_in_and_fan_out[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mweight[0m[0;34m)[0m[0;34m[0m
[0;34m[0m

In [33]:
X_train, y_train, X_test, y_test = get_data()

In [34]:
train_mean, train_std = X_train.mean(), X_train.std()

In [35]:
X_train = normalize(X_train, train_mean, train_std)
X_test = normalize(X_test, train_mean, train_std)

In [37]:
X_train = X_train.view(-1, 1, 28, 28)
X_test = X_test.view(-1, 1, 28, 28)
X_train.shape, X_test.shape

(torch.Size([50000, 1, 28, 28]), torch.Size([10000, 1, 28, 28]))

In [39]:
# Number of training samples.
n = X_train.shape[0]
# Number of classes, derived from the training labels (labels are 0-based, hence +1)
# rather than from the held-out set.
c = y_train.max() + 1
# Number of hidden units / conv filters used below.
nh = 32
n, c

(50000, tensor(10))

Let's create a `Conv2d` layer:

In [40]:
l1 = nn.Conv2d(in_channels=1, out_channels=nh, kernel_size=5)

In [98]:
x = X_test[:100]
y = y_test[:100]
x.shape, y.shape

(torch.Size([100, 1, 28, 28]), torch.Size([100]))

In [43]:
def stats(x):
    """Return the pair `(x.mean(), x.std())`."""
    mu = x.mean()
    sigma = x.std()
    return mu, sigma

In [44]:
stats(l1.weight), stats(l1.bias)

((tensor(-0.0031, grad_fn=<MeanBackward0>),
  tensor(0.1149, grad_fn=<StdBackward0>)),
 (tensor(0.0149, grad_fn=<MeanBackward0>),
  tensor(0.1316, grad_fn=<StdBackward0>)))

Let's check the output:

In [45]:
t = l1(x)

In [46]:
stats(t)

(tensor(0.0071, grad_fn=<MeanBackward0>),
 tensor(0.6753, grad_fn=<StdBackward0>))

We would like the outputs to have a mean of 0 and a standard deviation of 1. The mean is fine, but the standard deviation is not quite there.

Let's compare this to the normal Kaiming init with a leak of 1 because we're not using an activation function, remember:

$$LeakyReLU(x,\alpha)=\begin{cases}
x,  & \text{if $x \ge 0$} \\
\alpha x, & \text{if $x < 0$}
\end{cases}$$

When we switch to a normal Kaiming initialization with $a=1$, we get an output with $\mu \approx 0$ and $\sigma \approx 1$. So far so good.

Let's define a helper that applies the conv layer followed by a `LeakyReLU`, which defaults to a plain `ReLU` ($a=0$):

In [61]:
def f1(x, a=0):
    """Run `x` through the module-level layer `l1`, then a leaky ReLU.

    `a` is the negative slope; the default of 0 makes this a plain ReLU.
    `l1` is looked up at call time, so re-creating or re-initialising it
    changes what this function computes.
    """
    conv_out = l1(x)
    return F.leaky_relu(conv_out, negative_slope=a)

In [64]:
init.kaiming_normal_(l1.weight, a=0)
stats(f1(x))

(tensor(0.5330, grad_fn=<MeanBackward0>),
 tensor(1.0288, grad_fn=<StdBackward0>))

Due to the ReLU, the mean is no longer 0; it shifts to $\approx 1/2$.

Let's go back and look at how the `Conv2d` layer handles it:

In [65]:
l1 = nn.Conv2d(1, nh, 5)

In [66]:
stats(f1(x))

(tensor(0.1968, grad_fn=<MeanBackward0>),
 tensor(0.3751, grad_fn=<StdBackward0>))

In [67]:
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [68]:
rec_fs = l1.weight[0,0].numel()
rec_fs

25

In [69]:
nf, ni, *_ = l1.weight.shape
nf, ni

(32, 1)

Let's calculate the number of projections (mappings) **in** and **out**:

In [70]:
fan_in = ni * rec_fs
fan_out = nf * rec_fs
fan_in, fan_out

(25, 800)

In [73]:
def gain(a):
    """Return the leaky-ReLU gain `sqrt(2 / (1 + a^2))` for negative slope `a`."""
    denom = 1 + a**2
    return math.sqrt(2.0 / denom)

In [74]:
gain(1), gain(0), gain(0.01), gain(0.1), gain(math.sqrt(5.))

(1.0,
 1.4142135623730951,
 1.4141428569978354,
 1.4071950894605838,
 0.5773502691896257)

One thing we should remember is that the PyTorch team uses Kaiming **uniform** and not Kaiming **normal**, which have different $\sigma$ dynamics.

We want to know the standard deviation of a uniform distribution on $[-1,1]$:

In [75]:
torch.zeros(10000).uniform_(-1, 1).std()

tensor(0.5752)

In [76]:
1/math.sqrt(3.)

0.5773502691896258

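Notice that $1/\sqrt{3}$, the standard deviation of a uniform distribution on $[-1,1]$, is exactly the value of `gain(math.sqrt(5.))`. It seems that the PyTorch team wanted the gain to handle uniform random numbers instead of settling on normal distributions (but it still doesn't fully work).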

Let's implement our own version of kaiming:

In [77]:
def kaiming2(x, a, use_fan_out=False):
    """Fill the weight tensor `x` in place with Kaiming-uniform values.

    `x` is indexed as `(n_out, n_in, *kernel_dims)`. The fan is the kernel
    size times `n_in` (default) or times `n_out` when `use_fan_out` is true.
    Values are drawn uniformly from `[-bound, bound]` with
    `bound = sqrt(3) * gain(a) / sqrt(fan)`. Returns None.
    """
    out_ch, in_ch, *_ = x.shape
    # Number of elements in a single kernel slice (the receptive field size).
    receptive = x[0, 0].numel()
    if use_fan_out:
        fan = out_ch * receptive
    else:
        fan = in_ch * receptive
    target_std = gain(a) / math.sqrt(fan)
    # A uniform distribution on [-b, b] has std b / sqrt(3), hence the sqrt(3) factor.
    limit = math.sqrt(3.) * target_std
    x.data.uniform_(-limit, limit)

In [78]:
kaiming2(l1.weight, a=0);
stats(f1(x))

(tensor(0.5478, grad_fn=<MeanBackward0>),
 tensor(1.0748, grad_fn=<StdBackward0>))

This is good, let's showcase what PyTorch's default does:

In [79]:
kaiming2(l1.weight, a=math.sqrt(5.));
stats(f1(x))

(tensor(0.1922, grad_fn=<MeanBackward0>),
 tensor(0.3417, grad_fn=<StdBackward0>))

Let's take a look at a final distribution's variance after multiple convolutional layers:

In [81]:
class Flatten(nn.Module):
    """Reshape the input into a 1-D tensor (every dimension, batch included, is collapsed)."""

    def forward(self, x):
        # A view of length numel() is the same reshape as view(-1).
        total = x.numel()
        return x.view(total)

In [92]:
# Small conv net: four stride-2 convolutions (ReLU after the first three),
# a global average pool, then Flatten.
# For a 28x28 input the spatial size goes 28 -> 14 -> 7 -> 4 -> 2, then 1 after pooling.
m = nn.Sequential(
    nn.Conv2d(1, 8, 5, stride=2, padding=2), nn.ReLU(),
    nn.Conv2d(8, 16, 3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
    # Last conv has a single output channel and no activation after it.
    nn.Conv2d(32, 1, 3, stride=2, padding=1),
    nn.AdaptiveAvgPool2d(output_size=1),
    # Flatten collapses all dimensions, so the result is one value per input sample.
    Flatten(),
)

In [93]:
t = m(x)
stats(t)

(tensor(0.0694, grad_fn=<MeanBackward0>),
 tensor(0.0095, grad_fn=<StdBackward0>))

This looks like a really big problem: the standard deviation kept decreasing, from about 1 at the input layer to nearly 0 at the final layer.

Let's take a look at what happens for the loss:

In [96]:
def mse(output, targ):
    """Mean squared error between `output` (trailing size-1 dim dropped, if any) and `targ`."""
    preds = output.squeeze(-1)
    residual = preds - targ
    return residual.pow(2).mean()

In [99]:
l = mse(t, y)

In [100]:
l.backward()

In [101]:
# stats on the gradients
stats(m[0].weight.grad)

(tensor(0.0041), tensor(0.0478))

Let's now verify Kaiming uniform with $a=0$:

In [102]:
init.kaiming_uniform_??

[0;31mSignature:[0m
[0minit[0m[0;34m.[0m[0mkaiming_uniform_[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtensor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0ma[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m=[0m[0;34m'fan_in'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnonlinearity[0m[0;34m=[0m[0;34m'leaky_relu'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mkaiming_uniform_[0m[0;34m([0m[0mtensor[0m[0;34m,[0m [0ma[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mmode[0m[0;34m=[0m[0;34m'fan_in'[0m[0;34m,[0m [0mnonlinearity[0m[0;34m=[0m[0;34m'leaky_relu'[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Fills the input `Tensor` with values according to the method[0m
[0;34m    described in `Delving deep into rectifiers: Surpassing human-level[0m
[0;34m    performance on ImageNet classification` - He, K. et al. (2015), using a[0m
[

In [103]:
# Re-initialise every conv layer of `m` with Kaiming uniform (default a=0) and zero its bias.
# The loop variable is named `layer` so it does not overwrite the loss tensor `l`
# computed in the cells above.
for layer in m:
    if isinstance(layer, nn.Conv2d):
        init.kaiming_uniform_(layer.weight)
        # Conv layers built with bias=False have no bias tensor to reset.
        if layer.bias is not None:
            layer.bias.data.zero_()

In [104]:
t = m(x)
stats(t)

(tensor(-0.6390, grad_fn=<MeanBackward0>),
 tensor(0.2330, grad_fn=<StdBackward0>))

This is better (after directly using `kaiming_uniform_` and not `reset_parameters`).

Let's check the loss:

In [105]:
l = mse(t, y)

In [106]:
# Gradients accumulate across backward() calls. Clear the ones left over from the
# earlier backward pass so the gradient stats below reflect only this loss.
m.zero_grad()
l.backward()

In [107]:
# stats on the gradients
stats(m[0].weight.grad)

(tensor(0.1630), tensor(0.3936))

---