In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F

In [2]:
from fastcore.test import test_close

In [3]:
# Seed PyTorch's RNG so the randomly initialised weights are the same on every run
torch.manual_seed(1)
# Compact tensor printing: 2 decimals, wide lines, no scientific notation
torch.set_printoptions(linewidth=140, precision=2, sci_mode=False)
# Render images in grayscale by default
mpl.rcParams['image.cmap'] = 'gray'

In [4]:
path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
# NOTE: unpickling can execute arbitrary code, so only load a trusted copy of this file.
# The pickle holds three (x, y) pairs; the third one is discarded here.
with gzip.open(path_gz, 'rb') as f:
    (x_train, y_train), (x_valid, y_valid), _ = pickle.load(f, encoding='latin-1')
# Convert each loaded array to a torch tensor
x_train, y_train, x_valid, y_valid = (tensor(arr) for arr in (x_train, y_train, x_valid, y_valid))

## Initial Setup
### Data

In [5]:
# n = number of training rows, m = number of columns (inputs per row) in x_train
n, m = x_train.shape
# Largest label + 1; this is a 0-d tensor, not a Python int.
# Equals the class count assuming labels are consecutive integers starting at 0 -- TODO confirm
c = y_train.max() + 1
# Width of the hidden layer
nh=50
n,m,c, nh

(50000, 784, tensor(10), 50)

In [6]:
class Model(nn.Module):
    "Small MLP: Linear(n_in, nh) -> ReLU -> Linear(nh, n_out)."
    def __init__(self, n_in, nh, n_out):
        """
        n_in:  number of input features
        nh:    number of hidden units
        n_out: number of output activations
        """
        super().__init__()
        # Use nn.ModuleList rather than a plain Python list: a plain list is invisible to
        # nn.Module, so `parameters()`, `state_dict()` and `.to(device)` would skip the layers.
        self.layers = nn.ModuleList([nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)])

    # Implement `forward` instead of overriding `__call__`: nn.Module.__call__ dispatches to
    # `forward` and also runs registered hooks, so `model(x)` keeps working for callers.
    def forward(self, x):
        "Feed `x` through every layer in order and return the final activations."
        for l in self.layers: x = l(x)
        return x

In [7]:
# Build the network with m inputs, nh hidden units and 10 outputs
model = Model(m, nh, 10)
# Single forward pass over all of x_train
pred = model(x_train)
pred

tensor([[-0.09, -0.21, -0.08,  ..., -0.03,  0.01,  0.06],
        [-0.07, -0.14, -0.14,  ...,  0.03,  0.04,  0.14],
        [-0.19, -0.04,  0.02,  ..., -0.01, -0.00,  0.02],
        ...,
        [-0.03, -0.22, -0.04,  ..., -0.01,  0.09,  0.14],
        [-0.10, -0.09, -0.05,  ..., -0.01,  0.02,  0.11],
        [-0.03, -0.25, -0.06,  ...,  0.00,  0.03,  0.14]], grad_fn=<AddmmBackward0>)

## Cross Entropy Loss
First, we compute the softmax of the activations, i.e. the softmax of the model's output.\
Softmax for each activation: $\frac{e^{\text{activation}}}{\sum{e^{\text{all activations}}}}$
$$\sigma(z_i) = \frac{e^{z_{i}}}{\sum_{j=1}^K e^{z_{j}}} \ \ \ for\ i=1,2,\dots,K$$


In practice, we will need the log of the softmax when we calculate the loss in order to get the cross entropy loss.
Cross entropy loss function: $$-\sum_{c=1}^{M} y_{o,c}\log(\sigma(z_c))$$ i.e. the sum over the $M$ classes of the target $y_{o,c}$ (for observation $o$) times the log of the softmax for class $c$.

In [13]:
def log_softmax(x):
    "Naive log-softmax over the last dim: exponentiate, divide by the row sum, then take the log."
    exps = x.exp()  # torch.exp() => e^x
    return (exps / exps.sum(-1, keepdim=True)).log()

In [14]:
log_softmax(pred)

tensor([[-2.37, -2.49, -2.36,  ..., -2.31, -2.28, -2.22],
        [-2.37, -2.44, -2.44,  ..., -2.27, -2.26, -2.16],
        [-2.48, -2.33, -2.28,  ..., -2.30, -2.30, -2.27],
        ...,
        [-2.33, -2.52, -2.34,  ..., -2.31, -2.21, -2.16],
        [-2.38, -2.38, -2.33,  ..., -2.29, -2.26, -2.17],
        [-2.33, -2.55, -2.36,  ..., -2.29, -2.27, -2.16]], grad_fn=<LogBackward0>)

We know that $\log(\frac{a}{b}) = \log(a) - \log(b)$. This can be used to simplify the log softmax.

In [16]:
# Applying log(a/b) = log(a) - log(b) to the naive version gives
#   log_softmax(x) = x.exp().log() - (x.exp().sum(-1, keepdim=True)).log()
# and since log(e^x) = x, the first term is just x.
def log_softmax(x):
    "Log-softmax over the last dim, written as `x` minus the log of the summed exponentials."
    log_denom = x.exp().sum(-1, keepdim=True).log()
    return x - log_denom

Then, there is a way to compute the log of the sum of exponentials in a more stable way, called the [LogSumExp trick](https://en.wikipedia.org/wiki/LogSumExp). The idea is to use the following formula:

$$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$

where a is the maximum of $x_j$

In [17]:
def logsumexp(x):
    """Numerically stable `log(sum(exp(x)))` over the last dimension.

    Uses the LogSumExp trick: subtract the per-row maximum `a` before exponentiating,
    then add it back, i.e. `a + log(sum(exp(x - a)))`.

    x: tensor of any rank >= 1; the reduction runs over its last dimension.
    Returns a tensor with the last dimension removed.
    """
    # keepdim=True lets the max broadcast against `x` whatever its rank
    # (indexing the max with `[:, None]` only lines up for 2-D input).
    a = x.max(-1, keepdim=True)[0]
    return a.squeeze(-1) + (x - a).exp().sum(-1).log()

This way, we avoid an overflow when taking the exponential of a big activation. In PyTorch, this is already implemented for us.

In [19]:
def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)

In [21]:
# Comparing custom version with pytorch implementation
test_close(logsumexp(pred), pred.logsumexp(-1))

In [23]:
sm_pred = log_softmax(pred)
sm_pred, sm_pred.shape

(tensor([[-2.37, -2.49, -2.36,  ..., -2.31, -2.28, -2.22],
         [-2.37, -2.44, -2.44,  ..., -2.27, -2.26, -2.16],
         [-2.48, -2.33, -2.28,  ..., -2.30, -2.30, -2.27],
         ...,
         [-2.33, -2.52, -2.34,  ..., -2.31, -2.21, -2.16],
         [-2.38, -2.38, -2.33,  ..., -2.29, -2.26, -2.17],
         [-2.33, -2.55, -2.36,  ..., -2.29, -2.27, -2.16]], grad_fn=<SubBackward0>),
 torch.Size([50000, 10]))

The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:

$$ -\sum x\, \log p(x) $$

But since our $x$s are 1-hot encoded (actually, they're just the integer indices), this can be rewritten as $-\log(p_{i})$ where i is the index of the desired target.

This can be done using numpy-style [integer array indexing](https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html#integer-array-indexing). Note that PyTorch supports all the tricks in the advanced indexing methods discussed in that link.

In [24]:
y_train[:3]

tensor([5, 0, 4])

In [26]:
# Getting the prediction index
sm_pred[0, 5], sm_pred[1, 0], sm_pred[2, 4]

(tensor(-2.20, grad_fn=<SelectBackward0>),
 tensor(-2.37, grad_fn=<SelectBackward0>),
 tensor(-2.36, grad_fn=<SelectBackward0>))

In [27]:
sm_pred[[0,1,2], y_train[:3]]

tensor([-2.20, -2.37, -2.36], grad_fn=<IndexBackward0>)

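#### Negative log likelihood loss
With one-hot encoded targets, the loss for each sample reduces to $-\log(p_{i})$, where $i$ is the index of the target class; we then take the mean over all samples.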

In [34]:
def nll(inp, target):
    "Negative log likelihood: mean of `-inp[i, target[i]]` over all rows `i`."
    rows = range(target.shape[0])
    # Integer array indexing: picks, for each row, the entry at that row's target index
    picked = inp[rows, target]
    return -picked.mean()

In [35]:
sm_pred.shape, y_train.shape

(torch.Size([50000, 10]), torch.Size([50000]))

In [36]:
loss = nll(sm_pred, y_train)
loss

tensor(2.30, grad_fn=<NegBackward0>)

In [37]:
test_close(F.nll_loss(F.log_softmax(pred, -1), y_train), loss, 1e-3)

In [38]:
test_close(F.cross_entropy(pred, y_train), loss, 1e-3)