In [2]:
# Standard library
import gzip, math, os, pickle, shutil, time
from pathlib import Path

# Third-party
import matplotlib as mpl
# `plt` must alias the pyplot submodule; aliasing the top-level `matplotlib`
# package (as before) leaves plt.plot / plt.imshow / plt.subplots undefined.
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from torch import tensor, nn

In [3]:
from fastcore.test import test_close

In [4]:
# Compact tensor reprs: 2 decimal places, 140-character lines, no scientific notation.
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
# Seed PyTorch's global RNG so random draws made later in the notebook repeat across runs.
torch.manual_seed(1)
# Make grayscale the default colormap for matplotlib images.
mpl.rcParams['image.cmap'] = 'gray'

In [5]:
# Location of the gzipped MNIST pickle, relative to this notebook.
path_data = Path('../data')
path_gz = path_data/'mnist.pkl.gz'
# NOTE(review): pickle.load can execute arbitrary code -- only run this on a trusted copy of the file.
# The pickle unpacks into three items; the first two are (inputs, labels) pairs for the
# training and validation splits, and the third is discarded here.
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert the four loaded objects to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

# Initial Setup
Data

In [6]:
# n = number of training rows, m = number of input features per row.
n, m = x_train.shape
# Number of classes, taken as (largest label + 1) -- presumably labels start at 0.
# This is a 0-dim tensor rather than a Python int.
c = y_train.max() + 1
# Width of the hidden layer.
nh = 50
n, m, c, nh

(50000, 784, tensor(10), 50)

In [7]:
class Model(nn.Module):
    """Small feed-forward network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of output activations.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (not a plain Python list) registers every layer as a submodule,
        # so parameters(), state_dict(), .to(device) and train()/eval() can all see them.
        self.layers = nn.ModuleList([nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)])

    def forward(self, x):
        "Feed `x` through each layer in order and return the final activations."
        # Implementing forward() instead of overriding __call__ keeps nn.Module's
        # hook machinery working; `model(x)` still dispatches here.
        for layer in self.layers: x = layer(x)
        return x

In [8]:
# Build the network with 10 outputs and run a forward pass over the entire training set.
model = Model(m, nh, 10)
pred = model(x_train)
# Show the raw output activations and their shape.
pred, pred.shape

(tensor([[-0.09, -0.21, -0.08,  ..., -0.03,  0.01,  0.06],
         [-0.07, -0.14, -0.14,  ...,  0.03,  0.04,  0.14],
         [-0.19, -0.04,  0.02,  ..., -0.01, -0.00,  0.02],
         ...,
         [-0.03, -0.22, -0.04,  ..., -0.01,  0.09,  0.14],
         [-0.10, -0.09, -0.05,  ..., -0.01,  0.02,  0.11],
         [-0.03, -0.25, -0.06,  ...,  0.00,  0.03,  0.14]], grad_fn=<AddmmBackward0>),
 torch.Size([50000, 10]))

## Cross Entropy Loss
First, we compute the softmax of the model's output activations and take its log (the log-softmax).

In [15]:
def log_softmax(x):
    "Naive log-softmax over the last dimension: log(exp(x) / sum(exp(x)))."
    # Tensor.exp() computes e^x element-wise.
    exps = x.exp()
    probs = exps / exps.sum(-1, keepdim=True)
    return probs.log()

In [16]:
def log_softmax(x):
    "Log-softmax over the last dimension, simplified with log(a/b) = log(a) - log(b)."
    # log(exp(x) / sum(exp(x))) = log(exp(x)) - log(sum(exp(x))) = x - log(sum(exp(x)))
    log_denominator = x.exp().sum(-1, keepdim=True).log()
    return x - log_denominator

In [23]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    Uses the identity log(sum(exp(x))) = a + log(sum(exp(x - a))) with `a` set to the
    maximum along the last dimension, so the largest exponent is exp(0) = 1 and cannot overflow.

    Works for input of any rank >= 1; the last dimension is reduced away in the result.
    """
    # keepdim=True lets the max broadcast against `x` whatever the rank of `x` is.
    row_max = x.max(-1, keepdim=True).values
    # If the max is +/-inf, `x - row_max` would yield nan; shift by 0 there instead so the
    # result is the correct +/-inf.
    shift = torch.where(row_max.isfinite(), row_max, torch.zeros_like(row_max))
    return shift.squeeze(-1) + (x - shift).exp().sum(-1).log()

In [28]:
def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)

In [32]:
y_train[:3]  # Ground-truth labels of the first three training samples

tensor([5, 0, 4])

In [30]:
# Turn the raw activations into log-probabilities (one row per sample, one column per output).
sm_pred = log_softmax(pred)
sm_pred, sm_pred.shape

(tensor([[-2.37, -2.49, -2.36,  ..., -2.31, -2.28, -2.22],
         [-2.37, -2.44, -2.44,  ..., -2.27, -2.26, -2.16],
         [-2.48, -2.33, -2.28,  ..., -2.30, -2.30, -2.27],
         ...,
         [-2.33, -2.52, -2.34,  ..., -2.31, -2.21, -2.16],
         [-2.38, -2.38, -2.33,  ..., -2.29, -2.26, -2.17],
         [-2.33, -2.55, -2.36,  ..., -2.29, -2.27, -2.16]], grad_fn=<SubBackward0>),
 torch.Size([50000, 10]))

In [31]:
# Look up, by hand, the log-probability the model assigns to the true class of each of
# the first three samples (their labels are 5, 0 and 4).
sm_pred[0, 5], sm_pred[1, 0], sm_pred[2, 4]

(tensor(-2.20, grad_fn=<SelectBackward0>),
 tensor(-2.37, grad_fn=<SelectBackward0>),
 tensor(-2.36, grad_fn=<SelectBackward0>))

In [36]:
# Same lookup in one step with integer-array indexing: row i is paired with column y_train[i].
sm_pred[[0,1,2], y_train[:3]]

tensor([-2.20, -2.37, -2.36], grad_fn=<IndexBackward0>)

#### Negative log-likelihood loss
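With one-hot encoded targets, the cross-entropy loss for a sample reduces to $-\log(p_{i})$, where $p_{i}$ is the predicted probability of the correct class $i$.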

In [37]:
def nll(inp, target):
    """Mean negative log-likelihood.

    `inp` holds one row of per-class log-probabilities per sample; `target` holds the
    index of the correct class for each row.
    """
    row_idx = range(target.shape[0])
    # Pick, for every row, the log-probability of its target class.
    picked = inp[row_idx, target]
    return -picked.mean()

In [38]:
# Loss of the untrained model over the full training set. With 10 classes, near-uniform
# predictions give about ln(10) ~= 2.30.
loss = nll(sm_pred, y_train)
loss

tensor(2.30, grad_fn=<NegBackward0>)

In [39]:
# Sanity check: the hand-written log_softmax + nll pipeline agrees with PyTorch's built-ins.
test_close(F.nll_loss(F.log_softmax(pred, -1), y_train), loss, 1e-3)

In [40]:
# F.cross_entropy combines log_softmax and nll_loss in one call, so it should give the same loss.
test_close(F.cross_entropy(pred, y_train), loss, 1e-3)