In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor,nn
from fastcore.test import test_close
import torch.nn.functional as F
# Seed torch's RNG so the randomly initialised layer weights are the same on every run.
torch.manual_seed(42)

# Use a grayscale colormap by default for matplotlib images.
mpl.rcParams['image.cmap'] = 'gray'
# Compact printing for tensors and arrays: 2 decimals, wide lines, no scientific notation for tensors.
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

In [2]:
# Location of the gzipped, pickled MNIST archive.
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
# Local directory (relative to the working directory) where the archive is cached;
# exist_ok=True makes mkdir a no-op when the directory is already there.
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

In [3]:
from urllib.request import urlretrieve
# Download the archive once and reuse the cached copy on later runs.
# The data is fetched into a temporary sibling file and renamed into place only
# after the transfer finishes, so an interrupted download cannot leave a truncated
# archive behind that would satisfy the exists() check next time.
if not path_gz.exists():
    path_part = path_gz.with_name(path_gz.name + '.part')
    urlretrieve(MNIST_URL, path_part)
    path_part.replace(path_gz)

In [4]:
# `path_gz` is the archive path defined in the path-setup cell near the top of the notebook;
# it is reused here rather than redefined, so the two cells cannot drift apart.
# NOTE(security): pickle.load can run arbitrary code while loading; only unpickle archives
# obtained from a source you trust.
with gzip.open(path_gz, 'rb') as f:
    # The archive unpacks into three (inputs, labels) pairs; the third pair is ignored here.
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert the four loaded objects to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

In [5]:
# x_train is 2-D: n rows (examples) by m columns (input features per example).
n,m = x_train.shape
# One more than the largest label, i.e. the class count if labels run 0..max.
# Note: this is a 0-dim tensor, not a Python int.
c = y_train.max()+1
# Number of hidden units used when the model is built below.
nh = 50

In [6]:
class Model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs produced per example.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (instead of a plain Python list) registers the layers as
        # submodules, so parameters(), state_dict(), .to() and friends can see them.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])

    def forward(self, x):
        """Pass `x` through each layer in order and return the final output."""
        # Implemented as forward() rather than __call__, so nn.Module.__call__
        # (which dispatches here) keeps running any registered hooks.
        for l in self.layers: x = l(x)
        return x

In [7]:
# Size the output layer from the class count derived from the labels (`c`) instead of a
# hard-coded 10; int() turns the 0-dim tensor into the plain integer nn.Linear expects.
model = Model(m, nh, int(c))
# Forward pass over the whole training set, then show the shape of the result.
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

In [8]:
# IPython help lookup (`obj?`) for math.log: an interactive aid, not part of the computation.
math.log?

Docstring:
log(x, [base=math.e])
Return the logarithm of x to the given base.

If the base is not specified, returns the natural logarithm (base e) of x.
Type:      builtin_function_or_method

In [9]:
# Raw model outputs next to their element-wise exponentials (the numerators of softmax).
pred, pred.exp()

(tensor([[-0.05,  0.03,  0.22,  ..., -0.12, -0.15,  0.20],
         [-0.07,  0.05,  0.21,  ..., -0.16, -0.12,  0.19],
         [-0.04, -0.02,  0.15,  ..., -0.14, -0.19,  0.15],
         ...,
         [ 0.08,  0.08,  0.21,  ..., -0.03, -0.19,  0.16],
         [-0.07,  0.02,  0.14,  ..., -0.06, -0.10,  0.21],
         [-0.07,  0.08,  0.18,  ..., -0.05, -0.13,  0.09]], grad_fn=<AddmmBackward0>),
 tensor([[0.95, 1.03, 1.25,  ..., 0.89, 0.86, 1.22],
         [0.94, 1.05, 1.24,  ..., 0.85, 0.89, 1.21],
         [0.96, 0.98, 1.16,  ..., 0.87, 0.83, 1.16],
         ...,
         [1.08, 1.09, 1.23,  ..., 0.97, 0.83, 1.18],
         [0.93, 1.02, 1.15,  ..., 0.94, 0.90, 1.24],
         [0.93, 1.08, 1.19,  ..., 0.95, 0.88, 1.10]], grad_fn=<ExpBackward0>))

In [10]:
def log_softmax(x):
    """Log of the softmax of `x`, normalising over dim 1.

    Direct textbook form: exponentiate, divide by the per-row total, take the log.
    No maximum is subtracted first, so very large inputs can overflow in exp().
    """
    numer = x.exp()
    denom = numer.sum(dim=1, keepdim=True)  # size-1 axis kept so it broadcasts across each row
    probs = numer / denom
    return probs.log()

In [11]:
# keepdim=True leaves the summed axis in place with size 1, so the row sums broadcast against pred.exp().
pred.exp().sum(dim=1, keepdim=True).shape, pred.exp().shape

(torch.Size([50000, 1]), torch.Size([50000, 10]))

In [12]:
# Apply the hand-written log_softmax to the predictions and show that the shape is preserved.
log_softmax(pred), log_softmax(pred).shape

(tensor([[-2.36, -2.28, -2.09,  ..., -2.43, -2.47, -2.11],
         [-2.37, -2.25, -2.09,  ..., -2.46, -2.43, -2.11],
         [-2.34, -2.31, -2.14,  ..., -2.44, -2.48, -2.14],
         ...,
         [-2.26, -2.25, -2.13,  ..., -2.36, -2.53, -2.17],
         [-2.39, -2.30, -2.18,  ..., -2.38, -2.42, -2.11],
         [-2.40, -2.25, -2.14,  ..., -2.38, -2.45, -2.23]], grad_fn=<LogBackward0>),
 torch.Size([50000, 10]))

In [13]:
def log_softmax_simplified(x):
    """Log-softmax over dim 1 via log(a/b) = log(a) - log(b).

    Because log(exp(x)) is just x, only the log of each row's sum of exponentials
    has to be subtracted. Like the direct form, it does not shift by the maximum.
    """
    row_totals = x.exp().sum(dim=1, keepdim=True)
    log_norm = row_totals.log()
    return x - log_norm

In [14]:
# The simplified form must agree with the direct form.
test_close(log_softmax(pred), log_softmax_simplified(pred))

In [15]:
# max over dim 1 returns a (values, indices) pair; [0] selects the values, whose reduced axis is dropped.
pred.max(dim=1), pred.max(dim=1)[0].shape

(torch.return_types.max(
 values=tensor([0.22, 0.21, 0.15,  ..., 0.21, 0.21, 0.18], grad_fn=<MaxBackward0>),
 indices=tensor([2, 2, 4,  ..., 2, 9, 2])),
 torch.Size([50000]))

In [16]:
# Two ways to get the row maxima as a column: keepdim=True, or unsqueeze the reduced result at dim 1.
pred.max(dim=1, keepdim=True)[0], pred.max(dim=1, keepdim=True)[0].shape, pred.max(dim=1)[0].unsqueeze(dim=1).shape

(tensor([[0.22],
         [0.21],
         [0.15],
         ...,
         [0.21],
         [0.21],
         [0.18]], grad_fn=<MaxBackward0>),
 torch.Size([50000, 1]),
 torch.Size([50000, 1]))

In [17]:
def logsumexp(x):
    """Row-wise log(sum(exp(x))) over dim 1, shifted by each row's maximum.

    Relies on log(sum(exp(x))) = m + log(sum(exp(x - m))) with m the row maximum,
    so the largest value ever passed to exp() is 0.
    """
    row_max = x.max(dim=1, keepdim=True).values
    shifted_total = (x - row_max).exp().sum(dim=1)
    # Drop the kept axis again so the result has one entry per row.
    return row_max.squeeze(dim=1) + shifted_total.log()

In [18]:
# Shape check for the shifted exp-sum-log term: summing over dim 1 removes that axis.
(pred - pred.max(dim=1, keepdim=True)[0]).exp().sum(dim=1).log().shape

torch.Size([50000])

In [19]:
# PyTorch's built-in logsumexp over the last axis, for comparison.
pred.logsumexp(-1).shape

torch.Size([50000])

In [20]:
# The hand-written logsumexp must match the built-in one.
test_close(logsumexp(pred), pred.logsumexp(-1))

In [21]:
def log_softmax_w_sumexp(x):
    """Log-softmax over dim 1, computed as `x` minus its row-wise logsumexp."""
    log_norm = logsumexp(x)        # one value per row
    return x - log_norm[:, None]   # re-add the column axis so it broadcasts across each row

In [22]:
# Check the logsumexp-based version against PyTorch's nn.LogSoftmax applied over dim 1.
test_close(log_softmax_w_sumexp(pred), nn.LogSoftmax(dim=1)(pred))

In [23]:
# Note: despite the name, sm_pred holds log-softmax values, not softmax probabilities.
sm_pred = log_softmax_w_sumexp(pred)
sm_pred

tensor([[-2.36, -2.28, -2.09,  ..., -2.43, -2.47, -2.11],
        [-2.37, -2.25, -2.09,  ..., -2.46, -2.43, -2.11],
        [-2.34, -2.31, -2.14,  ..., -2.44, -2.48, -2.14],
        ...,
        [-2.26, -2.25, -2.13,  ..., -2.36, -2.53, -2.17],
        [-2.39, -2.30, -2.18,  ..., -2.38, -2.42, -2.11],
        [-2.40, -2.25, -2.14,  ..., -2.38, -2.45, -2.23]], grad_fn=<SubBackward0>)

In [24]:
# Labels of the first three training examples.
y_train[:3]

tensor([5, 0, 4])

In [25]:
# Pick by hand the log-probability that each of the first three rows assigns to its own label (5, 0, 4).
sm_pred[0,5],sm_pred[1,0],sm_pred[2,4]

(tensor(-2.40, grad_fn=<SelectBackward0>),
 tensor(-2.37, grad_fn=<SelectBackward0>),
 tensor(-2.14, grad_fn=<SelectBackward0>))

In [26]:
# Same selection in one step: a list of row indices paired element-wise with a tensor of column indices.
sm_pred[[0,1,2], y_train[:3]]

tensor([-2.40, -2.37, -2.14], grad_fn=<IndexBackward0>)

In [27]:
# Compare shapes: sm_pred has one row per entry of y_train.
sm_pred.shape, y_train.shape

(torch.Size([50000, 10]), torch.Size([50000]))

In [28]:
# One-hot encode the labels; with no num_classes given, the width is inferred from the largest label present.
F.one_hot(y_train).shape, F.one_hot(y_train).T.shape

(torch.Size([50000, 10]), torch.Size([10, 50000]))

In [29]:
# Transposed one-hot matrix as floats: row k has a 1 in the columns of the examples labelled k.
F.one_hot(y_train).T.float()

tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [30]:
# `:` combined with a length-50000 index tensor selects that many columns from *every* row.
# NOTE: the indexed result is 50000 x 50000 (2.5e9 elements, about 10 GB at 4 bytes each),
# which is a heavy allocation just to read a shape.
sm_pred.shape, sm_pred[:, torch.ones(50000).int()].shape

(torch.Size([50000, 10]), torch.Size([50000, 50000]))

In [31]:
def nll(pred, y):
    """Mean negative log-likelihood computed with a one-hot mask.

    Args:
        pred: 2-D tensor, expected to hold log-probabilities, with one row per
            example and one column per class.
        y: integer class labels, one per row of `pred`.

    Returns:
        A 0-dim tensor: minus the average of the `pred` entries at the true labels.
    """
    # Take the class count from `pred`. Letting one_hot infer it from y.max() gives
    # too few columns whenever the highest class is absent from `y` (for example in
    # a small mini-batch), and the multiplication below then fails to broadcast.
    targets = F.one_hot(y, num_classes=pred.shape[-1]).float()
    return -(pred * targets).sum()/y.shape[0]

In [32]:
# Loss of the model on the full training set, using the hand-written nll.
loss = nll(sm_pred, y_train)
loss

tensor(2.30, grad_fn=<DivBackward0>)

In [33]:
# The loss is a 0-dim (scalar) tensor.
loss.shape

torch.Size([])

In [34]:
# Compare with PyTorch's F.log_softmax followed by F.nll_loss on the same predictions.
loss, F.nll_loss(F.log_softmax(pred, -1), y_train)

(tensor(2.30, grad_fn=<DivBackward0>),
 tensor(2.30, grad_fn=<NllLossBackward0>))

In [35]:
# Assert agreement with the PyTorch result; 1e-3 is passed as test_close's third argument (presumably the tolerance).
test_close(F.nll_loss(F.log_softmax(pred, -1), y_train), loss, 1e-3)

In [36]:
# Ingredients for the indexing version: a row index for every example, and the labels as column indices.
range(y_train.shape[0]), y_train

(range(0, 50000), tensor([5, 0, 4,  ..., 8, 4, 8]))

In [37]:
def nll_via_indexing(pred, y):
    """Mean negative log-likelihood, reading each row's entry at its label by paired indexing."""
    row_idx = range(y.shape[0])
    chosen = pred[row_idx, y]   # element i is pred[i, y[i]]
    return -chosen.mean()

In [38]:
# The one-hot and the indexing versions must give the same loss.
test_close(nll(sm_pred, y_train), nll_via_indexing(sm_pred, y_train))

In [39]:
# F.cross_entropy takes the raw predictions (it applies log-softmax itself) and should match the loss above.
test_close(F.cross_entropy(pred, y_train), loss, 1e-3)

In [40]:
# `:` with y_train picks 50000 columns from every row; range(...) with y_train pairs the indices and picks one entry per row.
# NOTE: the first expression materialises a 50000 x 50000 tensor (2.5e9 elements) just to report its shape.
sm_pred[:, y_train].shape, sm_pred[range(sm_pred.shape[0]), y_train].shape

(torch.Size([50000, 50000]), torch.Size([50000]))

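The second form above needs two index sequences of equal length to trigger paired ("zipped") indexing, which picks one element per row: `sm_pred[i, y_train[i]]`. Using `:` for the rows instead keeps every row and applies the whole index tensor along the columns, which is why the first form has shape `[50000, 50000]`.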

In [41]:
# Use PyTorch's built-in cross-entropy as the loss function in the cells below.
loss_func = F.cross_entropy

In [42]:
# Run the model on the first mini-batch of the training set.
bs=50                  # batch size

xb = x_train[0:bs]     # a mini-batch from x
preds = model(xb)      # predictions
# Show the outputs for the first example and the shape of the whole batch of outputs.
preds[0], preds.shape

(tensor([-0.05,  0.03,  0.22,  0.02,  0.00, -0.09, -0.04, -0.12, -0.15,  0.20], grad_fn=<SelectBackward0>),
 torch.Size([50, 10]))

In [43]:
# Labels for the same mini-batch (the first bs training examples).
yb = y_train[0:bs]
yb

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1, 1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7,
        6, 1, 8, 7, 9, 3, 9, 8, 5, 9, 3])

In [44]:
# Loss on the mini-batch.
loss_func(preds, yb)

tensor(2.28, grad_fn=<NllLossBackward0>)