In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [4]:
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor, nn 
import torch.nn.functional as F

In [7]:
MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'
def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

def normalize(x, m, s): return (x-m)/s

def stats(x): return x.mean(),x.std()


In [8]:
#MNIST datasetup
mpl.rcParams['image.cmap'] = 'gray'
x_train,y_train,x_valid,y_valid = get_data()

n,m = x_train.shape
c = y_train.max()+1
number_hid = 50

In [16]:
class Model(nn.Module): #simple 3 layer Model
    def __init__(self, num_inputs, num_hidden, num_outputs):
            super().__init__() #initalize nn.Module
            self.layers = [nn.Linear(num_inputs, num_hidden),
                           nn.ReLU(),
                           nn.Linear(num_hidden, num_outputs)
                          ]
    def __call__(self, x):
        for l in self.layers:
            x = l(x)
        return x

In [17]:
model = Model(m, number_hid, 10)
pred = model(x_train)


$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum_{0 \leq j \leq n-1} e^{x_{j}}}$$ 


$$\hbox{Cross Entropy Loss}$$ $$-\sum x\, \log p(x)$$

In [33]:
def log_softmax_standard(x): 
    return (x.exp()/(x.exp().sum(-1, keepdim=True))).log()

def log_softmax_simplified(x):
    return x - x.exp().sum(-1, keepdim=True).log()

#pytorch one-hot encoded Negative-Log Likelyhood
#indexs by actuals, using numpy integer array indexing over the number of rows
def one_hot_encoded_nll(input, target):
    return -input[range(target.shape[0]), target].mean()
    

In [34]:
sm_pred = log_softmax_simplified(pred)

loss = one_hot_encoded_nll(sm_pred, y_train)

In [35]:
print("Loss of model: %f"%(loss))

Loss of model: 2.295798


$$\hbox{Implementing the LogSumExp Trick}$$
$$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$

In [47]:
#this allows us to find our largest input, subtract it from the other inputs, and then add it back  in
#this gives us the same result without worrying about e^(x) overflowing
def logsumexp(x):
    m = x.max(-1)[0]
    return m + (x-m[:, None]).exp().sum(-1).log()

def log_softmax(x):
    return x - x.logsumexp(-1, keepdim=True)

#using softmax with nll
loss = one_hot_encoded_nll(log_softmax(pred), y_train)
pytorch_loss = F.nll_loss(F.log_softmax(pred, -1), y_train)
print("Loss: %f\nPytorch Loss: %f"%(loss,pytorch_loss))

Loss: 2.295799
Pytorch Loss: 2.295807


In [59]:
def accuracy(out, yb):
    return (torch.argmax(out, dim=1) == yb).float().mean()

loss_function = F.cross_entropy
batch_size = 64
learning_rate = 0.5
num_epochs = 1



x_mini_batch = x_train[0:batch_size]
preds = model(mini_batch)
preds[0], preds.shape #64 batchsize * 10 categories

y_mini_batch = y_train[0:batch_size]
loss_function(preds, yb)
accuracy(preds, yb) #terrible, which is expected for an untrained model randomly guessing

tensor(0.9688)

----
a basic training loop

In [60]:
for epoch in range(num_epochs):
    for i in range((n-1)//batch_size +1):
        start_index = i *batch_size
        xmb = x_train[start_index: start_index + batch_size]
        ymb = y_train[start_index: start_index + batch_size]
        predictions = model(xmb)
        loss = loss_function(predictions, ymb)
        loss.backward() #calculate the gradients
        with torch.no_grad():
            for l in model.layers: #only updating layers with parameters, ie: not ReLUs
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * learning_rate
                    l.bias -= l.bias.grad * learning_rate
                    #zero the matricies' gradients
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()
loss_function(model(mini_batch), y_mini_batch), accuracy(model(x_mini_batch), y_mini_batch)

(tensor(0.0906, grad_fn=<NllLossBackward>), tensor(0.9844))

---
Implementing Pytorch model.Parameters

adding capability to store list of attributes in our modules
```
class Module():
    def __init__(self, num_inputs, num_hidden, num_outputs):
        self._modules = {} #empty set to store our modules in
        self.layer1 = nn.Linear(num_inputs, num_hidden)
        self.layer2 = nn.Linear(num_hidden, num_outputs) #store layers in a list, this is inflexible
        
    def __setattr__(self, k,v):
        if not k.startswith("_"):
            self._modules[k] = v
            super().__setattr__(k,v) #using python object's __setattr__

    def __repr__(self):
        return f'{self._modules}'

    def parameters(self):
        for l in self._modules.values():
            for p in l.parameters():
                yield p
```

In [85]:
#plain example
class DumberModel(nn.Module):
    def __init__(self, num_inputs, num_hidden, num_outputs):
        super().__init__()
        self.layer1 = nn.Linear(num_inputs, num_hidden)
        self.layer2 = nn.Linear(num_hidden, num_outputs)
    def __call__(self, x):
        return self.layer2(F.relu(self.layer1(x)))

class DumbModel(nn.Module):
    def __init__(self,layers):
        super().__init__()
        self.layers = layers
        for i, l in enumerate(self.layers): #add each layer as a Pytorch Module
            self.add_module(f'layer_{i}', l) #name_index and the layer itself
    def __call__(self, x):
        return self.layer2(F.relu(self.layer1(x)))

    
#sequential model example
class SequentialModel(nn.Module): #implementation of nn.Sequential
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)
    
    def __call__(self, x):
        for l in self.layers:
            x = l(x)
        return x

In [81]:
def fit():
    for epoch in range(num_epochs):
        for i in range((n-1)//batch_size +1):
            start_index = i * batch_size
            x_mb = x_train[start_index:start_index + batch_size]
            y_mb = y_train[start_index:start_index + batch_size]
            predictions = model(x_mb)
            loss = loss_function(predictions, y_mb)
            
            loss.backward()
            with torch.no_grad():
                for parameter in model.parameters():
                    parameter -= parameter.grad * learning_rate
                model.zero_grad()

---
# Using the Sequential Layer Model

In [72]:
num_categories = 10
layers = [nn.Linear(m, number_hid), nn.ReLU(), nn.Linear(number_hid, num_categories)]
model = SequentialModel(layers)
print(model)

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)


In [84]:
loss_function = F.cross_entropy
batch_size = 64
learning_rate = 0.5
num_epochs = 1
fit()
loss_function(model(x_mini_batch), y_mini_batch), accuracy(model(x_mini_batch), y_mini_batch)

(tensor(0.1041, grad_fn=<NllLossBackward>), tensor(0.9531))

# Using Pytorch's nn.Sequential

In [87]:
model = nn.Sequential(nn.Linear(m, number_hid), nn.ReLU(), nn.Linear(number_hid, num_categories))
fit()
loss_function(model(x_mini_batch), y_mini_batch), accuracy(model(x_mini_batch), y_mini_batch)

(tensor(0.1571, grad_fn=<NllLossBackward>), tensor(0.9375))