# Softmax Classifier

In [15]:
import numpy as np
import torch
import torchvision
from torchvision import transforms
from torch.autograd import Variable
from torch.utils.data import DataLoader

The Cross Entropy function can compute the loss between two probability distribution, in the case of MNIST, 10-element wide probabilities. between the predicted distributions and the transformed label to one-hot vectors.

The formula of Cross Entropy is as follows:

$$D(\hat{Y}, Y) = -Y * log(\hat{Y})$$

### Cross-Entropy: Numpy Example

In [2]:
# the target.
Y = np.array([1., 0., 0.])

In [3]:
# now let's create fake preds.
Y_pred1 = np.array([.7, .2, .1])
Y_pred2 = np.array([.1, .3, .6])

In [4]:
# let's calculate the loss of the two measurements using cross-entropy.
print('Loss 1: ', np.sum(-Y * np.log(Y_pred1)))

Loss 1:  0.35667494393873245


In [5]:
print('Loss 2: ', np.sum(-Y * np.log(Y_pred2)))

Loss 2:  2.3025850929940455


### Cross-entropy: PyTorch Example

In [6]:
loss = torch.nn.CrossEntropyLoss()

In [7]:
# target is of size nbatch.
# each element in target has to have 0 <= value < nClasses (0-2).
# Input is class not one-hot.
Y = Variable(torch.LongTensor([0]), requires_grad=False)

In [8]:
# input is of size nBatch x nClasses = 1 x 3
# Y_pred are logits (not softmax).
Y_pred1 = Variable(torch.Tensor([[2., 1., .1]]))
Y_pred2 = Variable(torch.Tensor([[.5, 2., .3]]))

In [9]:
l1 = loss(Y_pred1, Y); l1

Variable containing:
 0.4170
[torch.FloatTensor of size 1]

In [10]:
l2 = loss(Y_pred2, Y); l2

Variable containing:
 1.8406
[torch.FloatTensor of size 1]

## MNIST
<img src="MNIST.png" />

### DataLoader

In [45]:
# first we download the MNIST Dataset.
mnist_train = torchvision.datasets.MNIST(root='./data', 
                                         train=True, 
                                         download=True, 
                                         transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((.1307,), (.3081,))]))
mnist_test  = torchvision.datasets.MNIST(root='./data', 
                                         train=False, 
                                         transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((.1307,), (.3081,))]))

In [46]:
# then we load it.
train_loader = DataLoader(dataset=mnist_train, batch_size=128, shuffle=True)
test_loader  = DataLoader(dataset=mnist_test, batch_size=128, shuffle=False)

### The Model

We'll implement this Neural Network Architecture:

<img src="MNIST_NN.png" />

In [47]:
class MNISTClassifier(torch.nn.Module):
    '''
    Classifies the MNIST handwritten digits into their labels.
    Input: 28*28 = 784 Pixel values.
    Output: 10 Probabilities corresponding to the ten labels (0,1,2,3,4,5,6,7,8,9).
    '''
    
    def __init__(self):
        '''
        We construct the architecture of the neural network.
        '''
        super(MNISTClassifier, self).__init__()
        
        # we create the hidden layers.
        self.l1 = torch.nn.Linear(in_features=784, out_features=520)
        self.l2 = torch.nn.Linear(in_features=520, out_features=320)
        self.l3 = torch.nn.Linear(in_features=320, out_features=240)
        self.l4 = torch.nn.Linear(in_features=240, out_features=120)
        self.l5 = torch.nn.Linear(in_features=120, out_features=10)
            
        # we'll use ReLu as an activation function for all layers except the last one.
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        '''
        The Forward process.
        '''
        # before anything, we need to flatten the data.
        x = x.view(-1, 784)
        # we start with the first one because we need x.
        x = self.relu(self.l1(x))
        x = self.relu(self.l2(x))
        x = self.relu(self.l3(x))
        x = self.relu(self.l4(x))
        return self.l5(x)

In [48]:
model = MNISTClassifier()

In [49]:
# let's take a look at the model's components.
model

MNISTClassifier(
  (l1): Linear(in_features=784, out_features=520, bias=True)
  (l2): Linear(in_features=520, out_features=320, bias=True)
  (l3): Linear(in_features=320, out_features=240, bias=True)
  (l4): Linear(in_features=240, out_features=120, bias=True)
  (l5): Linear(in_features=120, out_features=10, bias=True)
  (relu): ReLU()
)

### Loss & Optimizer Definition

In [50]:
criterion = torch.nn.CrossEntropyLoss()

In [51]:
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01, momentum=0.5)

### Training

In [61]:
def train(epochs):
    # set the model in training mode.
    model.train()
    
    for epoch in range(epochs):
        # let's loop over the train_loader batches.
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, 
                                                                               batch_idx * len(data), 
                                                                               len(train_loader.dataset), 
                                                                               100. * batch_idx / len(train_loader), 
                                                                               loss.data[0]))

In [62]:
train(1)



After Normalizing and Scaling the Data points, the loss function decreased quite quickly, this is an indication that your should properly scale your values.

And to validate our data, we need to calculate the accuracy over the validation dataset:

In [63]:
def validate():
    
    # sets the model in evaluation mode.
    model.eval()
    
    test_loss = 0
    correct = 0
    
    for data, target in test_loader:
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        
        # sum up batch loss.
        test_loss += criterion(output, target).data[0]
        # get the index of the max log probability.
        pred = torch.max(output.data, 1)[1]
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()
    
    test_loss /= len(test_loader.dataset)
    print('\nValidation set Loss: {:.4f}, accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, 
                                                                              correct, 
                                                                              len(test_loader.dataset), 
                                                                              100. * correct / len(test_loader.dataset)))

In [64]:
validate()


Validation set Loss: 0.0025, accuracy: 9068/10000 (91%)



## Exercice
Build a softmax classifier for the [Otto Group Product](https://www.kaggle.com/c/otto-group-product-classification-challenge) and Use the DataLoader.