# Softmax & Cross-Entropy

In [1]:
import torch
import torch.nn as nn
import numpy as np

## Softmax

Softmax applies the exponential function to each element and normalizes by dividing by the sum of all these exponentials. This squashes every output into the range (0, 1), so each value can be read as a probability, and all of the outputs sum to 1.

$S(y_i) = {\LARGE\frac{e^{y_i}}{\sum_j e^{y_j}}}$

#### NumPy

In [2]:
def softmax(x, axis=0):
    """Return the softmax of ``x`` along ``axis``, computed in a stable way.

    Each element is exponentiated and divided by the sum of the exponentials
    along ``axis``, so the values along that axis are non-negative and sum
    to 1.

    Args:
        x: Array-like of scores (logits).
        axis: Axis along which to normalize. Defaults to 0, which matches the
            previous hard-coded behavior.

    Returns:
        A NumPy array with the same shape as ``x``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; np.max would raise on an empty array.
        return np.exp(x)
    # softmax(x) == softmax(x - c) for any constant c. Shifting by the max
    # makes the largest exponent exp(0) = 1, so np.exp cannot overflow to inf
    # (which would otherwise yield inf / inf = nan for large scores).
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [3]:
# Three raw scores, normalized with the NumPy softmax defined in this notebook.
x = np.array((2.0, 1.0, 0.1))
outputs = softmax(x)
print(f"Softmax of {x} is {outputs}")

Softmax of [2.  1.  0.1] is [0.65900114 0.24243297 0.09856589]


#### PyTorch

In [4]:
# Same scores as a tensor; dim=0 normalizes over the tensor's only axis.
x = torch.tensor([2.0, 1.0, 0.1])
outputs = x.softmax(dim=0)
print(f"Softmax of {x} is {outputs}")

Softmax of tensor([2.0000, 1.0000, 0.1000]) is tensor([0.6590, 0.2424, 0.0986])


## Cross-Entropy
Cross-entropy loss, or log loss, measures the performance of a classification model whose output is a probability value between 0 and 1 -> loss increases as the predicted probability diverges from the actual label.

<img src="images/6.jpg" width=600>

#### NumPy

In [5]:
def cross_entropy(actual, predicted, eps=1e-15):
    """Return the summed cross-entropy between targets and probabilities.

    Computes ``-sum(actual * log(predicted))`` over all elements. The result
    is a sum, not a mean; divide by the number of samples yourself if an
    average is needed.

    Args:
        actual: Array-like of target weights (e.g. a one-hot encoded label).
            Must broadcast against ``predicted``.
        predicted: Array-like of predicted probabilities for each class.
        eps: Lower bound applied to ``predicted`` before taking the log.

    Returns:
        The loss as a NumPy floating-point scalar.
    """
    actual = np.asarray(actual)
    # Floor the probabilities so log() never sees 0. Without this, a predicted
    # probability of exactly 0 gives 0 * log(0) = 0 * -inf = nan for the
    # non-target classes, and inf for the target class.
    safe_predicted = np.maximum(np.asarray(predicted), eps)
    return -np.sum(actual * np.log(safe_predicted))

# The target is one-hot encoded: the true class gets a 1, every other class 0.
# Here the true class is 0, so the single target row is [1, 0, 0].
y = np.array([[1, 0, 0]])

# Predicted per-class probabilities: one close to the target, one far from it.
y_pred_good = np.array([0.7, 0.2, 0.1])
y_pred_bad = np.array([0.1, 0.3, 0.6])

# Score both predictions against the same target.
l1, l2 = (cross_entropy(y, probs) for probs in (y_pred_good, y_pred_bad))
print(f"Loss1 numpy: {l1}")
print(f"Loss2 numpy: {l2}")

Loss1 numpy: 0.35667494393873245
Loss2 numpy: 2.3025850929940455


#### PyTorch
nn.CrossEntropyLoss applies nn.LogSoftmax + nn.NLLLoss (Negative Log Likelihood Loss), thus no softmax is required in the last layer!  
Y has class labels, not One-Hot!  
Y_pred has raw scores (logits), no Softmax!

In [6]:
# Criterion that takes raw logits plus class-index targets.
loss = nn.CrossEntropyLoss()

# Target holds the class index for the single sample (not a one-hot vector).
y = torch.tensor([0])
# Logits with shape (n_samples, n_classes) = (1, 3); no softmax is applied.
y_pred_good = torch.tensor([[2.0, 1.0, 0.1]])
y_pred_bad = torch.tensor([[0.5, 2.0, 0.3]])

# Evaluate both sets of logits against the same target.
l1, l2 = (loss(logits, y) for logits in (y_pred_good, y_pred_bad))

print(f"Loss1 torch: {l1.item()}")
print(f"Loss2 torch: {l2.item()}")

Loss1 torch: 0.4170299470424652
Loss2 torch: 1.840616226196289


In [7]:
# torch.max over dim 1 returns (values, indices); the index of the largest
# logit in each row is the predicted class.
prediction1 = torch.max(y_pred_good, 1).indices
prediction2 = torch.max(y_pred_bad, 1).indices

print(f"Prediction1 torch: {prediction1.item()}")
print(f"Prediction2 torch: {prediction2.item()}")

Prediction1 torch: 0
Prediction2 torch: 1


#### PyTorch allows batch loss for multiple samples

In [8]:
# Targets: one class index (0, 1 or 2) per sample, for a batch of 3.
Y = torch.tensor([2, 0, 1])

# Inputs are logits (no softmax) with shape (n_batch, n_classes) = (3, 3).
Y_pred_good = torch.tensor([
    [0.1, 0.2, 3.9],  # largest score -> class 2
    [1.2, 0.1, 0.3],  # largest score -> class 0
    [0.3, 2.2, 0.2],  # largest score -> class 1
])

Y_pred_bad = torch.tensor([
    [0.9, 0.2, 0.1],
    [0.1, 0.3, 1.5],
    [1.2, 0.2, 0.5],
])

# One loss value per batch of logits, using the criterion created earlier.
l1, l2 = (loss(logits, Y) for logits in (Y_pred_good, Y_pred_bad))
print(f'Batch Loss1:  {l1.item():.4f}')
print(f'Batch Loss2: {l2.item():.4f}')

# Predicted class per sample = index of the largest logit in each row.
predictions1 = torch.max(Y_pred_good, 1).indices
predictions2 = torch.max(Y_pred_bad, 1).indices
print(f'Actual class: {Y}, Y_pred1: {predictions1}, Y_pred2: {predictions2}')

Batch Loss1:  0.2834
Batch Loss2: 1.6418
Actual class: tensor([2, 0, 1]), Y_pred1: tensor([2, 0, 1]), Y_pred2: tensor([0, 2, 0])


## Binary Classification Example

In [9]:
class NeuralNet1(nn.Module):
    """Two-layer feed-forward network with a single sigmoid output.

    The forward pass is Linear -> ReLU -> Linear(hidden_size, 1) -> sigmoid,
    so the model emits one value in [0, 1] per input row.

    Args:
        input_size: Number of input features.
        hidden_size: Number of units in the hidden layer.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # NOTE: the attribute is spelled "liner1" (sic). The name is kept
        # because it determines the state_dict keys and the public attribute.
        self.liner1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the sigmoid-activated output for ``x``."""
        hidden = self.relu(self.liner1(x))
        logits = self.linear2(hidden)
        # Sigmoid turns the single logit into a probability.
        return torch.sigmoid(logits)

# BCELoss expects probabilities, which the model's final sigmoid provides.
criterion = nn.BCELoss()
# 28 * 28 = 784 input features feeding 5 hidden units.
model = NeuralNet1(hidden_size=5, input_size=28 * 28)

## Multiclass Classification Example

In [10]:
class NeuralNet2(nn.Module):
    """Two-layer feed-forward network that outputs raw class scores.

    The forward pass is Linear -> ReLU -> Linear(hidden_size, num_classes).
    No softmax is applied, so the outputs are logits.

    Args:
        input_size: Number of input features.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of output scores per input row.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # NOTE: the attribute is spelled "liner1" (sic). The name is kept
        # because it determines the state_dict keys and the public attribute.
        self.liner1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the unnormalized class scores (logits) for ``x``."""
        hidden = self.relu(self.liner1(x))
        # Deliberately no softmax: the logits are returned as-is.
        return self.linear2(hidden)

# CrossEntropyLoss applies (log-)softmax internally, so the model emits logits.
criterion = nn.CrossEntropyLoss()
# 28 * 28 = 784 input features, 5 hidden units, 3 output classes.
model = NeuralNet2(num_classes=3, hidden_size=5, input_size=28 * 28)