### Categorical Cross-Entropy Loss

In [20]:
import math

In [21]:
# Softmax probabilities produced by the output layer for a single sample
softmax_output = [0.7, 0.1, 0.2]
# One-hot ground truth: class 0 is the correct class
target_output = [1, 0, 0]

# Categorical cross-entropy: negated sum of log(prediction) weighted by the target
loss = -sum(
    math.log(predicted) * expected
    for predicted, expected in zip(softmax_output, target_output)
)
loss

0.35667494393873245

#### Log to the base e or ln

In [22]:
import numpy as np

In [23]:
# Natural logarithm (base e) of an example value; use the variable rather
# than repeating the literal so the two cannot drift apart
b = 5.2
print(np.log(b))

1.6486586255873816


In [24]:
# We can confirm this by exponentiating our result:
print(math.e ** 1.6486586255873816)

5.199999999999999


#### Work on batches and Make negative log calculation dynamic to the target index
Let's assume the targets are sparse (categorical) labels: each target is a class index, where 0 represents dog, 1 cat and 2 human.
Here the targets are not one-hot encoded; in a one-hot encoding the target array is 2D, with the correct label marked by 1 and every other entry 0.

In [32]:
# Softmax probabilities for a batch of three samples (one row per sample)
softmax_outputs = [
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08],
]

# Sparse labels: the index of the correct class for each sample
class_targets = [0, 1, 1]

# Print the confidence each sample assigned to its correct class
for sample_idx, target_idx in enumerate(class_targets):
    print(softmax_outputs[sample_idx][target_idx])

0.7
0.5
0.9


#### Using Numpy

In [36]:
# The same batch as a NumPy array, so rows and columns can be indexed together
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])

class_targets = [0, 1, 1]
# Two index lists are paired element-wise: row i is read at column class_targets[i]
print(softmax_outputs[[0, 1, 2], class_targets])

[0.7 0.5 0.9]


#### Use arange instead of the hard coded indices

In [35]:
# Batch of softmax probabilities, one row per sample
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])

# Index of the correct class for each sample
class_targets = np.array([0, 1, 1])

# Row indices 0..N-1 generated from the batch size instead of being hard coded
sample_indices = np.arange(len(class_targets))

# Pair each row index with that sample's target column
output_values = softmax_outputs[sample_indices, class_targets]

print(output_values)

[0.7 0.5 0.9]


#### Now apply Negative Log

In [37]:
# Per-sample loss: negative log of the confidence at each sample's target class
target_confidences = softmax_outputs[range(len(softmax_outputs)), class_targets]
print(-np.log(target_confidences))

[0.35667494 0.69314718 0.10536052]


#### Find average loss per batch

In [38]:
# Confidence assigned to the correct class of every sample in the batch
batch_rows = range(len(softmax_outputs))
neg_log = -np.log(softmax_outputs[batch_rows, class_targets])

# The batch loss is the mean of the per-sample losses
average_loss = np.mean(neg_log)
print(average_loss)

0.38506088005216804


#### Generalize for the One-Hot Encoded Labels and Sparse (Categorical) Labels

In [39]:
# Batch of softmax probabilities, one row per sample
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])

# One-hot encoded targets: one row per sample, 1 at the correct class
class_targets = np.array([[1, 0, 0],
                          [0, 1, 0],
                          [0, 1, 0]])

# Sparse (categorical) labels are 1D: pick each row's value at its target index.
# This must index the batch array `softmax_outputs`, not the single-sample
# list `softmax_output` defined at the top of the notebook.
if len(class_targets.shape) == 1:
    correct_confidences = softmax_outputs[
        range(len(softmax_outputs)),
        class_targets
    ]

# One-hot labels are 2D: multiplying by the mask zeroes every wrong class,
# so the row sum is the confidence of the correct class
elif len(class_targets.shape) == 2:
    correct_confidences = np.sum(
        softmax_outputs * class_targets,
        axis=1
    )

# Per-sample losses
neg_log = -np.log(correct_confidences)

# Mean loss over the batch
average_loss = np.mean(neg_log)
print(average_loss)

0.38506088005216804


#### log(0)
If the model predicts a confidence of 0 for a class that is actually the true label, what will the cross-entropy loss be?

In [40]:
# log(0) is -inf, so the negated value printed here is +inf
# (NumPy typically also emits a divide-by-zero RuntimeWarning for this call)
print(-np.log(0))

inf


  print(-np.log(0))


In [41]:
# Sanity check in the other direction: e raised to -inf evaluates to 0.0
print(np.e**(-np.inf))

0.0


If there is an infinity in the list and we want the mean error then the result will be infinity

In [42]:
# A single infinite sample loss makes the mean of the whole list infinite
print(np.mean([1, 2, 3, -np.log(0)]))

inf


  print(np.mean([1, 2, 3, -np.log(0)]))


We could add a very small value to the confidence to prevent it from being a zero, for example, 
1e-7:

In [43]:
# Loss for a confidence of 1e-7 instead of 0: large but finite
print(-np.log(1e-7))

16.11809565095832


Adding a very small value, one ten-millionth (1e-7), to the confidence at its far edge will 
insignificantly impact the result, but this method yields an additional 2 issues.

In [44]:
# Shifting a confidence of 1 up by 1e-7 makes the loss slightly negative
print(-np.log(1+1e-7))

-9.999999505838704e-08


When the model is fully correct in a prediction and puts all the confidence in the correct label, 
loss becomes a negative value instead of being 0. The other problem here is shifting confidence 
towards 1, even if by a very small value.

To prevent both issues, it’s better to clip values from both 
sides by the same number, 1e-7 in our case. That means that the lowest possible value will become 
1e-7 (like in the demonstration we just performed) but the highest possible value, instead of being 
1+1e-7, will become 1-1e-7 (so slightly less than 1):

In [45]:
# Capping a confidence of 1 at 1 - 1e-7 keeps the loss tiny but positive
print(-np.log(1-1e-7))

1.0000000494736474e-07


This will prevent loss from being exactly 0, making it a very small value instead, but won’t make 
it a negative value and won’t bias overall loss towards 1. Within our code and using numpy, we’ll 
accomplish that using np.clip() method:

In [46]:
# Example predictions that include the extreme confidences 0 and 1, so the
# effect of clipping is visible (the original cell used an undefined `y_pred`)
y_pred = np.array([[0.0, 1.0, 0.0],
                   [0.7, 0.1, 0.2]])

# Clip into [1e-7, 1 - 1e-7] so that log() never receives exactly 0 or 1
y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
print(y_pred_clipped)

In [47]:
# Batch of softmax probabilities, one row per sample
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])

# Every value already lies inside [1e-7, 1 - 1e-7], so clipping leaves it as is
y_pred_clipped = softmax_outputs.clip(min=1e-7, max=1 - 1e-7)
print(y_pred_clipped)

[[0.7  0.1  0.2 ]
 [0.1  0.5  0.4 ]
 [0.02 0.9  0.08]]


### Common Loss Class

In [48]:
class Loss:
    """Base class for losses; subclasses provide ``forward``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses.

        ``output`` (model output) and ``y`` (ground-truth values) are passed
        unchanged to ``self.forward``, which a subclass must define and which
        returns the per-sample losses that are averaged here.
        """
        return np.mean(self.forward(output, y))

### Create Cross-entropy loss class

In [49]:
class Loss_CategoricalCrossentropy(Loss):  # Inherits from the Loss class
    """Categorical cross-entropy loss for sparse or one-hot encoded targets."""

    # Forward pass
    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of each sample.

        Parameters:
            y_pred: predicted class probabilities, one row per sample.
            y_true: either a 1D sequence of class indices (sparse labels) or
                a 2D one-hot array with the same layout as ``y_pred``.

        Returns:
            Array holding one loss value per sample.

        Raises:
            ValueError: if ``y_true`` is neither 1D nor 2D.
        """
        # Accept plain Python lists as well as arrays for the targets
        y_true = np.asarray(y_true)

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip so log() never receives exactly 0 (which would give an infinite loss)
        # Clip both sides to not drag the mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Probabilities for target values - only if categorical (sparse) labels
        if y_true.ndim == 1:
            correct_confidences = y_pred_clipped[
                range(samples),
                y_true
            ]

        # Mask values - only for one-hot encoded labels
        elif y_true.ndim == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )

        # Any other rank used to fall through and fail on an unbound variable
        else:
            raise ValueError(
                "y_true must be 1D (class indices) or 2D (one-hot), "
                f"got {y_true.ndim} dimensions"
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

Using the manually-created 
output and targets:

In [50]:
# Instantiate the loss and compute the mean loss over the batch, using the
# softmax_outputs and class_targets currently defined in the notebook
loss_function = Loss_CategoricalCrossentropy()
loss = loss_function.calculate(softmax_outputs, class_targets)
print(loss)

0.38506088005216804


### Combining everything up to this point

In [51]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

#### Dense Layer

In [52]:
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create small random weights of shape (n_inputs, n_neurons) and zero biases."""
        weight_shape = (n_inputs, n_neurons)
        # Standard-normal draws scaled down by 0.01
        self.weights = 0.01 * np.random.randn(*weight_shape)
        # One bias per neuron, stored as a single row
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Store the layer output for ``inputs`` in ``self.output``."""
        weighted_inputs = np.dot(inputs, self.weights)
        self.output = weighted_inputs + self.biases

####  ReLU activation

In [58]:
class Activation_ReLU:
    """Rectified linear activation."""

    def forward(self, inputs):
        """Store the element-wise maximum of 0 and ``inputs`` in ``self.output``."""
        rectified = np.maximum(0, inputs)
        self.output = rectified

#### Softmax Activation

In [54]:
class Activation_Softmax:
    """Row-wise softmax activation."""

    def forward(self, inputs):
        """Store the row-wise softmax of ``inputs`` in ``self.output``."""
        # Subtract each row's maximum before exponentiating so the largest
        # exponent in every row is 0
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(shifted)

        # Normalise every row by its own sum
        row_totals = np.sum(exp_values, axis=1, keepdims=True)
        self.output = exp_values / row_totals

#### Loss

In [55]:
class Loss:
    """Base class for losses; subclasses provide ``forward``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward(output, y)``."""
        return np.mean(self.forward(output, y))

#### Cross-entropy Loss

In [56]:
class Loss_Categorical_Crossentropy(Loss):
    """Categorical cross-entropy loss for sparse or one-hot encoded targets."""

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of each sample.

        Parameters:
            y_pred: predicted class probabilities, one row per sample.
            y_true: either a 1D sequence of class indices (sparse labels) or
                a 2D one-hot array with the same layout as ``y_pred``.

        Returns:
            Array holding one loss value per sample.

        Raises:
            ValueError: if ``y_true`` is neither 1D nor 2D.
        """
        # Accept plain Python lists as well as arrays for the targets
        y_true = np.asarray(y_true)

        # Clip both sides so log() never receives exactly 0 and the mean is
        # not dragged towards either end
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        samples = len(y_pred)

        # Sparse labels: pick each row's confidence at its target index
        if y_true.ndim == 1:
            correct_confidences = y_pred_clipped[
                range(samples),
                y_true
            ]
        # One-hot labels: mask out the wrong classes and sum each row
        elif y_true.ndim == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true, axis=1)
        # Any other rank used to fall through and fail on an unbound variable
        else:
            raise ValueError(
                "y_true must be 1D (class indices) or 2D (one-hot), "
                f"got {y_true.ndim} dimensions"
            )

        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

#### Create Dataset

In [61]:
# Build the dataset and the network. The order of these statements matters:
# spiral_data and both Layer_Dense constructors draw from the random number
# generator, so reordering them would change the printed numbers.
# NOTE(review): presumably 100 points per class with 2 features each - confirm
# against the nnfs documentation.
X, y = spiral_data(samples = 100, classes = 3)
# First dense layer: 2 inputs -> 3 neurons, followed by ReLU
dense1 = Layer_Dense(2,3)
activation1 = Activation_ReLU()
# Second dense layer: 3 inputs -> 3 outputs, followed by softmax
dense2 = Layer_Dense(3,3)
activation2 = Activation_Softmax()
loss_function = Loss_Categorical_Crossentropy()

# Forward pass through the first layer and its activation
dense1.forward(X)
activation1.forward(dense1.output)

# Forward pass through the second layer and softmax
dense2.forward(activation1.output)
activation2.forward(dense2.output)

# Show the softmax output for the first five samples
print(activation2.output[:5])

# Mean categorical cross-entropy of the softmax output against the labels y
loss = loss_function.calculate(activation2.output, y)
print('Loss: ',loss)

[[0.33333334 0.33333334 0.33333334]
 [0.33333343 0.33333355 0.33333296]
 [0.33333376 0.33333385 0.3333323 ]
 [0.333334   0.3333343  0.3333318 ]
 [0.33333403 0.33333457 0.3333315 ]]
Loss:  1.098608


### Accuracy Calculation

In [63]:
# Probabilities of 3 samples
softmax_outputs = np.array([[0.7, 0.2, 0.1],
                            [0.5, 0.1, 0.4],
                            [0.02, 0.9, 0.08]])

# Sparse labels: index of the correct class for each sample
class_targets = np.array([0, 1, 1])

# Predicted class = index of the highest probability in each row
predictions = np.argmax(softmax_outputs, axis=1)

# If class targets are one-hot encoded labels - convert them to class indices
# (this must read `class_targets`; the original referenced an undefined name)
if len(class_targets.shape) == 2:
    class_targets = np.argmax(class_targets, axis=1)

# Fraction of samples whose predicted class matches the target
accuracy = np.mean(predictions == class_targets)
print('acc', accuracy)

acc 0.6666666666666666


We can add the following to the end of our full script above to calculate its accuracy

In [64]:
# Predicted class = index of the highest softmax probability for each sample
predictions = np.argmax(activation2.output, axis = 1)
# One-hot targets are 2D; convert them to class indices (this rebinds y)
if len(y.shape) == 2:
    y = np.argmax(y, axis = 1)

# Fraction of samples whose predicted class equals the target class
accuracy = np.mean(predictions == y)
print('acc', accuracy)

acc 0.35
