In [1]:
import numpy as np

In [2]:
# Forward pass of a single neuron with three inputs
x = [1.0, -2.0, 3.0]   # input values
w = [-3.0, -1.0, 2.0]  # weights, one per input
b = 1.0                # bias

# Weight every input: pair each x[i] with its w[i]
xw0, xw1, xw2 = (x_i * w_i for x_i, w_i in zip(x, w))
print(xw0, xw1, xw2, b)

# Pre-activation value: weighted inputs plus the bias
z = xw0 + xw1 + xw2 + b
print(z)

# ReLU activation: negative pre-activations become 0
y = max(z, 0)
print(y)

-3.0 2.0 6.0 1.0
6.0
6.0


#### derivative of the ReLU

In [3]:
# Backward pass, first step: the ReLU activation.
# `dvalue` stands in for the gradient arriving from the next layer.
dvalue = 1 # derivative of the next layer
# ReLU's derivative w.r.t. z is 1 when z > 0 and 0 otherwise; the chain rule
# multiplies it by the incoming gradient. `z` comes from the forward pass cell.
drelu_dz = dvalue * (1. if z > 0 else 0.)
print(drelu_dz)

1.0


Moving backward through our neural network, what is the function that comes immediately 
before we perform the activation function?

It’s a sum of the weighted inputs and bias. This means that we want to calculate the partial 
derivative of the sum function, and then, using the chain rule, multiply this by the partial 
derivative of the subsequent, outer, function, which is ReLU.

- `drelu_dxw0` — the partial derivative of the ReLU w.r.t. the first weighted input, $w_0 x_0$,
- `drelu_dxw1` — the partial derivative of the ReLU w.r.t. the second weighted input, $w_1 x_1$,
- `drelu_dxw2` — the partial derivative of the ReLU w.r.t. the third weighted input, $w_2 x_2$,
- `drelu_db` — the partial derivative of the ReLU with respect to the bias, $b$.

In [4]:
# The partial derivative of a sum w.r.t. any one of its operands is 1, so the
# chain rule carries drelu_dz through to the first weighted input.
dsum_dxw0 = 1
drelu_dxw0 = drelu_dz * dsum_dxw0
print(drelu_dxw0)

1.0


In [5]:
# Same rule for the second weighted input: the sum contributes a factor of 1,
# which the chain rule multiplies by drelu_dz.
dsum_dxw1 = 1
drelu_dxw1 = drelu_dz * dsum_dxw1
print(drelu_dxw1)

1.0


In [6]:
# Same rule for the third weighted input: the sum contributes a factor of 1,
# which the chain rule multiplies by drelu_dz.
dsum_dxw2 = 1
drelu_dxw2 = drelu_dz * dsum_dxw2
print(drelu_dxw2)

1.0


In [7]:
# The bias is also a plain operand of the sum, so its partial derivative is 1
# and the chain rule gives drelu_dz * 1.
dsum_db = 1
drelu_db = drelu_dz * dsum_db
print(drelu_db)

1.0


Let’s add these partial derivatives, with the applied chain rule, to our code

In [8]:
# Forward pass
x = [1.0, -2.0, 3.0]   # input values
w = [-3.0, -1.0, 2.0]  # weights (third weight is 2.0, the same neuron as before)
b = 1.0                # bias

# Multiplying inputs by weights
xw0 = x[0] * w[0]
xw1 = x[1] * w[1]
xw2 = x[2] * w[2]

# Adding weighted inputs and bias
z = xw0 + xw1 + xw2 + b

# ReLU activation function
y = max(z, 0)

# Backward pass

# The derivative from the next layer
dvalue = 1

# Derivative of the ReLU and the chain rule.
# Both branches are floats so the result is a float whichever branch is taken.
drelu_dz = dvalue * (1. if z > 0 else 0.)
print(drelu_dz)

# Partial derivatives of the sum: each operand contributes a factor of 1
dsum_dxw0 = 1
dsum_dxw1 = 1
dsum_dxw2 = 1
dsum_db = 1

# The chain rule: multiply the sum's partial derivatives by the ReLU gradient
drelu_dxw0 = drelu_dz * dsum_dxw0
drelu_dxw1 = drelu_dz * dsum_dxw1
drelu_dxw2 = drelu_dz * dsum_dxw2
drelu_db = drelu_dz * dsum_db

print(drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db)

1.0
1.0 1.0 1.0 1.0


Continuing backward, the function that comes before the sum is the multiplication of weights and 
inputs. The derivative for a product is whatever the input is being multiplied by

In [9]:
# Partial derivative of the multiplication x[0] * w[0] with respect to x[0]:
# it equals the other factor, w[0]. The chain rule then multiplies it by the
# gradient that reached this weighted input (drelu_dxw0).
dmul_dx0 = w[0]
drelu_dx0 = drelu_dxw0 * dmul_dx0
print(drelu_dx0)

-3.0


We perform the same operation for other inputs and weights

In [10]:
# Forward pass
x = [1.0, -2.0, 3.0]   # input values
w = [-3.0, -1.0, 2.0]  # weights
b = 1.0                # bias

# Weighted inputs: one product per (input, weight) pair
xw0, xw1, xw2 = (x_i * w_i for x_i, w_i in zip(x, w))

# Pre-activation: weighted inputs plus the bias
z = xw0 + xw1 + xw2 + b

# ReLU activation
y = max(z, 0)

# Backward pass

# Gradient handed back by the next layer
dvalue = 1

# ReLU derivative combined with the incoming gradient (chain rule)
drelu_dz = dvalue * (1. if z > 0 else 0)
print(drelu_dz)

# Every operand of the sum has a partial derivative of 1
dsum_dxw0 = dsum_dxw1 = dsum_dxw2 = dsum_db = 1

# Chain rule through the sum
drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db = (
    drelu_dz * dsum for dsum in (dsum_dxw0, dsum_dxw1, dsum_dxw2, dsum_db)
)

print(drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db)

# For a product x * w: the derivative w.r.t. x is w, and w.r.t. w is x
dmul_dx0, dmul_dx1, dmul_dx2 = w
dmul_dw0, dmul_dw1, dmul_dw2 = x

# Chain rule through each multiplication: gradients on the inputs ...
drelu_dx0 = drelu_dxw0 * dmul_dx0
drelu_dx1 = drelu_dxw1 * dmul_dx1
drelu_dx2 = drelu_dxw2 * dmul_dx2

# ... and gradients on the weights
drelu_dw0 = drelu_dxw0 * dmul_dw0
drelu_dw1 = drelu_dxw1 * dmul_dw1
drelu_dw2 = drelu_dxw2 * dmul_dw2

print(drelu_dx0, drelu_dw0, drelu_dx1, drelu_dw1, drelu_dx2, drelu_dw2)

1.0
1.0 1.0 1.0 1.0
-3.0 1.0 -1.0 -2.0 2.0 3.0


All together, the partial derivatives above, combined into a vector, make up our gradients. Our 
gradients could be represented as:

In [11]:
# Collect the per-parameter partial derivatives computed in the backward pass
# into gradient vectors (plain Python lists here).
dx = [drelu_dx0, drelu_dx1, drelu_dx2] # Gradient on inputs
dw = [drelu_dw0, drelu_dw1, drelu_dw2] # Gradient on weights
db = drelu_db # Gradient on bias ... just 1 bias here

We apply a negative fraction to this gradient since 
we want to decrease the final output value, and the gradient shows the direction of the steepest 
ascent.

In [12]:
print(w, b)

[-3.0, -1.0, 2.0] 1.0


In [13]:
# Take one small step against the gradient: every weight, then the bias,
# moves by -0.001 times its partial derivative.
for i in range(3):
    w[i] += -0.001 * dw[i]
b += -0.001 * db

print(w, b)

[-3.001, -0.998, 1.997] 0.999


Now, we’ve slightly changed the weights and bias in such a way so as to decrease the output 
somewhat intelligently. We can see the effects of our tweaks on the output by doing another 
forward pass:

In [14]:
# Forward pass again, now with the updated weights and bias
xw0, xw1, xw2 = (x_i * w_i for x_i, w_i in zip(x, w))

# Weighted inputs plus the bias
z = xw0 + xw1 + xw2 + b

# ReLU activation function
y = max(z, 0)
print(y)

5.985


We’ve successfully decreased this neuron’s output from 6.000 to 5.985.

During backpropagation, each neuron from the 
current layer will receive a vector of partial derivatives the same way that we described for a 
single neuron. With a layer of neurons, it’ll take the form of a list of these vectors, or a 2D array.

To calculate the partial derivatives with respect to inputs, we need the weights — the partial 
derivative with respect to the input equals the related weight. This means that the array of 
partial derivatives with respect to all of the inputs equals the array of weights. Since this array is 
transposed, we’ll need to sum its rows instead of columns. 

In the code to show this, we take the transposed weights, which are the transposed array of the 
derivatives with respect to inputs, and multiply them by their respective gradients (related to 
given neurons) to apply the chain rule. Then we sum these products for each input, which gives 
the gradient for the next layer in backpropagation. The “next” layer in backpropagation is the 
previous layer in the order of creation of the model.

In [15]:
# Gradient received from the next layer.
# A single sample whose gradient is all ones keeps the example easy to follow.
dvalues = np.array([[1., 1., 1.]])

# Three neurons with four inputs each: every inner list holds one neuron's
# weights. The transpose stores them as (inputs, neurons).
weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]]).T

# For each of the four inputs, add up weight * gradient over the three neurons.
dx0, dx1, dx2, dx3 = (
    sum([weights[i][n] * dvalues[0][n] for n in range(3)])
    for i in range(4)
)

dinputs = np.array([dx0, dx1, dx2, dx3])
print(dinputs)

[ 0.44 -0.38 -0.07  1.37]


From NumPy’s perspective, and since both weights and dvalues are NumPy arrays, we 
can simplify the dx0 to dx3 calculation. Since the weights array is formatted so that the rows 
contain weights related to each input (weights for all neurons for the given input), we can multiply 
them by the gradient vector directly:

In [16]:
# Each row of `weights` holds one input's weights across all neurons, so
# multiplying a row element-wise by the gradient vector and summing the
# result gives the gradient for that input.
dx0, dx1, dx2, dx3 = (sum(weights[i] * dvalues[0]) for i in range(4))

dinputs = np.array([dx0, dx1, dx2, dx3])
print(dinputs)

[ 0.44 -0.38 -0.07  1.37]


The dot product takes 
rows from the first argument and columns from the second to perform multiplication and sum; 
thus, we need to transpose the weights for this calculation:

In [17]:
# Sum weights related to the given input multiplied by
# the gradient related to the given neuron.
# A single dot product does the multiply-and-sum for all inputs at once:
# `weights` is transposed again here so that its columns line up with the
# entries of the gradient vector dvalues[0].
dinputs = np.dot(dvalues[0], weights.T)

print(dinputs)

[ 0.44 -0.38 -0.07  1.37]


We have to account for one more thing — a batch of samples. So far, we have been using a single 
sample responsible for a single gradient vector that is backpropagated between layers. The row 
vector that we created for dvalues is in preparation for a batch of data. With more samples, 
the layer will return a list of gradients, which our code already almost handles correctly. 

In [18]:
# Gradient received from the next layer: one row per sample, with values
# that grow from sample to sample so the rows are easy to tell apart.
dvalues = np.array([[g] * 3 for g in (1., 2., 3.)])

# Three neurons with four inputs each: every inner list holds one neuron's
# weights. The transpose stores them as (inputs, neurons).
weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]]).T

# For every sample, sum weight * gradient over the neurons for each input.
dinputs = dvalues.dot(weights.T)
print(dinputs)

[[ 0.44 -0.38 -0.07  1.37]
 [ 0.88 -0.76 -0.14  2.74]
 [ 1.32 -1.14 -0.21  4.11]]


Calculating the gradients with respect to weights

In [19]:
# Gradient received from the next layer: one row per sample.
dvalues = np.array([[g] * 3 for g in (1., 2., 3.)])

# Three samples, four input features each.
inputs = np.array([[1, 2, 3, 2.5],
                   [2., 5., -1., 2],
                   [-1.5, 2.7, 3.3, -0.8]])

# The gradient on a weight is its input multiplied by the neuron's gradient,
# summed over samples. The transposed inputs give a result shaped like the
# weights: (inputs, neurons).
dweights = inputs.T.dot(dvalues)
print(dweights)

[[ 0.5  0.5  0.5]
 [20.1 20.1 20.1]
 [10.9 10.9 10.9]
 [ 4.1  4.1  4.1]]


For the biases and derivatives with respect to them

In [20]:
# Gradient received from the next layer: one row per sample.
dvalues = np.array([[g] * 3 for g in (1., 2., 3.)])

# One bias per neuron, stored as a row vector of shape (1, neurons).
biases = np.array([[2, 3, 0.5]])

# The gradient on a bias is the sum of its neuron's gradients over the
# samples (axis 0). keepdims=True keeps the result as a (1, neurons) row
# vector, matching `biases`, instead of a flat 1D array.
dbiases = dvalues.sum(axis=0, keepdims=True)
print(dbiases)

[[6. 6. 6.]]


The derivative of the ReLU function

In [21]:
# Example layer output (the values that were fed into ReLU)
z = np.array([[1, 2, -3, -4],
              [2, -7, -1, 3],
              [-1, 2, 5, -1]])

# Gradient received from the next layer: the integers 1..12, three rows of four
dvalues = np.arange(1, 13).reshape(3, 4)

# ReLU derivative: 1 where the input was positive, 0 everywhere else
drelu = (z > 0).astype(z.dtype)
print(drelu)

# The chain rule: scale the local derivative by the incoming gradient
drelu = drelu * dvalues
print(drelu)

[[1 1 0 0]
 [1 0 0 1]
 [0 1 1 0]]
[[ 1  2  0  0]
 [ 5  0  0  8]
 [ 0 10 11  0]]


Simplified version

In [22]:
# ReLU activation's derivative
# with the chain rule applied.
# Since the local derivative is only ever 0 or 1, multiplying by it is the
# same as copying the incoming gradient and zeroing the entries where the
# ReLU input was not positive. The copy leaves `dvalues` itself untouched.
drelu = dvalues.copy()
drelu[z <= 0] = 0
print(drelu)

[[ 1  2  0  0]
 [ 5  0  0  8]
 [ 0 10 11  0]]


Let’s combine the forward and backward pass of a single neuron with a full layer and batch-based 
partial derivatives. We’ll minimize ReLU’s output, once again, only for this example.

In [23]:
# Example gradient array; it is not used further down, because this example
# back-propagates the ReLU output itself.
dvalues = np.array([[g] * 3 for g in (1., 2., 3.)])

# Batch of three samples, four features each
inputs = np.array([[1, 2, 3, 2.5],
                   [2., 5., -1., 2],
                   [-1.5, 2.7, 3.3, -0.8]])

# One inner list per neuron, transposed to (inputs, neurons)
weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]]).T

# One bias per neuron, as a (1, neurons) row vector
biases = np.array([[2, 3, 0.5]])

# Forward pass
layer_outputs = inputs.dot(weights) + biases   # Dense layer
relu_outputs = np.maximum(0, layer_outputs)    # ReLU activation

# Backward pass through ReLU. The ReLU output plays the role of the gradient
# coming from the next layer; entries whose ReLU input was not positive
# get a zero gradient.
drelu = np.where(layer_outputs <= 0, 0.0, relu_outputs)

# Backward pass through the dense layer
dinputs = drelu.dot(weights.T)              # gradient on the inputs
dweights = inputs.T.dot(drelu)              # gradient on the weights
dbiases = drelu.sum(axis=0, keepdims=True)  # summed over samples, kept 2D

# Update parameters: a small step against the gradient
weights -= 0.001 * dweights
biases -= 0.001 * dbiases

print(weights)
print(biases)

[[ 0.179515   0.5003665 -0.262746 ]
 [ 0.742093  -0.9152577 -0.2758402]
 [-0.510153   0.2529017  0.1629592]
 [ 0.971328  -0.5021842  0.8636583]]
[[1.98489  2.997739 0.497389]]


During the forward method for our Layer_Dense class, we will want to remember what the 
inputs were (recall that we’ll need them when calculating the partial derivative with respect to 
weights during backpropagation), which can be easily implemented using an object property 
(self.inputs):

In [24]:
# Dense layer
# Skeleton only: the `...` placeholders stand for code not shown in this cell.
class Layer_Dense:
     ...
     # Forward pass
     def forward(self, inputs):
         ...
         # Remember the inputs on the instance so they can be read again
         # after the forward pass has finished
         self.inputs = inputs
        

Next, we will add our backward pass (backpropagation) code that we developed previously into a 
new method in the layer class, which we’ll call backward:

#### Dense Layer Derivative and Backward  pass

In [25]:
class Layer_Dense:
    """Dense layer (excerpt): only the backward pass is shown in this cell."""
    ...
    # Backward pass
    def backward(self, dvalues):
        """Compute the layer's gradients from the next layer's gradient.

        Reads ``self.inputs`` and ``self.weights`` and stores the results as
        ``self.dweights``, ``self.dbiases`` and ``self.dinputs``.

        Args:
            dvalues: gradient received from the next layer, one row per sample.
        """
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        # Sum over samples (axis 0); keepdims keeps a (1, neurons) row vector.
        # Stored as `dbiases`, the bias *gradient*, not as a `bias` attribute.
        self.dbiases = np.sum(dvalues, axis = 0, keepdims = True)
        
        # Gradients on values
        self.dinputs = np.dot(dvalues, self.weights.T)

We then do the same for our ReLU class


#### ReLU Activation Derivative and Backward pass

In [26]:
class Activation_ReLU:
    """ReLU activation with forward and backward passes."""
    
    # Forward pass
    def forward(self, inputs):
        """Store the inputs and compute the element-wise ReLU output.

        Args:
            inputs: array of values fed into the activation.
        """
        self.inputs = inputs
        # np.maximum compares element-wise against 0. np.max would instead
        # treat its second argument as the axis to reduce over.
        self.output = np.maximum(0, inputs)
    
    # Backward pass
    def backward(self, dvalues):
        """Compute the gradient on the inputs and store it as ``self.dinputs``.

        Args:
            dvalues: gradient received from the next layer, shaped like the
                inputs given to ``forward``.
        """
        # Since we need to modify the original variable,
        # let's make a copy of the values first
        self.dinputs = dvalues.copy()
        # Zero gradient where the input values were not positive
        self.dinputs[self.inputs <= 0] = 0

#### Categorical Cross-Entropy Loss derivative and backward pass

In [45]:
# Cross-entropy loss
class Loss_CategoricalCrossentropy:
    ...

    # Backward pass
    def backward(self, dvalues, y_true):
        """Compute the loss gradient and store it as ``self.dinputs``.

        Args:
            dvalues: values to differentiate with respect to, one row per
                sample and one column per label.
            y_true: target labels, either sparse (1D array of class indices)
                or one-hot encoded (2D array).
        """
        # Batch size, and label count taken from the first sample
        n_samples = len(dvalues)
        n_labels = len(dvalues[0])

        # Sparse labels are expanded into one-hot rows by indexing an
        # identity matrix with the class indices
        targets = y_true
        if len(targets.shape) == 1:
            targets = np.eye(n_labels)[targets]

        # Gradient of the loss, then normalised by the number of samples
        self.dinputs = (-targets / dvalues) / n_samples

#### np.eye

In [28]:
print(np.eye(5))

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


In [29]:
print(np.eye(5)[1])

[0. 1. 0. 0. 0.]


In [30]:
print(np.eye(5)[4])

[0. 0. 0. 0. 1.]


#### Softmax activation derivative code implementation

First part

In [31]:
softmax_output = [0.7, 0.1, 0.2]

shape it as a list of samples:

In [32]:
softmax_output = np.array(softmax_output).reshape(-1, 1)
print(softmax_output)

[[0.7]
 [0.1]
 [0.2]]


 Softmax’s output multiplied by the Kronecker delta.
 
The Kronecker 
delta equals 1 when both inputs are equal, and 0 otherwise. If we visualize this as an array, we’ll 
have an array of zeros with ones on the diagonal — you might remember that we already have 
implemented such a solution using the np.eye method:

In [33]:
print(np.eye(softmax_output.shape[0]))

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


Now we’ll do the multiplication of both of the values

In [34]:
print(softmax_output * np.eye(softmax_output.shape[0]))

[[0.7 0.  0. ]
 [0.  0.1 0. ]
 [0.  0.  0.2]]


It turns out that we can gain some speed by replacing this by the np.diagflat method call, 
which computes the same solution — the diagflat method creates an array using an input vector as 
the diagonal:

In [35]:
print(np.diagflat(softmax_output))

[[0.7 0.  0. ]
 [0.  0.1 0. ]
 [0.  0.  0.2]]


Second Part

In [36]:
print(np.dot(softmax_output, softmax_output.T))

[[0.49 0.07 0.14]
 [0.07 0.01 0.02]
 [0.14 0.02 0.04]]


Finally

In [37]:
print(np.diagflat(softmax_output) -
 np.dot(softmax_output, softmax_output.T))

[[ 0.21 -0.07 -0.14]
 [-0.07  0.09 -0.02]
 [-0.14 -0.02  0.16]]


The matrix result of the equation and the array solution provided by the code is called the 
Jacobian matrix. In our case, the Jacobian matrix is an array of partial derivatives in all of the 
combinations of both input vectors. Remember, we are calculating the partial derivatives of every 
output of the Softmax function with respect to each input separately. We do this because each 
input influences each output due to the normalization process, which takes the sum of all the 
exponentiated inputs. The result of this operation, performed on a batch of samples, is a list of the 
Jacobian matrices, which effectively forms a 3D matrix

This raises a question — if sample-wise gradients are the Jacobian matrices, how do we perform 
the chain rule with the gradient back-propagated from the loss function, since it’s a vector for each 
sample? Also, what do we do with the fact that the previous layer, which is the Dense layer, will 
expect the gradients to be a 2D array?

We can perform this operation on each of the Jacobian matrices directly, 
applying the chain rule at the same time (applying the gradient from the loss function) using 
np.dot(). For each sample, it’ll take each row from the Jacobian matrix and multiply it by the 
corresponding values from the loss function’s gradient. As a result, the dot product of each of these 
rows with the gradient vector returns a single value, forming a vector of the partial derivatives sample-wise and a 2D array (a list of the resulting vectors) batch-wise.

In [49]:
class Activation_Softmax:
    ...

    # Backward pass
    def backward(self, dvalues):
        """Back-propagate through softmax, storing ``self.dinputs``.

        Reads ``self.output`` (one row of softmax outputs per sample).

        Args:
            dvalues: gradient received from the next step, one row per sample.
        """
        # Output buffer with the same shape and dtype as the incoming gradient
        self.dinputs = np.empty_like(dvalues)

        # Walk over the samples, pairing each output row with its gradient row
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Column-vector view of this sample's softmax output
            column = single_output.reshape(-1, 1)

            # Jacobian of the output: outputs on the diagonal minus the
            # outer product of the output with itself
            jacobian_matrix = np.diagflat(column) - column.dot(column.T)

            # Chain rule: Jacobian times the incoming gradient gives this
            # sample's row of the result
            self.dinputs[index] = jacobian_matrix.dot(single_dvalues)
            

#### Common Categorical Cross-Entropy loss and Softmax activation derivative - code implementation

In [39]:
# Softmax classifier - combined Softmax activation and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy():
    """Softmax activation and categorical cross-entropy loss in one object."""
    
    # create activation and loss function objects
    # (the method must be named __init__, otherwise it never runs on construction)
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()
    
    # Forward pass
    def forward(self, inputs, y_true):
        """Run softmax on ``inputs`` and return the loss against ``y_true``.

        The softmax output is kept as ``self.output``. The return value is
        whatever ``self.loss.forward`` returns.
        """
        # Output layer's activation function
        self.activation.forward(inputs)
        # Set the output
        self.output = self.activation.output
        # Calculate the loss and return
        return self.loss.forward(self.output, y_true)
    
    # Backward pass
    def backward(self, dvalues, y_true):
        """Compute the combined gradient and store it as ``self.dinputs``.

        Args:
            dvalues: softmax outputs, one row per sample.
            y_true: target labels, sparse (1D class indices) or one-hot (2D).
        """
        
        # Number of samples
        samples = len(dvalues)
        
        # If labels are one-hot encoded, turn them into discrete values
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis = 1)
            
        # Copy so we can easily modify without touching the caller's array
        self.dinputs = dvalues.copy()
        # Calculate gradient: subtract 1 at each sample's target class
        self.dinputs[range(samples), y_true] -= 1
        # Normalize gradient
        self.dinputs = self.dinputs / samples
    

We can now test if the combined backward step returns the same values compared to when we 
backpropagate gradients through both of the functions separately. For this example, let’s make up 
an output of the Softmax function and some target values. Next, let’s backpropagate them using 
both solutions:

In [55]:
# Made-up softmax outputs for three samples, and their target classes
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])

class_targets = np.array([0, 1, 1])

# Route 1: the combined softmax + cross-entropy backward step
softmax_loss = Activation_Softmax_Loss_CategoricalCrossentropy()
softmax_loss.backward(softmax_outputs, class_targets)
dvalues1 = softmax_loss.dinputs

# Route 2: the loss backward step, then the softmax backward step
activation = Activation_Softmax()
activation.output = softmax_outputs
loss = Loss_CategoricalCrossentropy()
loss.backward(activation.output, class_targets)
activation.backward(loss.dinputs)
dvalues2 = activation.dinputs

# Show both results so they can be compared by eye
for heading, gradients in (
    ('Gradients: Combined loss and activation:', dvalues1),
    ('Gradients: Separate loss and activation:', dvalues2),
):
    print(heading)
    print(gradients)

Gradients: Combined loss and activation:
[[-0.1         0.03333333  0.06666667]
 [ 0.03333333 -0.16666667  0.13333333]
 [ 0.00666667 -0.03333333  0.02666667]]
Gradients: Separate loss and activation:
[[-0.1         0.03333333  0.06666667]
 [ 0.03333333 -0.16666667  0.13333333]
 [ 0.00666667 -0.03333333  0.02666667]]


To answer the question of how 
many times faster this solution is, we can take advantage of Python’s timeit module, running 
both solutions multiple times and combining the execution times.

In [60]:
from timeit import timeit

# Made-up softmax outputs for three samples, and their target classes
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])

class_targets = np.array([0, 1, 1])

def f1():
    """Run the combined softmax + cross-entropy backward step once."""
    softmax_loss = Activation_Softmax_Loss_CategoricalCrossentropy()
    softmax_loss.backward(softmax_outputs, class_targets)
    dvalues1 = softmax_loss.dinputs
    
def f2():
    """Run the separate loss and softmax backward steps once."""
    activation = Activation_Softmax()
    activation.output = softmax_outputs
    loss = Loss_CategoricalCrossentropy()
    loss.backward(softmax_outputs, class_targets)
    activation.backward(loss.dinputs)
    dvalues2 = activation.dinputs

# Time 10000 calls of each variant; the printed ratio t2/t1 is how many times
# longer the separate route took, and it will vary from run to run.
t1 = timeit(lambda: f1(), number=10000)
t2 = timeit(lambda: f2(), number=10000)
print(f'Calculating the gradients separately is about {t2/t1} times slower')

Calculating the gradients separately is about 2.857090630239817 times slower


### Full code up to this point

In [10]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()

In [11]:
class Layer_Dense:
    """Fully connected layer with forward and backward passes."""

    def __init__(self, n_inputs, n_neurons):
        """Create weights of shape (n_inputs, n_neurons) and zero biases."""
        # Random normal values scaled down by 0.01; biases form a (1, n_neurons) row
        self.weights = np.random.randn(n_inputs, n_neurons) * 0.01
        self.biases = np.zeros(shape=(1, n_neurons))

    def forward(self, inputs):
        """Store the inputs and compute ``inputs . weights + biases``."""
        self.inputs = inputs
        weighted = np.dot(inputs, self.weights)
        self.output = weighted + self.biases

    def backward(self, dvalues):
        """Compute gradients on weights, biases and inputs from ``dvalues``."""
        # Gradient on the inputs, passed on to the preceding layer
        self.dinputs = np.dot(dvalues, self.weights.T)

        # Gradients on the parameters; the bias gradient is summed over
        # samples and kept as a 2D row vector
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dweights = np.dot(self.inputs.T, dvalues)

In [21]:
class Activation_ReLU:
    """ReLU activation with forward and backward passes."""

    def forward(self, inputs):
        """Store the inputs and compute the element-wise maximum with 0."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Copy ``dvalues`` and zero it where the stored inputs were <= 0."""
        blocked = self.inputs <= 0
        gradient = dvalues.copy()   # leave the caller's array untouched
        gradient[blocked] = 0
        self.dinputs = gradient

In [13]:
class Activation_Softmax:
    """Softmax activation with forward and backward passes."""
    
    def forward(self, inputs):
        """Store the inputs and compute row-wise softmax probabilities.

        Args:
            inputs: 2D array, one row per sample.
        """
        self.inputs = inputs
        # Subtract each row's maximum so the largest exponent is 0 and
        # np.exp cannot overflow
        exp_values = np.exp(inputs - np.max(inputs, axis = 1, keepdims = True))
        # Normalise each row so it sums to 1
        probabilities = exp_values / np.sum(exp_values, axis = 1, keepdims = True)
        self.output = probabilities
    
    def backward(self, dvalues):
        """Back-propagate through softmax, storing ``self.dinputs``.

        Reads ``self.output`` (one row of softmax outputs per sample).

        Args:
            dvalues: gradient received from the next step, one row per sample.
        """
        self.dinputs = np.empty_like(dvalues)
        
        # The loop variable must be spelled `single_dvalues`, the name used
        # in the dot product below
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Flatten output array
            single_output = single_output.reshape(-1,1)
            
            # Calculate Jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            
            # Calculate Sample-wise gradient
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

In [14]:
class Loss:
    """Base class for losses; subclasses supply ``forward``."""

    def calculate(self,output, y):
        """Return the mean of the per-sample losses from ``self.forward``."""
        return np.mean(self.forward(output, y))

In [22]:
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss with forward and backward passes."""

    def forward(self, y_pred, y_true):
        """Return the negative log of each sample's target-class confidence.

        Args:
            y_pred: predicted values, one row per sample.
            y_true: sparse class indices (1D) or one-hot rows (2D).
        """
        n_samples = len(y_pred)
        # Keep predictions inside [1e-7, 1 - 1e-7] so the log below stays finite
        clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        n_dims = len(y_true.shape)
        if n_dims == 1:
            # Sparse labels: pick the predicted value at each target index
            correct_confidences = clipped[range(n_samples), y_true]
        elif n_dims == 2:
            # One-hot labels: mask the predictions and sum each row
            correct_confidences = (clipped * y_true).sum(axis=1)

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Compute the loss gradient and store it as ``self.dinputs``."""
        n_samples = len(dvalues)
        n_labels = len(dvalues[0])

        # Sparse labels are expanded into one-hot rows by indexing an
        # identity matrix with the class indices
        targets = y_true
        if len(targets.shape) == 1:
            targets = np.eye(n_labels)[targets]

        # Gradient, then normalised by the number of samples
        self.dinputs = (-targets / dvalues) / n_samples

In [16]:
class Activation_Softmax_Loss_CategoricalCrossentropy():
    """Softmax activation and categorical cross-entropy loss in one object."""
    
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()
        
    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, keep it as ``self.output``, return the loss."""
        self.activation.forward(inputs)
        self.output = self.activation.output
        return self.loss.calculate(self.output, y_true)
    
    def backward(self, dvalues, y_true):
        """Compute the combined gradient and store it as ``self.dinputs``.

        Args:
            dvalues: softmax outputs, one row per sample. Not modified.
            y_true: target labels, sparse (1D class indices) or one-hot (2D).
        """
        samples = len(dvalues)
        
        # If labels are one-hot encoded turn them into discrete values
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis = 1)
        
        # Work on a copy: the in-place subtraction below would otherwise
        # overwrite the caller's array (for example the stored softmax output)
        self.dinputs = dvalues.copy()
        # Subtract 1 at each sample's target class
        self.dinputs[range(samples), y_true] -= 1
        # Normalize by the number of samples
        self.dinputs = self.dinputs / samples

### Create  Dataset

In [17]:
X, y = spiral_data(samples=100, classes=3)

In [20]:
# Network: 2 input features -> 3 neurons (ReLU) -> 3 outputs (softmax + loss)
dense1 = Layer_Dense(2, 3)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(3, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Forward pass
dense1.forward(X)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
loss = loss_activation.forward(dense2.output, y)

print(loss_activation.output[:5])
print('loss: ', loss)

# Accuracy: share of samples whose highest-scoring class equals the target.
# One-hot targets are converted to class indices first.
predictions = loss_activation.output.argmax(axis=1)
if y.ndim == 2:
    y = y.argmax(axis=1)
accuracy = (predictions == y).mean()
print('acc: ', accuracy)

# Backward pass, in the reverse order of the forward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

# Show the parameter gradients of both dense layers
for layer in (dense1, dense2):
    print(layer.dweights)
    print(layer.dbiases)

epoch: 0acc: 0.297loss: 1.099
epoch: 100acc: 0.333loss: 1.099
epoch: 200acc: 0.333loss: 1.099
epoch: 300acc: 0.333loss: 1.099
epoch: 400acc: 0.333loss: 1.099
epoch: 500acc: 0.333loss: 1.099
epoch: 600acc: 0.333loss: 1.099
epoch: 700acc: 0.333loss: 1.099
epoch: 800acc: 0.333loss: 1.099
epoch: 900acc: 0.333loss: 1.099
epoch: 1000acc: 0.333loss: 1.099
epoch: 1100acc: 0.333loss: 1.099
epoch: 1200acc: 0.333loss: 1.099
epoch: 1300acc: 0.333loss: 1.099
epoch: 1400acc: 0.333loss: 1.099
epoch: 1500acc: 0.333loss: 1.099
epoch: 1600acc: 0.333loss: 1.099
epoch: 1700acc: 0.333loss: 1.099
epoch: 1800acc: 0.333loss: 1.099
epoch: 1900acc: 0.333loss: 1.099
epoch: 2000acc: 0.333loss: 1.099
epoch: 2100acc: 0.333loss: 1.099
epoch: 2200acc: 0.333loss: 1.099
epoch: 2300acc: 0.333loss: 1.099
epoch: 2400acc: 0.333loss: 1.099
epoch: 2500acc: 0.333loss: 1.099
epoch: 2600acc: 0.333loss: 1.099
epoch: 2700acc: 0.333loss: 1.099
epoch: 2800acc: 0.333loss: 1.099
epoch: 2900acc: 0.333loss: 1.099
epoch: 3000acc: 0.333l