This function will take a list of dictionaries and convert that structure into an initial neural network with He weight initialization.

In [23]:
import numpy as np

def init_layers(nn_architecture, seed = 42):
    np.random.seed(seed)
    params_values = {}

    # Take the list of dictionaries and construct a NN with randomly assigned weights and bias vectors.
    # Layers are given small random numbers as their inital state so as not to encounter the breaking symmetry problem if all given the same values and it provides a good enough starting point.
    # Using small values increases the efficiency of the algorithm during the first iterations.
    for i in range(1, len(nn_architecture)):
        layer_input_size = nn_architecture[i-1]["input_nodes"]

        if (i == len(nn_architecture)-1):
            layer_output_size = 10
        else:
            layer_output_size = nn_architecture[i]["input_nodes"]
        
        # Initialize layer node's weights using He Weight Initialization. This is suitable as we are using ReLU activation functions.
        # Source: https://datascience-enthusiast.com/DL/Improving-DeepNeural-Networks-Initialization.html
        # This stage is important as incorrect weight initialization can lead to vanishing/exploding gradients.
        params_values['W' + str(i)] = np.random.randn(
            layer_output_size, layer_input_size) * np.sqrt(2.0 / layer_input_size)
        
        
    return params_values

Sigmoid, ReLU, and Softmax activation functions for both forward popagation and backward (deriative) propagation

In [24]:
# Activation Function - Sigmoid - Forward Propagation
def sigmoid(Z):
    return 1.0/(1.0+np.exp(-Z))

# Activation Function - ReLU - Forward Propagation
def relu(Z):
    return np.maximum(0,Z)

# Activation Function - Softmax - Forward Propagation - Avoids overflow or underflow due to floating point instability.
def softmax(Z):
    e = np.exp(Z - np.max(Z))
    return e / np.sum(e, axis = 0)


# Activation Function - Softmax - Backward Propagation - Avoids overflow or underflow due to floating point instability.
def softmax_backward(Z):
    smax = softmax(Z)
    return smax * (1 - smax)

# Activation Function - Sigmoid - Backward Propagation
def sigmoid_backward(Z):
    sig = sigmoid(Z)
    # return (np.exp(-Z))/((np.exp(-Z)+1)**2)
    return sig * (1-sig)

# Activation Function - ReLU - Backward Propagation
def relu_backward(Z):
    dZ = np.array(Z, copy = True)
    dZ[Z <= 0] = 0
    dZ[Z > 0] = 1
    return dZ

Forward Propagation is split into two functions, single layer step-forward and entire NN step forward.


In [25]:
def single_layer_forward_propagation(A_prev, W_curr, activation="relu"):

    if activation == "relu":
        activation_func = relu
    elif activation == "sigmoid":
        activation_func = sigmoid
    elif activation == "softmax":
        activation_func = softmax
    else:
        raise Exception('Non-supported activation function')

    # Compute error.
    Z_curr = np.dot(W_curr, A_prev)
        
    return activation_func(Z_curr), Z_curr

# X - Input Matrix
# Will perform a full forward step propagation and organize all intermdeite values returned from each step forward of a layer.
def full_forward_propagation(X, params_values, nn_architecture):
    memory = {}
    A_curr = X
    memory["A0"] = A_curr
    
    for idx in range(len(nn_architecture)):
        # Don't evaluate the output layer
        if (idx == len(nn_architecture)-1): continue
        layer_idx = idx + 1
        A_prev = A_curr
        
        # Get current layer's activation function
        activ_function_curr = nn_architecture[layer_idx]["activation"]
        W_curr = params_values["W" + str(layer_idx)]
        A_curr, Z_curr = single_layer_forward_propagation(A_prev, W_curr, activ_function_curr)
        
        memory["A" + str(layer_idx)] = A_curr
        memory["Z" + str(layer_idx)] = Z_curr
    
    # Return A value of the output layer and computed values of all layers.
    return A_curr, memory

Backward Propagation is split into two functions, single layer step-backward and entire NN step backward.

In [26]:
def single_layer_backward_propagation(y, W_curr, Z_curr, A_prev, Y, is_output_layer, error, activation="relu"):
    if activation == "relu":
        backward_activation_func = relu_backward
    elif activation == "sigmoid":
        backward_activation_func = sigmoid_backward
    elif activation == "softmax":
        backward_activation_func = softmax_backward
    else:
        raise Exception('Non-supported activation function')
    
    dZ_curr = backward_activation_func(Z_curr)
    error_cal = None

    # Error of the output layer
    if is_output_layer:
        error_cal = 2 * (y - Y) / y.shape[0] * dZ_curr
    # Error of the hidden layers
    else:
        error_cal = np.dot(W_curr.T, error) * dZ_curr
    
    dW_curr = np.dot(error_cal, A_prev.T)
    return dW_curr, error_cal

def full_backward_propagation(Y_hat, Y, memory, params_values, nn_architecture):
    grads_values = {}
    error = 0
    
    for layer_idx_curr, layer in reversed(list(enumerate(nn_architecture))):
        # Don't calculate the weight's gradient values for the input layer.
        if (layer_idx_curr == 0): continue
        
        layer_idx_prev = layer_idx_curr - 1
        activ_function_curr = layer["activation"]
        
        A_prev = memory["A" + str(layer_idx_prev)]
        Z_curr = memory["Z" + str(layer_idx_curr)]
        W_curr = params_values["W" + str(layer_idx_curr+1)] if layer_idx_curr != len(nn_architecture)-1 else None
        
        is_output_layer = True if layer_idx_curr == len(nn_architecture)-1 else False
        dW_curr, error = single_layer_backward_propagation(
            Y_hat, W_curr, Z_curr, A_prev, Y, is_output_layer, error, activ_function_curr)
        
        grads_values["dW" + str(layer_idx_curr)] = dW_curr
    
    return grads_values

Stochastic Gradient Descent

In [27]:
# Gradient descent optimization.
def update(params_values, grads_values, nn_architecture, learning_rate):
    for layer_idx, layer in enumerate(nn_architecture):
        if (layer_idx == len(nn_architecture)-1): continue
        params_values["W" + str(layer_idx+1)] -= learning_rate * grads_values["dW" + str(layer_idx+1)]

    return params_values

For a dataset, which is a multi-class clasification problem, it is best suited to use a softmax activation function for the output layer as well as cross entropy to compute the loss value for a particular predicted value.

In [28]:
# Loss Function - Calculates categorical cross entropy
# Cross entropy will provide a score for each value passed in. This score represents the distance from the actual value/label.
# As this is logarithmic, small differences are given small scores and large differences are given enormous scores.
# These scores are then used to penalize the probability.

# An alternative, more efficeint function of this idea is scikit-learn's log_loss() function.
def get_metrics(X_test, y_test, params_values, nn_architecture):
    predictions_accuracy = np.array([])
    predictions_cost = np.array([])
    
    for X, y in zip(X_test, y_test):
        output, memory = full_forward_propagation(X, params_values, nn_architecture)
        predictions_accuracy = np.append(predictions_accuracy, np.argmax(output) == np.argmax(y))

        for idx, val in enumerate(y):
            # A very small value of 1e-15 is added to the predicted probability to prevent ever calculating the log of 0. 
            predictions_cost = np.append(predictions_cost, val * np.log(1e-15 + output[idx]))
            
    return np.mean(predictions_accuracy), -np.sum(predictions_cost) / y_test.shape[0]

This will take the target and features and create a mini-batch of the total data based on batch_size.

In [None]:
# Taken from this post: https://stackoverflow.com/a/54647545/10439539
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert inputs.shape[0] == targets.shape[0]
    if shuffle:
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)
    for start_idx in range(0, inputs.shape[0], batchsize):
        end_idx = min(start_idx + batchsize, inputs.shape[0])
        if shuffle:
            excerpt = indices[start_idx:end_idx]
        else:
            excerpt = slice(start_idx, end_idx)
        yield inputs[excerpt], targets[excerpt]

Function to train the NN using the forward and backward propagation functions.

In [None]:
import time
def train(X_train, y_train, X_test, y_test, nn_architecture, batch_size, epochs, learning_rate, seed):
    # Create neural network based on specified architecture with initial weights.
    params_values = init_layers(nn_architecture, seed)
    metrics = [[], []]
    
    # Timer for time metrics.
    start_time = time.time()
    for i in range(epochs):
        for batch in iterate_minibatches(X_train, y_train, batch_size, seed, shuffle=True):
            X_batch, y_batch = batch
            Y_hat, memory = full_forward_propagation(X_batch.T, params_values, nn_architecture)
            
            grads_values = full_backward_propagation(Y_hat, y_batch.T, memory, params_values, nn_architecture)
            params_values = update(params_values, grads_values, nn_architecture, learning_rate)

        accuracy, loss = get_metrics(X_test, y_test, params_values, nn_architecture)
        metrics[0].append(accuracy)
        metrics[1].append(loss)

        print('Epoch: {0}, Total Time Spent: {1:.2f}s, Accuracy: {2:.2f}%, Loss: {3:.3f}'.format(
                i+1, time.time() - start_time, accuracy * 100, loss
            ))
        
    return params_values, metrics

In [29]:
# Setup the initialization array which outlines the architecture of the NN and will be passed into the program to generate the appropraite NN.
nn_architecture = [
    {"input_nodes": 784, "activation": "relu"},
    {"input_nodes": 128, "activation": "relu"},
    {"input_nodes": 64, "activation": "relu"},
    {"input_nodes": 10, "activation": "softmax"},
]

import pandas as pd

df_test = pd.read_csv('./Dataset/fashion-mnist_test.csv')
df_train = pd.read_csv('./Dataset/fashion-mnist_train.csv')

y_train = np.array(df_train['label'].to_numpy())
df_train = df_train.drop('label', 1)

# Normalziing the pixel data
X_train = np.array((df_train.to_numpy() / 255).astype('float32'))

y_test = np.array(df_test['label'].to_numpy())
df_test = df_test.drop('label', 1)

# Normalziing the pixel data
X_test = np.array((df_test.to_numpy() / 255).astype('float32'))

# One Hot Encoding the labels
y_train = np.eye(10)[y_train]
y_test = np.eye(10)[y_test]

params_values, metrics_history = train(X_train, y_train, X_test, y_test, nn_architecture, 128, 10, 0.005, 42)

  df_train = df_train.drop('label', 1)


{'W1': array([[ 0.02508785, -0.0069834 ,  0.03271321, ..., -0.06744508,
         0.01920289,  0.03083924],
       [ 0.02827369,  0.05458767,  0.04211943, ..., -0.01420919,
         0.00338354,  0.02605887],
       [-0.07892048, -0.0267212 ,  0.04011642, ..., -0.01979963,
         0.05353487,  0.03116351],
       ...,
       [-0.01520984, -0.02996139,  0.07425277, ...,  0.08622519,
        -0.0071112 ,  0.04780936],
       [ 0.04152834,  0.03256375, -0.01150627, ..., -0.08783793,
        -0.01636675, -0.0307535 ],
       [ 0.08485768, -0.00966599, -0.03247982, ...,  0.06209712,
         0.08136642,  0.07013439]]), 'W2': array([[-0.01002943,  0.095423  ,  0.13284665, ...,  0.13962699,
         0.12200584,  0.08994849],
       [-0.04016608, -0.00100266, -0.15665874, ..., -0.08438804,
        -0.06913039, -0.23780935],
       [-0.01403963, -0.08706934, -0.08593643, ..., -0.07263895,
        -0.04598236,  0.10144271],
       ...,
       [-0.15393538,  0.04641696, -0.28507112, ...,  0.000537

  df_test = df_test.drop('label', 1)
