In [1]:
import matplotlib.pyplot as plt
import numpy as np
import nnfs
from nnfs.datasets import spiral_data, sine_data

from zipfile import ZipFile
import os
import urllib
import urllib.request
import cv2

import pickle
import copy

In [2]:
URL = 'https://nnfs.io/datasets/fashion_mnist_images.zip'
FILE = 'fashion_mnist_images.zip'
FOLDER = 'fashion_mnist_images'

if not os.path.isfile(FILE):
    print(f'Downloading {URL} and saving as {FILE}...')
    urllib.request.urlretrieve(URL, FILE)

print('Unzipping images...')
with ZipFile(FILE) as zip_images:
    zip_images.extractall(FOLDER)

print('DONE!')

Unzipping images...
DONE!


In [3]:
nnfs.init()

In [4]:
# Dense Layer
class Layer_Dense:

    # Layer initialisation
    def __init__(self, n_inputs, n_neurons, weight_regulariser_l1=0, weight_regulariser_l2=0, bias_regulariser_l1=0, bias_regulariser_l2=0):
        # Initialise the weights and the biases
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
        # Set regularisation strength
        self.weight_regulariser_l1 = weight_regulariser_l1
        self.weight_regulariser_l2 = weight_regulariser_l2
        self.bias_regulariser_l1 = bias_regulariser_l1
        self.bias_regulariser_l2 = bias_regulariser_l2

    # Forward pass
    def forward(self, inputs, training):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from inputs, weights, and biases
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backpropagation
    def backward(self, dvalues):
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Gradients on regularisation, L1 on weights
        if self.weight_regulariser_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.dweights += self.weight_regulariser_l1 * dL1
        # L2 on weights
        if self.weight_regulariser_l2 > 0:
            self.dweights += 2 * self.weight_regulariser_l2 * self.weights

        # L1 on biases
        if self.bias_regulariser_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_regulariser_l1 * dL1
        # L2 on biases
        if self.bias_regulariser_l2 > 0:
            self.dbiases += 2 * self.bias_regulariser_l2 * self.biases

        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)

    # Retrieve layer parameters
    def get_parameters(self):
        return self.weights, self.biases

    # Set weights and biases in the layer instance
    def set_parameters(self, weights, biases):
        self.weights = weights
        self.biases = biases

In [5]:
# Dropout
class Layer_Dropout:

    # Init
    def __init__(self, rate):
        # Store rate, we invert it as for example for dropout of 0.1 we need a success rate of 0.9
        self.rate = 1 - rate

    # Forward pass
    def forward(self, inputs, training):
        # Save input values
        self.inputs = inputs

        # If not in the training mode - return values
        if not training:
            self.output = inputs.copy()
            return
            
        # Generate and save scaled mask
        self.binary_mask = np.random.binomial(1, self.rate, size=inputs.shape) / self.rate
        #Apply mask to output values
        self.output = inputs * self.binary_mask

    # Backwards pass
    def backward(self, dvalues):
        # Gradient on values
        self.dinputs = dvalues * self.binary_mask

In [6]:
# Input "layer"
class Layer_Input:

    # Forward pass
    def forward(self, inputs, training):
        self.output = inputs

In [7]:
# ReLU activation
class Activation_ReLU:

    # Forward pass
    def forward(self, inputs, training):
        # Remember inputs
        self.inputs = inputs
        # Calculate output values from inputs
        self.output = np.maximum(0, inputs)

    # Backpropagation
    def backward(self, dvalues):
        self.dinputs = dvalues.copy()

        # Zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0

    # Calculate predictions for outputs
    def predictions(self, outputs):
        return outputs

In [8]:
# Softmax activation
class Activation_Softmax:

    # Forward pass
    def forward(self, inputs, training):
        # Remember input values
        self.inputs = inputs

        # Get unnormalised probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # Normalise them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        self.output = probabilities

    # Backpropagation
    def backward(self, dvalues):
        # Create an uninitialised array
        self.dinputs = np.empty_like(dvalues)

        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Flatten output array
            single_output = single_output.reshape(-1, 1)
            # Calculate the Jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            # Then calculate the sample-wise gradient and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

    # Calculate predictions for outputs
    def predictions(self, outputs):
        return np.argmax(outputs, axis=1)

In [9]:
# Sigmoid Activation
class Activation_Sigmoid:

    # Forward pass
    def forward(self, inputs, training):
        # Save inputs and calculate outputs
        self.inputs = inputs
        self.output = 1 / (1 + np.exp(-inputs))

    # Backward pass
    def backward(self, dvalues):
        # Derivative - calculates from the output of the sigmoid function
        self.dinputs = dvalues * (1 - self.output) * self.output

    # Calculate predictions for outputs
    def predictions(self, outputs):
        return (outputs > 0.5) * 1

In [10]:
# Linear activation
class Activation_Linear:

    # Forward pass
    def forward(self, inputs, training):
        self.inputs = inputs
        self.output = inputs

    # Backward pass
    def backward(self, dvalues):
        # derivative is 1, 1 * dvalues = dvalues
        self.dinputs = dvalues.copy()

    # Calculate predictions for outputs
    def predictions(self, outputs):
        return outputs

In [11]:
# Stochastic Gradient Optimiser
class Optimiser_SGD:

    # Initialise parameters - set settings, learning_rate=1. is default for this optimiser
    def __init__(self, learning_rate=1.0, decay=0., momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):
        
        # If we use momentum
        if self.momentum:

            # If the layer does not contain momentum arrays, create them filled with zeros
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                # If there is no momentum array for weights, then the array doesn't exist for biases either
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Build weight updates with momentum - take previous updates multiplied by retain factor and update with current gradients
            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            # Build bias updates
            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates

        # Original SGD without momentum
        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        # Update weights and biases using either original or momentum updates
        layer.weights += weight_updates
        layer.biases += bias_updates

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [12]:
class Optimiser_Adagrad:

    # Initialise optimiser - set settings
    def __init__(self, learning_rate=1., decay=0.,  epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays, create them filled with zeros
        if not hasatrr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update cache with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        # Vanilla SGD parameter update + normalisation with square rooted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

In [13]:
# RMSprop optimiser
class Optimiser_RMSprop:

    # Initialise optimiser - set settings
    def __init__(self, learning_rate=0.001, decay=0.,  epsilon=1e-7, rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays, create them filled with zeros
        if not hasatrr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update cache with squared current gradients
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2

        # Vanilla SGD parameter update + normalisation with square rooted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

In [14]:
class Optimiser_Adam:

    # Initialise optimiser - set settings
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    # Call once before any paramter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update params
    def update_params(self, layer):

        # If the layer does not contain cache arrays, create them filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update momentum with current gradients
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        # Get corrected momentum, self.iteration is 0 at first pass and we need it to start with 1
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        # Update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

        # Get corrected cache
        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))

        # OG SGD parameter update + normalisation with square rooted cache
        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [15]:
# Common loss class
class Loss:

    # Regularisation loss calculation
    def regularisation_loss(self):

        # 0 by default
        regularisation_loss = 0

        # Calculate regularisation loss iterated over all trainable layers
        for layer in self.trainable_layers:

            # L1 regularisation - abs(weights)
            # calculate only when factor is greater than 0
            if layer.weight_regulariser_l1 > 0:
                regularisation_loss += layer.weight_regulariser_l1 * np.sum(np.abs(layer.weights))
    
            # L2 regularisaiton - weights ** 2
            if layer.weight_regulariser_l2 > 0:
                regularisation_loss += layer.weight_regulariser_l2 * np.sum(layer.weights * layer.weights)
    
            # L1 regularisation - abs(biases)
            # calculate only when factor is greater than 0
            if layer.bias_regulariser_l1 > 0:
                regularisation_loss += layer.bias_regulariser_l1 * np.sum(np.abs(layer.biases))
    
            # L2 regularisation - biases ** 2
            if layer.bias_regulariser_l2 > 0:
                regularisation_loss += layer.bias_regulariser_l2 * np.sum(layer.biases * layer.biases)

        return regularisation_loss

    # Set/remember trainable layers
    def remember_trainable_layers(self, trainable_layers):
        self.trainable_layers = trainable_layers

    # Calculates the data and regularisation losses given model output and ground truth values
    def calculate(self, output, y, *, include_regularisation=False):

        # Calculate samples losses
        sample_losses = self.forward(output, y)

        # Calculate mean loss
        data_loss = np.mean(sample_losses)

        # Add accumulated sum of losses and sample count
        self.accumulated_sum += np.sum(sample_losses)
        self.accumulated_count += len(sample_losses)

        # If just data loss - return it
        if not include_regularisation:
            return data_loss

        # Return the data and regularisation loss
        return data_loss, self.regularisation_loss()

    # Calculate accumulated loss
    def calculate_accumulated(self, *, include_regularisation=False):
        # Calculate mean loss
        data_loss = self.accumulated_sum / self.accumulated_count

        # If it's just data loss - return it
        if not include_regularisation:
            return data_loss

        # Return
        return data_loss, self.regularisation_loss()

    # Reset variables for accumulated loss
    def new_pass(self):
        self.accumulated_sum = 0
        self.accumulated_count = 0

In [16]:
# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):

    # Forward pass
    def forward(self, y_pred, y_true):

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip data to prevent division by 0
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Probabilities for target values - only if categorical labels
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[
            range(samples),
            y_true
        ]

        # Mask values - only for one-hot encoded labels
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    # Backpropagation
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)

        # Number of labels in every sample, we'll use the first sample to count them
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Calculate gradient
        self.dinputs = -y_true / dvalues
        # Normalise
        self.dinputs = self.dinputs / samples

In [17]:
# Softmax classifier - combined softmax activation and cross-entropy loss for a faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy():

    # Backpropagation
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)

        # Copy
        self.dinputs = dvalues.copy()
        # Calculate gradient
        self.dinputs[range(samples), y_true] -= 1
        # Normalise gradient
        self.dinputs = self.dinputs / samples

In [18]:
# Binary Cross-Entropy Loss
class Loss_BinaryCrossentropy(Loss):

    # Forward pass
    def forward(self, y_pred, y_true):
        # Clip data
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Calculate sample-wise loss
        sample_losses = -(y_true * np.log(y_pred_clipped) + (1 - y_true) * np.log(1 - y_pred_clipped))
        sample_losses = np.mean(sample_losses, axis=-1)

        # Return losses
        return sample_losses

    # Backward pass
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)

        # Number of outputs, used first to count them
        outputs = len(dvalues[0])

        # Clip
        clipped_dvalues = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # Calculate gradient
        self.dinputs = -(y_true / clipped_dvalues - (1 - y_true) / (1 - clipped_dvalues)) / outputs

        # Normalise
        self.dinputs = self.dinputs / samples

In [19]:
# Mean Squared Error Loss
class Loss_MeanSquaredError(Loss):

    # Forward pass
    def forward(self, y_pred, y_true):
        # Calculate loss
        sample_losses = np.mean((y_true - y_pred)**2, axis=-1)

        # Return losses
        return sample_losses

    # Backward pass
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)

        # Number of outputs
        outputs = len(dvalues[0])

        # Gradient on values
        self.dinputs = -2 * (y_true - dvalues) / outputs
        # Normalise
        self.dinputs = self.dinputs / samples

In [20]:
# Mean Absolute Error Loss
class Loss_MeanAbsoluteError(Loss): # L1 loss

    # Forward pass
    def forward(self, y_pred, y_true):
        # Calculate loss
        sample_losses = np.mean(np.abs(y_true - y_pred), axis=-1)

        # Return losses
        return sample_losses

    # Backward pass
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)
        # Number of outputs in every sample
        outputs = len(dvalues[0])

        # Calculate gradient
        self.dinputs = np.sign(y_true - dvalues) / outputs
        # Normalise
        self.dinputs = self.dinputs / samples

In [21]:
# Common accuracy class
class Accuracy:

    # Calculate an accuracy given predictions and groudn truth values
    def calculate(self, predictions, y):
        # Get comparison results
        comparisons = self.compare(predictions, y)

        # Calculate accuracy
        accuracy = np.mean(comparisons)
        
        # Add accumulated sum of losses and sample count
        self.accumulated_sum += np.sum(comparisons)
        self.accumulated_count += len(comparisons)

        # Return accuracy
        return accuracy

    # Calculates accumulated accuracy
    def calculate_accumulated(self):
        # Calculate an accuracy
        accuracy = self.accumulated_sum / self.accumulated_count

        # Return
        return accuracy

    # Reset variables
    def new_pass(self):
        self.accumulated_sum = 0
        self.accumulated_count = 0

In [22]:
# Accuracy calculation for regression model
class Accuracy_Regression(Accuracy):

    def __init__(self):
        # Create precision property
        self.precision = None

    # Calculate precision value based on passed in ground truth
    def init(self, y, reinit=False):
        if self.precision is None or reinit:
            self.precision = np.std(y) / 250

    # Compares predictions to the ground truth values
    def compare(self, predictions, y):
        return np.absolute(predictions - y) < self.precision

In [23]:
# Accuracy calculation for classification model
class Accuracy_Categorical(Accuracy):

    # No initialisation is needed
    def init(self, y):
        pass

    # Compare predictions to the ground truth values
    def compare(self, predictions, y):
        if len(y.shape) == 2:
            y = np.argmax(y, axis=1)
        return predictions == y

In [31]:
# Model class
class Model:

    def __init__(self):
        # Create a list of network objects
        self.layers = []
        # Softmax classifier's output object
        self.softmax_classifier_output = None

    # Add objects to the model
    def add(self, layer):
        self.layers.append(layer)

    # Set loss and optimiser
    def set(self, *, loss=None, optimiser=None, accuracy=None):
        if loss is not None:
            self.loss = loss
        if optimiser is not None:
            self.optimiser = optimiser
        if accuracy is not None:
            self.accuracy = accuracy

    # Finalise the model
    def finalise(self):
        # Create and set the input layer
        self.input_layer = Layer_Input()

        # Count all the objects
        layer_count = len(self.layers)

        # Initialise a list containing trainable layers
        self.trainable_layers = []

        # Iterate the objects
        for i in range(layer_count):
            # If it's the first layer, the previous layer object is the input layer
            if i == 0:
                self.layers[i].prev = self.input_layer
                self.layers[i].next = self.layers[i+1]

            # All layers except for the first and the last
            elif i < layer_count - 1:
                self.layers[i].prev = self.layers[i-1]
                self.layers[i].next = self.layers[i+1]

            # The last layer, next object is the loss
            else:
                self.layers[i].prev = self.layers[i-1]
                self.layers[i].next = self.loss
                self.output_layer_activation = self.layers[i]

            # If layer contains an attribute called weights, it's a trainable layer
            if hasattr(self.layers[i], 'weights'):
                self.trainable_layers.append(self.layers[i])

            # Update loss object with trainable layers
            self.loss.remember_trainable_layers(
                self.trainable_layers
            )

        # If output activation is Softmax and loss function is categorical cross-entropy, create an object
        if isinstance(self.layers[-1], Activation_Softmax) and isinstance(self.loss, Loss_CategoricalCrossentropy):
            # Create an object of combined activation and loss functions
            self.softmax_classifier_output = Activation_Softmax_Loss_CategoricalCrossentropy()

        # Update loss object with trainable layers
        if self.loss is not None:
            self.loss.remember_trainable_layers(self.trainable_layers)

    # Train the model
    def train(self, X, y, *, epochs=1, batch_size=None, print_every=1, validation_data=None):
        # Initialise accuracy object
        self.accuracy.init(y)

        # Default value if batch size is not set
        train_steps = 1

        # If there is validation data passed
        if validation_data is not None:
            validation_steps = 1

            # For better readability
            X_val, y_val = validation_data

        # Calculate number of steps
        if batch_size is not None:
            train_steps = len(X) // batch_size
            # Fix for rounding down
            if train_steps * batch_size < len(X):
                train_steps += 1

            if validation_data is not None:
                validation_steps = len(X_val) // batch_size
                # Fix for rounding
                if validation_steps * batch_size < len(X_val):
                    validation_steps += 1
        
        # Main training loop
        for epoch in range(1, epochs+1):
            # Print epoch number
            print(f'epoch: {epoch}')

            # Reset accumulated values in loss and accuracy objects
            self.loss.new_pass()
            self.accuracy.new_pass()

            # Iterate over steps
            for step in range(train_steps):
                # If batch size is not set
                if batch_size is None:
                    batch_X = X
                    batch_y = y

                else:
                    batch_X = X[step*batch_size:(step+1)*batch_size]
                    batch_y = y[step*batch_size:(step+1)*batch_size]
            
                # Perform the forward pass
                output = self.forward(batch_X, training=True)

                # Calculate loss
                data_loss, regularisation_loss = self.loss.calculate(output, batch_y, include_regularisation=True)
                loss = data_loss + regularisation_loss

                # Get predictions and calculate accuracy
                predictions = self.output_layer_activation.predictions(output)
                accuracy = self.accuracy.calculate(predictions, batch_y)
    
                # Perform backward pass
                self.backward(output, batch_y)
        
                # Optimise (update params)
                self.optimiser.pre_update_params()
                for layer in self.trainable_layers:
                    self.optimiser.update_params(layer)
                self.optimiser.post_update_params()

                # Print a summary
                if not epoch % print_every or step == train_steps - 1:
                    print ( f'step: {step}, ' +
                            f'acc: {accuracy:.3f}, ' +
                            f'loss: {loss:.3f} (' +
                            f'data_loss: {data_loss:.3f}, ' +
                            f'reg_loss: {regularisation_loss:.3f}), ' +
                            f'lr: {self.optimiser.current_learning_rate}' )

        epoch_data_loss, epoch_regularisation_loss = self.loss.calculate_accumulated(include_regularisation=True)
        epoch_loss = epoch_data_loss + epoch_regularisation_loss
        epoch_accuracy = self.accuracy.calculate_accumulated()

        print(f'training, ' +
              f' acc: {epoch_accuracy:.3f} (' +
              f'data_loss: {epoch_data_loss:.3f}, ' +
              f'reg_loss: {epoch_regularisation_loss:.3f}), ' +
              f'lr: {self.optimiser.current_learning_rate}')

        # If there is validation data
        if validation_data is not None:
            # Evaluate the model
            self.evaluate(*validation_data, batch_size=batch_size) # * unpacks validation_data into singular vals

    # Forward pass
    def forward(self, X, training):
        # Call forward method on the input layer
        self.input_layer.forward(X, training)

        # Call forward method of every object in chain
        for layer in self.layers:
            layer.forward(layer.prev.output, training)

        # Return output
        return layer.output

    # Backwards pass
    def backward(self, output, y):
        # If softmax classifier
        if self.softmax_classifier_output is not None:
            # First call backward method on the combined activation/loss
            self.softmax_classifier_output.backward(output, y)
            self.layers[-1].dinputs = self.softmax_classifier_output.dinputs

            # Call backward method going through all the objects but last in reversed order
            for layer in reversed(self.layers[:-1]):
                layer.backward(layer.next.dinputs)
            return
        
        # First call backwards method on the loss
        self.loss.backward(output, y)

        # Call backward method going through the objects in revers
        for layer in reversed(self.layers):
            layer.backward(layer.next.dinputs)

    # Evaluate the model using passed in dataset
    def evaluate(self, X_val, y_val, *, batch_size=None):
        # Default value if batch size is not being set
        validation_steps = 1

        # Calculate number of steps
        if batch_size is not None:
            validation_steps = len(X_val) // batch_size
            # Fix for rounding down
            if validation_steps * batch_size < len(X_val):
                validation_steps += 1

        # Reset accumulated values in loss and accuracy objects
        self.loss.new_pass()
        self.accuracy.new_pass()

        # Iterate over steps
        for step in range(validation_steps):
            # If batch size is not set
            if batch_size is None:
                batch_X = X_val
                batch_y = y_val
            else:
                batch_X = X_val[step*batch_size:(step+1)*batch_size]
                batch_y = y_val[step*batch_size:(step+1)*batch_size]

            # Perform the forward pass
            output = self.forward(batch_X, training=False)

            # Calculate loss
            loss = self.loss.calculate(output, batch_y)

            # Get predictions and calculate an accuracy
            predictions = self.output_layer_activation.predictions(output)
            accuracy = self.accuracy.calculate(predictions, batch_y)

        # Get and print validation loss and accuracy
        validation_loss = self.loss.calculate_accumulated()
        validation_accuracy = self.accuracy.calculate_accumulated()

        # Print
        print(f'validation, ' +
              f'acc: {validation_accuracy:.3f}, ' +
              f'loss: {validation_loss:.3f}')

    # Retrieves and returns parameters of trainable layers
    def get_parameters(self):
        # Create a list for parameters
        parameters = []

        # Iterable trainable layers and get their parameters
        for layer in self.trainable_layers:
            parameters.append(layer.get_parameters())

        # Return the list
        return parameters

    # Update the model with new parameters
    def set_parameters(self, parameters):
        # Iterate over the parameters and layers and update each with each set of the parameters
        for parameter_set, layer in zip(parameters, self.trainable_layers):
            layer.set_parameters(*parameter_set)

    # Save the parameters to a file
    def save_parameters(self, path):
        # Open a file in the binary-write mode and save parameters to it
        with open(path, 'wb') as f:
            pickle.dump(self.get_parameters(), f)

    # Loads the weights and udpates a model instance with them
    def load_parameters(self, path):
        # Open file in the binary-read mode
        with open(path, 'rb') as f:
            self.set_parameters(pickle.load(f))

    # Save the model
    def save(self, path):
        # Make a deep copy of current model instance
        model = copy.deepcopy(self)

        # Reset accumulated values in loss and accuracy objects
        model.loss.new_pass()
        model.accuracy.new_pass()

        # Remove data from layers and gradients from the loss object
        model.input_layer.__dict__.pop('output', None)
        model.loss.__dict__.pop('dinputs', None)

        # For each layer remove inputs, outputs, and dinput properties
        for layer in model.layers:
            for property in ['inputs', 'output', 'dinputs', 'dweights', 'dbiases']:
                layer.__dict__.pop(property, None)

        # Open a file in the binary-write mode and save the model
        with open(path, 'wb') as f:
            pickle.dump(model, f)

    # Load and return a model
    @staticmethod
    def load(path):
        # Open file in the binary-read mode, load a model
        with open(path, 'rb') as f:
            model = pickle.load(f)

        # Return a model
        return model

    # Predicts on the samples
    def predict(self, X, *, batch_size=None):
        # Default value if batch size is not set
        prediction_steps = 1

        # Calculate number of steps
        if batch_size is not None:
            prediction_steps = len(X) // batch_size
            # Fix for rounding down
            if prediction_steps * batch_size < len(X):
                prediction_steps += 1

        # Model outputs
        output = []

        # Iterate over steps
        for step in range(prediction_steps):
            # If batch size is not set
            if batch_size is None:
                batch_X = X

            # Otherwise slice a batch
            else:
                batch_X = X[step*batch_size:(step+1)*batch_size]

            # Perform the forward pass
            batch_output = self.forward(batch_X, training=False)

            # Append the batch prediction to the list of predictions
            output.append(batch_output)

        # Stack and return results
        return np.vstack(output)

# Loading & Creating MNIST Data

In [25]:
# Loads an MNIST dataset
def load_mnist_dataset(dataset, path: str):
    # Scan all the directories and create a list of labels
    labels = os.listdir(os.path.join(path, dataset))
    
    # Create lists for samples and labels
    X = list()
    y = list()
    
    # For each label folder
    for label in labels:
        # And for each image
        images = os.listdir(os.path.join(path, dataset, label))
        images = list(filter(lambda k: '.png' in k, images))

        for file in images:
            # Read the image
            image = cv2.imread(os.path.join(path, dataset, label, file), cv2.IMREAD_UNCHANGED)
    
            # Append it and label to the lists
            X.append(image)
            y.append(label)
    
    return np.array(X), np.array(y).astype('uint8')

In [26]:
# MNIST dataset (train + test)
def create_data_mnist(path):
    # Load both sets separately
    X, y = load_mnist_dataset('train', path)
    X_test, y_test = load_mnist_dataset('test', path)

    # Return data
    return X, y, X_test, y_test

In [27]:
# Create dataset
X, y, X_test, y_test = create_data_mnist('fashion_mnist_images')

In [28]:
# Shuffle data around to not bias
keys = np.array(range(X.shape[0]))
np.random.shuffle(keys)
X = X[keys]
y = y[keys]

# Reshape to vectors
X = X.reshape(X.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# Scale features
X = (X.astype(np.float32) - 127.5) / 127.5
X_test = (X_test.astype(np.float32) - 127.5) / 127.5

In [29]:
# Instantiate the model
model = Model()

# Add layers
model.add(Layer_Dense(X.shape[1], 128))
model.add(Activation_ReLU())
model.add(Layer_Dense(128, 128))
model.add(Activation_ReLU())
model.add(Layer_Dense(128, 10))
model.add(Activation_Softmax())

# Set loss, optimiser, and accuracy objects
model.set(
    loss=Loss_CategoricalCrossentropy(),
    optimiser=Optimiser_Adam(decay=1e-4),
    accuracy=Accuracy_Categorical()
)

# Finalise
model.finalise()

# Train!
model.train(X, y, validation_data=(X_test, y_test), epochs=10, batch_size=128, print_every=100)

# Get params
parameters = model.get_parameters()

# Save model
model.save('fashion_mnist.model')

epoch: 1
step: 782, acc: 0.826, loss: 0.615 (data_loss: 0.615, reg_loss: 0.000), lr: 0.0009274717121127806
epoch: 2
step: 782, acc: 0.826, loss: 0.533 (data_loss: 0.533, reg_loss: 0.000), lr: 0.0008646779074794639
epoch: 3
step: 782, acc: 0.826, loss: 0.480 (data_loss: 0.480, reg_loss: 0.000), lr: 0.0008098477486232589
epoch: 4
step: 782, acc: 0.826, loss: 0.450 (data_loss: 0.450, reg_loss: 0.000), lr: 0.0007615566217348261
epoch: 5
step: 782, acc: 0.826, loss: 0.381 (data_loss: 0.381, reg_loss: 0.000), lr: 0.0007187005893344832
epoch: 6
step: 782, acc: 0.826, loss: 0.313 (data_loss: 0.313, reg_loss: 0.000), lr: 0.0006804109682248077
epoch: 7
step: 782, acc: 0.870, loss: 0.240 (data_loss: 0.240, reg_loss: 0.000), lr: 0.0006459948320413437
epoch: 8
step: 782, acc: 0.913, loss: 0.193 (data_loss: 0.193, reg_loss: 0.000), lr: 0.0006148927012236365
epoch: 9
step: 782, acc: 0.913, loss: 0.157 (data_loss: 0.157, reg_loss: 0.000), lr: 0.0005866478939340607
epoch: 10
step: 782, acc: 0.913, loss

In [36]:
fashion_mnist_labels = {
    0: 'T-shirt/top',
    1: 'Trouser',
    2: 'Pullover',
    3: 'Dress',
    4: 'Coat',
    5: 'Sandal',
    6: 'Shirt',
    7: 'Sneaker',
    8: 'Bag',
    9: 'Ankle boot'
}

In [37]:
X, y, X_test, y_test = create_data_mnist('fashion_mnist_images')

# Shuffle the training dataset
keys = np.array(range(X.shape[0]))
np.random.shuffle(keys)
X = X[keys]
y = y[keys]

# Scale and reshape samples
X = (X.reshape(X.shape[0], -1).astype(np.float32) - 127.5) / 127.5
X_test = (X_test.reshape(X_test.shape[0], -1).astype(np.float32) - 127.5) / 127.5

# Load the model
model = Model.load('fashion_mnist.model')

# Predict on the first 5 samples from validation dataset
confidences = model.predict(X_test[:5])
predictions = model.output_layer_activation.predictions(confidences)
for prediction in predictions:
    print(fashion_mnist_labels[prediction])

# Evaluate the model
model.evaluate(X_test, y_test)

Ankle boot
Ankle boot
Ankle boot
Ankle boot
Ankle boot
validation, acc: 0.881, loss: 0.356


#### Random tshirt

In [41]:
# Read image
image_data = cv2.imread('tshirt.png', cv2.IMREAD_GRAYSCALE)

# Resize to fit the same as Fashion MNIST
image_data = cv2.resize(image_data, (28, 28))

# Invert image colours
image_data = 255 - image_data

# Reshape and scale pixel data
image_data = (image_data.reshape(1, -1).astype(np.float32) - 127.5) / 127.5

# Load the model
model = Model.load('fashion_mnist.model')

# Predict on the image
confidences = model.predict(image_data)

# Get prediction instead of confidence levels
predictions = model.output_layer_activation.predictions(confidences)

# Get label name from label index
prediction = fashion_mnist_labels[predictions[0]]

print(prediction)

T-shirt/top


#### Random pants

In [42]:
# Read image
image_data = cv2.imread('pants.png', cv2.IMREAD_GRAYSCALE)

# Resize to fit the same as Fashion MNIST
image_data = cv2.resize(image_data, (28, 28))

# Invert image colours
image_data = 255 - image_data

# Reshape and scale pixel data
image_data = (image_data.reshape(1, -1).astype(np.float32) - 127.5) / 127.5

# Load the model
model = Model.load('fashion_mnist.model')

# Predict on the image
confidences = model.predict(image_data)

# Get prediction instead of confidence levels
predictions = model.output_layer_activation.predictions(confidences)

# Get label name from label index
prediction = fashion_mnist_labels[predictions[0]]

print(prediction)

Trouser
