In [1]:
import matplotlib.pyplot as plt
import numpy as np
import nnfs
from nnfs.datasets import spiral_data, sine_data

In [2]:
nnfs.init()

In [27]:
# Dense Layer
class Layer_Dense:

    # Layer initialisation
    def __init__(self, n_inputs, n_neurons, weight_regulariser_l1=0, weight_regulariser_l2=0, bias_regulariser_l1=0, bias_regulariser_l2=0):
        # Initialise the weights and the biases
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
        # Set regularisation strength
        self.weight_regulariser_l1 = weight_regulariser_l1
        self.weight_regulariser_l2 = weight_regulariser_l2
        self.bias_regulariser_l1 = bias_regulariser_l1
        self.bias_regulariser_l2 = bias_regulariser_l2

    # Forward pass
    def forward(self, inputs, training):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from inputs, weights, and biases
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backpropagation
    def backward(self, dvalues):
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Gradients on regularisation, L1 on weights
        if self.weight_regulariser_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.dweights += self.weight_regulariser_l1 * dL1
        # L2 on weights
        if self.weight_regulariser_l2 > 0:
            self.dweights += 2 * self.weight_regulariser_l2 * self.weights

        # L1 on biases
        if self.bias_regulariser_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_regulariser_l1 * dL1
        # L2 on biases
        if self.bias_regulariser_l2 > 0:
            self.dbiases += 2 * self.bias_regulariser_l2 * self.biases

        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)

In [4]:
# Dropout
class Layer_Dropout:

    # Init
    def __init__(self, rate):
        # Store rate, we invert it as for example for dropout of 0.1 we need a success rate of 0.9
        self.rate = 1 - rate

    # Forward pass
    def forward(self, inputs, training):
        # Save input values
        self.inputs = inputs

        # If not in the training mode - return values
        if not training:
            self.output = inputs.copy()
            return
            
        # Generate and save scaled mask
        self.binary_mask = np.random.binomial(1, self.rate, size=inputs.shape) / self.rate
        #Apply mask to output values
        self.output = inputs * self.binary_mask

    # Backwards pass
    def backward(self, dvalues):
        # Gradient on values
        self.dinputs = dvalues * self.binary_mask

In [5]:
# Input "layer"
class Layer_Input:

    # Forward pass
    def forward(self, inputs, training):
        self.output = inputs

In [6]:
# ReLU activation
class Activation_ReLU:

    # Forward pass
    def forward(self, inputs, training):
        # Remember inputs
        self.inputs = inputs
        # Calculate output values from inputs
        self.output = np.maximum(0, inputs)

    # Backpropagation
    def backward(self, dvalues):
        self.dinputs = dvalues.copy()

        # Zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0

    # Calculate predictions for outputs
    def predictions(self, outputs):
        return outputs

In [7]:
# Softmax activation
class Activation_Softmax:

    # Forward pass
    def forward(self, inputs, training):
        # Remember input values
        self.inputs = inputs

        # Get unnormalised probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # Normalise them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        self.output = probabilities

    # Backpropagation
    def backward(self, dvalues):
        # Create an uninitialised array
        self.dinputs = np.empty_like(dvalues)

        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Flatten output array
            single_output = single_output.reshape(-1, 1)
            # Calculate the Jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            # Then calculate the sample-wise gradient and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

    # Calculate predictions for outputs
    def predictions(self, outputs):
        return np.argmax(outputs, axis=1)

In [8]:
# Sigmoid Activation
class Activation_Sigmoid:

    # Forward pass
    def forward(self, inputs, training):
        # Save inputs and calculate outputs
        self.inputs = inputs
        self.output = 1 / (1 + np.exp(-inputs))

    # Backward pass
    def backward(self, dvalues):
        # Derivative - calculates from the output of the sigmoid function
        self.dinputs = dvalues * (1 - self.output) * self.output

    # Calculate predictions for outputs
    def predictions(self, outputs):
        return (outputs > 0.5) * 1

In [9]:
# Linear activation
class Activation_Linear:

    # Forward pass
    def forward(self, inputs, training):
        self.inputs = inputs
        self.output = inputs

    # Backward pass
    def backward(self, dvalues):
        # derivative is 1, 1 * dvalues = dvalues
        self.dinputs = dvalues.copy()

    # Calculate predictions for outputs
    def predictions(self, outputs):
        return outputs

In [10]:
# Stochastic Gradient Optimiser
class Optimiser_SGD:

    # Initialise parameters - set settings, learning_rate=1. is default for this optimiser
    def __init__(self, learning_rate=1.0, decay=0., momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):
        
        # If we use momentum
        if self.momentum:

            # If the layer does not contain momentum arrays, create them filled with zeros
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                # If there is no momentum array for weights, then the array doesn't exist for biases either
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Build weight updates with momentum - take previous updates multiplied by retain factor and update with current gradients
            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            # Build bias updates
            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates

        # Original SGD without momentum
        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        # Update weights and biases using either original or momentum updates
        layer.weights += weight_updates
        layer.biases += bias_updates

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [11]:
class Optimiser_Adagrad:

    # Initialise optimiser - set settings
    def __init__(self, learning_rate=1., decay=0.,  epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays, create them filled with zeros
        if not hasatrr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update cache with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        # Vanilla SGD parameter update + normalisation with square rooted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

In [12]:
# RMSprop optimiser
class Optimiser_RMSprop:

    # Initialise optimiser - set settings
    def __init__(self, learning_rate=0.001, decay=0.,  epsilon=1e-7, rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays, create them filled with zeros
        if not hasatrr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update cache with squared current gradients
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2

        # Vanilla SGD parameter update + normalisation with square rooted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

In [13]:
class Optimiser_Adam:

    # Initialise optimiser - set settings
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    # Call once before any paramter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update params
    def update_params(self, layer):

        # If the layer does not contain cache arrays, create them filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update momentum with current gradients
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        # Get corrected momentum, self.iteration is 0 at first pass and we need it to start with 1
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        # Update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

        # Get corrected cache
        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))

        # OG SGD parameter update + normalisation with square rooted cache
        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [14]:
# Common loss class
class Loss:

    # Regularisation loss calculation
    def regularisation_loss(self):

        # 0 by default
        regularisation_loss = 0

        # Calculate regularisation loss iterated over all trainable layers
        for layer in self.trainable_layers:

            # L1 regularisation - abs(weights)
            # calculate only when factor is greater than 0
            if layer.weight_regulariser_l1 > 0:
                regularisation_loss += layer.weight_regulariser_l1 * np.sum(np.abs(layer.weights))
    
            # L2 regularisaiton - weights ** 2
            if layer.weight_regulariser_l2 > 0:
                regularisation_loss += layer.weight_regulariser_l2 * np.sum(layer.weights * layer.weights)
    
            # L1 regularisation - abs(biases)
            # calculate only when factor is greater than 0
            if layer.bias_regulariser_l1 > 0:
                regularisation_loss += layer.bias_regulariser_l1 * np.sum(np.abs(layer.biases))
    
            # L2 regularisation - biases ** 2
            if layer.bias_regulariser_l2 > 0:
                regularisation_loss += layer.bias_regulariser_l2 * np.sum(layer.biases * layer.biases)

        return regularisation_loss

    # Set/remember trainable layers
    def remember_trainable_layers(self, trainable_layers):
        self.trainable_layers = trainable_layers

    # Calculates the data and regularisation losses given model output and ground truth values
    def calculate(self, output, y, *, include_regularisation=False):

        # Calculate samples losses
        sample_losses = self.forward(output, y)

        # Calculate mean loss
        data_loss = np.mean(sample_losses)

        # If just data loss - return it
        if not include_regularisation:
            return data_loss

        # Return the data and regularisation loss
        return data_loss, self.regularisation_loss()

In [15]:
# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):

    # Forward pass
    def forward(self, y_pred, y_true):

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip data to prevent division by 0
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Probabilities for target values - only if categorical labels
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[
            range(samples),
            y_true
        ]

        # Mask values - only for one-hot encoded labels
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    # Backpropagation
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)

        # Number of labels in every sample, we'll use the first sample to count them
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Calculate gradient
        self.dinputs = -y_true / dvalues
        # Normalise
        self.dinputs = self.dinputs / samples

In [25]:
# Softmax classifier - combined softmax activation and cross-entropy loss for a faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy():

    # Backpropagation
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)

        # Copy
        self.dinputs = dvalues.copy()
        # Calculate gradient
        self.dinputs[range(samples), y_true] -= 1
        # Normalise gradient
        self.dinputs = self.dinputs / samples

In [17]:
# Binary Cross-Entropy Loss
class Loss_BinaryCrossentropy(Loss):

    # Forward pass
    def forward(self, y_pred, y_true):
        # Clip data
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Calculate sample-wise loss
        sample_losses = -(y_true * np.log(y_pred_clipped) + (1 - y_true) * np.log(1 - y_pred_clipped))
        sample_losses = np.mean(sample_losses, axis=-1)

        # Return losses
        return sample_losses

    # Backward pass
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)

        # Number of outputs, used first to count them
        outputs = len(dvalues[0])

        # Clip
        clipped_dvalues = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # Calculate gradient
        self.dinputs = -(y_true / clipped_dvalues - (1 - y_true) / (1 - clipped_dvalues)) / outputs

        # Normalise
        self.dinputs = self.dinputs / samples

In [18]:
# Mean Squared Error Loss
class Loss_MeanSquaredError(Loss):

    # Forward pass
    def forward(self, y_pred, y_true):
        # Calculate loss
        sample_losses = np.mean((y_true - y_pred)**2, axis=-1)

        # Return losses
        return sample_losses

    # Backward pass
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)

        # Number of outputs
        outputs = len(dvalues[0])

        # Gradient on values
        self.dinputs = -2 * (y_true - dvalues) / outputs
        # Normalise
        self.dinputs = self.dinputs / samples

In [19]:
# Mean Absolute Error Loss
class Loss_MeanAbsoluteError(Loss): # L1 loss

    # Forward pass
    def forward(self, y_pred, y_true):
        # Calculate loss
        sample_losses = np.mean(np.abs(y_true - y_pred), axis=-1)

        # Return losses
        return sample_losses

    # Backward pass
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)
        # Number of outputs in every sample
        outputs = len(dvalues[0])

        # Calculate gradient
        self.dinputs = np.sign(y_true - dvalues) / outputs
        # Normalise
        self.dinputs = self.dinputs / samples

In [20]:
# Common accuracy class
class Accuracy:

    # Calculate an accuracy given predictions and groudn truth values
    def calculate(self, predictions, y):
        # Get comparison results
        comparisons = self.compare(predictions, y)

        # Calculate accuracy
        accuracy = np.mean(comparisons)

        # Return accuracy
        return accuracy

In [21]:
# Accuracy calculation for regression model
class Accuracy_Regression(Accuracy):

    def __init__(self):
        # Create precision property
        self.precision = None

    # Calculate precision value based on passed in ground truth
    def init(self, y, reinit=False):
        if self.precision is None or reinit:
            self.precision = np.std(y) / 250

    # Compares predictions to the ground truth values
    def compare(self, predictions, y):
        return np.absolute(predictions - y) < self.precision

In [22]:
# Accuracy calculation for classification model
class Accuracy_Categorical(Accuracy):

    # No initialisation is needed
    def init(self, y):
        pass

    # Compare predictions to the ground truth values
    def compare(self, predictions, y):
        if len(y.shape) == 2:
            y = np.argmax(y, axis=1)
        return predictions == y

In [23]:
# Model class
class Model:

    def __init__(self):
        # Create a list of network objects
        self.layers = []
        # Softmax classifier's output object
        self.softmax_classifier_output = None

    # Add objects to the model
    def add(self, layer):
        self.layers.append(layer)

    # Set loss and optimiser
    def set(self, *, loss, optimiser, accuracy):
        self.loss = loss
        self.optimiser = optimiser
        self.accuracy = accuracy

    # Finalise the model
    def finalise(self):
        # Create and set the input layer
        self.input_layer = Layer_Input()

        # Count all the objects
        layer_count = len(self.layers)

        # Initialise a list containing trainable layers
        self.trainable_layers = []

        # Iterate the objects
        for i in range(layer_count):
            # If it's the first layer, the previous layer object is the input layer
            if i == 0:
                self.layers[i].prev = self.input_layer
                self.layers[i].next = self.layers[i+1]

            # All layers except for the first and the last
            elif i < layer_count - 1:
                self.layers[i].prev = self.layers[i-1]
                self.layers[i].next = self.layers[i+1]

            # The last layer, next object is the loss
            else:
                self.layers[i].prev = self.layers[i-1]
                self.layers[i].next = self.loss
                self.output_layer_activation = self.layers[i]

            # If layer contains an attribute called weights, it's a trainable layer
            if hasattr(self.layers[i], 'weights'):
                self.trainable_layers.append(self.layers[i])

            # Update loss object with trainable layers
            self.loss.remember_trainable_layers(
                self.trainable_layers
            )

        # If output activation is Softmax and loss function is categorical cross-entropy, create an object
        if isinstance(self.layers[-1], Activation_Softmax) and isinstance(self.loss, Loss_CategoricalCrossentropy):
            # Create an object of combined activation and loss functions
            self.softmax_classifier_output = Activation_Softmax_Loss_CategoricalCrossentropy()

    # Train the model
    def train(self, X, y, *, epochs=1, print_every=1, validation_data=None):
        # Initialise accuracy object
        self.accuracy.init(y)
        
        # Main training loop
        for epoch in range(1, epochs+1):
            # Perform the forward pass
            output = self.forward(X, training=True)

            # Calculate loss
            data_loss, regularisation_loss = self.loss.calculate(output, y, include_regularisation=True)
            loss = data_loss + regularisation_loss

            # Get predictions and calculate accuracy
            predictions = self.output_layer_activation.predictions(output)
            accuracy = self.accuracy.calculate(predictions, y)

            # Perform backward pass
            self.backward(output, y)
    
            # Optimise (update params)
            self.optimiser.pre_update_params()
            for layer in self.trainable_layers:
                self.optimiser.update_params(layer)
            self.optimiser.post_update_params()

            # Print a summary
            if not epoch % print_every:
                print ( f'epoch: {epoch}, ' +
                        f'acc: {accuracy:.3f}, ' +
                        f'loss: {loss:.3f} (' +
                        f'data_loss: {data_loss:.3f}, ' +
                        f'reg_loss: {regularisation_loss:.3f}), ' +
                        f'lr: {self.optimiser.current_learning_rate}' )

        if validation_data is not None:
            # For better readability
            X_val, y_val = validation_data

            # Perform the forward pass
            output = self.forward(X_val, training=False)

            # Calculate loss
            loss = self.loss.calculate(output, y_val)

            # Get predictions and calculate an accuracy
            predictions = self.output_layer_activation.predictions(output)
            accuracy = self.accuracy.calculate(predictions, y_val)

            # Print
            print(f'validation, ' +
                  f'acc: {accuracy:.3f}, ' +
                  f'loss: {loss:.3f}')

    # Forward pass
    def forward(self, X, training):
        # Call forward method on the input layer
        self.input_layer.forward(X, training)

        # Call forward method of every object in chain
        for layer in self.layers:
            layer.forward(layer.prev.output, training)

        # Return output
        return layer.output

    # Backwards pass
    def backward(self, output, y):
        # If softmax classifier
        if self.softmax_classifier_output is not None:
            # First call backward method on the combined activation/loss
            self.softmax_classifier_output.backward(output, y)
            self.layers[-1].dinputs = self.softmax_classifier_output.dinputs

            # Call backward method going through all the objects but last in reversed order
            for layer in reversed(self.layers[:-1]):
                layer.backward(layer.next.dinputs)
            return
        
        # First call backwards method on the loss
        self.loss.backward(output, y)

        # Call backward method going through the objects in revers
        for layer in reversed(self.layers):
            layer.backward(layer.next.dinputs)

In [28]:
X, y = spiral_data(samples=1000, classes=3)
X_test, y_test = spiral_data(samples=100, classes=3)

# Instantiate the model
model = Model()

# Add layers
model.add(Layer_Dense(2, 512, weight_regulariser_l2=5e-4, bias_regulariser_l2=5e-4))
model.add(Activation_ReLU())
model.add(Layer_Dropout(0.1))
model.add(Layer_Dense(512, 3))
model.add(Activation_Softmax())

model.set(
    loss=Loss_CategoricalCrossentropy(),
    optimiser=Optimiser_Adam(learning_rate=0.05, decay=5e-5),
    accuracy=Accuracy_Categorical()
)

# Finalise the model
model.finalise()

# Train the model
model.train(X, y, validation_data=(X_test, y_test), epochs=10000, print_every=100)

epoch: 100, acc: 0.717, loss: 0.722 (data_loss: 0.659, reg_loss: 0.063), lr: 0.04975371909050202
epoch: 200, acc: 0.774, loss: 0.624 (data_loss: 0.545, reg_loss: 0.079), lr: 0.049507401356502806
epoch: 300, acc: 0.794, loss: 0.614 (data_loss: 0.532, reg_loss: 0.082), lr: 0.0492635105177595
epoch: 400, acc: 0.832, loss: 0.557 (data_loss: 0.475, reg_loss: 0.082), lr: 0.04902201088288642
epoch: 500, acc: 0.836, loss: 0.513 (data_loss: 0.434, reg_loss: 0.079), lr: 0.048782867456949125
epoch: 600, acc: 0.836, loss: 0.528 (data_loss: 0.451, reg_loss: 0.077), lr: 0.04854604592455945
epoch: 700, acc: 0.852, loss: 0.486 (data_loss: 0.409, reg_loss: 0.077), lr: 0.048311512633460556
epoch: 800, acc: 0.848, loss: 0.499 (data_loss: 0.426, reg_loss: 0.073), lr: 0.04807923457858551
epoch: 900, acc: 0.862, loss: 0.478 (data_loss: 0.403, reg_loss: 0.075), lr: 0.04784917938657352
epoch: 1000, acc: 0.845, loss: 0.479 (data_loss: 0.407, reg_loss: 0.072), lr: 0.04762131530072861
epoch: 1100, acc: 0.848, lo

In [29]:
X, y = spiral_data(samples=100, classes=2)
X_test, y_test = spiral_data(samples=100, classes=2)

# Reshape labels to be a list of lists
y = y.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# Instantiate the model
model = Model()

# Add layers
model.add(Layer_Dense(2, 64, weight_regulariser_l2=5e-4, bias_regulariser_l2=5e-4))
model.add(Activation_ReLU())
model.add(Layer_Dense(64, 1))
model.add(Activation_Sigmoid())

# Set loss, optimiser and accuracy objects
model.set(
    loss=Loss_BinaryCrossentropy(),
    optimiser=Optimiser_Adam(decay=5e-7),
    accuracy=Accuracy_Categorical()
)

# Finalise the model
model.finalise()

# Train the model
model.train(X, y, validation_data=(X_test, y_test), epochs=10000, print_every=100)

epoch: 100, acc: 0.430, loss: 0.663 (data_loss: 0.663, reg_loss: 0.001), lr: 0.0009999505024501287
epoch: 200, acc: 0.460, loss: 0.657 (data_loss: 0.656, reg_loss: 0.001), lr: 0.0009999005098992651
epoch: 300, acc: 0.465, loss: 0.654 (data_loss: 0.653, reg_loss: 0.002), lr: 0.000999850522346909
epoch: 400, acc: 0.475, loss: 0.651 (data_loss: 0.649, reg_loss: 0.002), lr: 0.0009998005397923115
epoch: 500, acc: 0.485, loss: 0.645 (data_loss: 0.642, reg_loss: 0.003), lr: 0.0009997505622347225
epoch: 600, acc: 0.505, loss: 0.638 (data_loss: 0.634, reg_loss: 0.004), lr: 0.0009997005896733929
epoch: 700, acc: 0.500, loss: 0.629 (data_loss: 0.623, reg_loss: 0.005), lr: 0.0009996506221075735
epoch: 800, acc: 0.505, loss: 0.618 (data_loss: 0.611, reg_loss: 0.007), lr: 0.000999600659536515
epoch: 900, acc: 0.510, loss: 0.608 (data_loss: 0.599, reg_loss: 0.009), lr: 0.0009995507019594694
epoch: 1000, acc: 0.515, loss: 0.599 (data_loss: 0.588, reg_loss: 0.012), lr: 0.000999500749375687
epoch: 1100,

In [30]:
# Create dataset
X, y = sine_data()

# Instanitate the model
model = Model()

# Add layers
model.add(Layer_Dense(1, 64))
model.add(Activation_ReLU())
model.add(Layer_Dense(64, 64))
model.add(Activation_ReLU())
model.add(Layer_Dense(64, 1))
model.add(Activation_Linear())

# Set loss and optimiser objects
model.set(
    loss=Loss_MeanSquaredError(),
    optimiser=Optimiser_Adam(learning_rate=0.005, decay=1e-3),
    accuracy=Accuracy_Regression()
)

# Finalise the model
model.finalise()

# Train the model
model.train(X, y, epochs=10000, print_every=100)

epoch: 100, acc: 0.005, loss: 0.083 (data_loss: 0.083, reg_loss: 0.000), lr: 0.004549590536851684
epoch: 200, acc: 0.055, loss: 0.031 (data_loss: 0.031, reg_loss: 0.000), lr: 0.004170141784820684
epoch: 300, acc: 0.013, loss: 0.002 (data_loss: 0.002, reg_loss: 0.000), lr: 0.003849114703618168
epoch: 400, acc: 0.103, loss: 0.000 (data_loss: 0.000, reg_loss: 0.000), lr: 0.0035739814152966403
epoch: 500, acc: 0.584, loss: 0.000 (data_loss: 0.000, reg_loss: 0.000), lr: 0.00333555703802535
epoch: 600, acc: 0.620, loss: 0.000 (data_loss: 0.000, reg_loss: 0.000), lr: 0.0031269543464665416
epoch: 700, acc: 0.675, loss: 0.000 (data_loss: 0.000, reg_loss: 0.000), lr: 0.002942907592701589
epoch: 800, acc: 0.680, loss: 0.000 (data_loss: 0.000, reg_loss: 0.000), lr: 0.0027793218454697055
epoch: 900, acc: 0.694, loss: 0.000 (data_loss: 0.000, reg_loss: 0.000), lr: 0.0026329647182727752
epoch: 1000, acc: 0.021, loss: 0.000 (data_loss: 0.000, reg_loss: 0.000), lr: 0.002501250625312656
epoch: 1100, acc