Dataset and problem statement: "https://archive.ics.uci.edu/ml/datasets/Statlog+%28Shuttle%29" <br>
Code obtained from: <br>

Title: "Neural Networks from Scratch in Python" <br>
Authors: Harrison Kinsley & Daniel Kukieła <br>
Publisher: Harrison Kinsley, 2020

# Libraries

In [1]:
import numpy as np

# Fully Connected Layer

In [29]:
class Layer_FC:
    """Dense (fully connected) layer computing ``inputs . weights + biases``."""

    def __init__(self, num_inputs, num_neurons):
        """Initialise the layer parameters.

        Weights are a (num_inputs, num_neurons) array of standard-normal
        samples scaled by 0.01; biases are a (1, num_neurons) row of zeros.
        """
        weight_shape = (num_inputs, num_neurons)
        self.weights = np.random.randn(*weight_shape) * 0.01
        self.biases = np.zeros((1, num_neurons))

    def forward(self, inputs):
        """Store ``inputs`` and compute the layer output into ``self.output``."""
        # Inputs are cached because backward() needs them for the weight gradient
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Compute gradients from ``dvalues``, the gradient w.r.t. this layer's output.

        Sets ``self.dweights``, ``self.dbiases`` (parameter gradients) and
        ``self.dinputs`` (gradient w.r.t. the layer inputs).
        """
        # Parameter gradients
        transposed_inputs = self.inputs.T
        self.dweights = np.dot(transposed_inputs, dvalues)
        # Biases are shared by every sample, so their gradient sums over axis 0
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient handed to the previous layer
        transposed_weights = self.weights.T
        self.dinputs = np.dot(dvalues, transposed_weights)

# Activation Functions - Rectified Linear (ReLU) and Softmax

In [30]:
# Rectified linear unit (ReLU) activation function
class Activation_ReLU:
    """Element-wise ReLU: passes positive values through and zeroes the rest."""

    def forward(self, inputs):
        """Store ``inputs`` and set ``self.output`` to ``max(0, inputs)`` element-wise."""
        # Inputs are cached so backward() knows which entries were clamped
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Set ``self.dinputs`` to ``dvalues`` with clamped positions zeroed."""
        # Work on a copy so the caller's gradient array is left untouched
        gradient = dvalues.copy()
        self.dinputs = gradient
        # No gradient flows where the forward input was zero or negative
        clamped = self.inputs <= 0
        gradient[clamped] = 0


# Softmax activation for classification
class Activation_Softmax:
    """Row-wise softmax: turns each row of scores into a probability distribution."""

    def forward(self, inputs):
        """Compute row-wise softmax of a 2-D ``inputs`` array into ``self.output``."""
        # Subtracting each row's maximum leaves the result unchanged but keeps
        # the largest exponent at exp(0) = 1, so np.exp cannot overflow
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.output = exps / np.sum(exps, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Set ``self.dinputs`` to the gradient w.r.t. the softmax inputs.

        ``dvalues`` is the gradient w.r.t. ``self.output`` and has the same
        shape. For one sample with output ``s`` the Jacobian is
        ``diag(s) - s s^T``; multiplying it by the sample's gradient ``g``
        gives ``s * (g - sum(s * g))``, which is evaluated here for all rows
        at once instead of building one Jacobian per sample in a Python loop.
        The result takes the promoted float dtype, so integer ``dvalues`` are
        not truncated.
        """
        dvalues = np.asarray(dvalues)
        # Per-row scalar s . g, kept as a column so it broadcasts over classes
        row_dot = np.sum(self.output * dvalues, axis=1, keepdims=True)
        self.dinputs = self.output * (dvalues - row_dot)

# Loss Functions

In [31]:
class Loss:
    """Common base for loss functions; ``calculate`` relies on ``self.forward``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward(output, y)``."""
        return np.mean(self.forward(output, y))
    
class Loss_CategoricalCrossEntropy(Loss): # Inherits Loss
    """Categorical cross-entropy loss for sparse (1-D) or one-hot (2-D) labels."""

    def forward(self, y_pred, y_true):
        """Return the per-sample negative log-likelihood.

        y_pred: predicted probabilities, indexed as (samples, classes).
        y_true: 1-D array of integer class indices, or 2-D one-hot array.
        Raises ValueError when y_true is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)
        samples = len(y_pred)
        # Clip both ends: the lower bound avoids log(0); the upper bound is
        # applied symmetrically so the clipping does not bias the mean loss
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        if y_true.ndim == 1:
            # Sparse labels: pick the predicted probability of the true class
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: the mask keeps only the true-class probability
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                'y_true must be 1-D (class indices) or 2-D (one-hot), '
                f'got {y_true.ndim} dimensions'
            )
        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to the gradient of the loss w.r.t. ``dvalues``.

        dvalues: predicted probabilities, indexed as (samples, classes).
        y_true: 1-D array of integer class indices, or one-hot array.
        """
        y_true = np.asarray(y_true)
        samples = len(dvalues)
        # Every sample has the same number of classes, so the first row is
        # enough to count them
        labels = len(dvalues[0])
        # Sparse labels are expanded to one-hot rows via an identity matrix
        if y_true.ndim == 1:
            y_true = np.eye(labels)[y_true]
        # Derivative of -sum(y * log(p)) with respect to p
        self.dinputs = -y_true / dvalues
        # Divide by the batch size so the gradient matches the mean loss
        self.dinputs = self.dinputs / samples


class Activation_Softmax_Loss_CategoryCrossEntropy():
    """Softmax output activation combined with categorical cross-entropy loss."""

    def __init__(self):
        """Create the softmax activation and the cross-entropy loss objects."""
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossEntropy()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, keep it in ``self.output``, return the loss."""
        softmax = self.activation
        softmax.forward(inputs)
        # Expose the probabilities so callers can derive predictions from them
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to the combined softmax + cross-entropy gradient.

        dvalues: predicted probabilities, indexed as (samples, classes).
        y_true: integer class indices (1-D) or one-hot rows (2-D).
        """
        n_samples = len(dvalues)
        # One-hot rows are reduced to class indices
        class_ids = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true
        # The combined gradient is (probabilities - one_hot): subtract 1 at
        # each sample's true class, on a copy so the caller's array is kept
        gradient = dvalues.copy()
        gradient[np.arange(n_samples), class_ids] -= 1
        # Average over the batch
        self.dinputs = gradient / n_samples

# Minimisation Method - Stochastic Gradient Descent

In [32]:
# Stochastic gradient descent optimiser with optional learning-rate decay and momentum
class Optimizer_SGD:
    """SGD parameter updater.

    learning_rate: initial step size.
    decay: learning-rate decay factor; 0 disables decay.
    momentum: fraction of the previous update carried over; 0 gives vanilla SGD.

    Per step, call ``pre_update_params()`` once, ``update_params(layer)`` for
    each layer, then ``post_update()`` once.
    """

    def __init__(self, learning_rate=1.0, decay=0.0, momentum=0.0):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    # Call once, before any parameter update
    def pre_update_params(self):
        """Refresh ``current_learning_rate`` for this step when decay is enabled."""
        if self.decay:
            # Inverse-time decay computed from the *initial* rate, so the step
            # size shrinks as iterations grow instead of compounding upwards
            self.current_learning_rate = self.learning_rate / (
                1.0 + self.decay * self.iterations
            )

    def update_params(self, layer):
        """Update ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` and ``layer.dbiases``. With momentum enabled,
        the previous updates are kept on the layer as ``weight_momentums`` and
        ``bias_momentums``.
        """
        if self.momentum:
            # Create the momentum buffers lazily, shaped like the parameters
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)
            # Update = retained fraction of the previous update minus the gradient step
            weight_update = (self.momentum * layer.weight_momentums
                             - self.current_learning_rate * layer.dweights)
            layer.weight_momentums = weight_update
            bias_update = (self.momentum * layer.bias_momentums
                           - self.current_learning_rate * layer.dbiases)
            layer.bias_momentums = bias_update
        else:
            # Vanilla SGD, without momentum
            weight_update = -self.current_learning_rate * layer.dweights
            bias_update = -self.current_learning_rate * layer.dbiases
        layer.weights += weight_update
        layer.biases += bias_update

    def post_update(self):
        """Advance the iteration counter; call once after all layers are updated."""
        self.iterations += 1

# Variables to tinker

In [51]:
# Network shape
n_input_params = 9    # number of feature columns read from each data row
hidden_neurons = 64   # width of the hidden layer

# Optimiser settings, passed to Optimizer_SGD
learning_rate = 0.02  # step size
decay = 0             # learning-rate decay factor (0 disables decay)
momentum = 0          # momentum factor (0 gives vanilla SGD)

# Training Model

In [49]:
# Open and read file
# Read the training file; the context manager closes it as soon as it is read
with open('shuttle.trn','r') as trainingFile:
    # Blank lines (e.g. a trailing newline at end of file) carry no sample
    lines = [line for line in trainingFile if line.strip()]
batch = np.zeros((len(lines),n_input_params))
classfy = np.zeros(len(lines))

# Set up input as a 2d array: the first n_input_params fields of each row are
# features, the last field is the class label
for i, line in enumerate(lines):
    # split() with no argument tolerates repeated spaces and the line ending
    temp_str = line.split()
    classfy[i] = int(temp_str[-1])
    for j in range(n_input_params):
        batch[i,j] = int(temp_str[j])

# Change category counting to start from 0 and convert to int
classfy = np.asarray(classfy, dtype = 'int') - 1

# Hidden layer followed by ReLU
input_layer = Layer_FC(n_input_params,hidden_neurons)
activation1 = Activation_ReLU()

# Output layer with 7 neurons, one per class index expected in classfy
layer2 = Layer_FC(hidden_neurons,7)
loss_activation = Activation_Softmax_Loss_CategoryCrossEntropy()

optimizer = Optimizer_SGD(learning_rate,decay,momentum)
for epoch in range(5001):
    # Forward pass through the hidden layer and its activation
    input_layer.forward(batch)
    activation1.forward(input_layer.output)
    # Forward pass on second layer, input is output of first layer (the activation output)
    layer2.forward(activation1.output)
    # Forward pass through the combined softmax/loss object:
    # takes the output of the second layer and returns the loss
    loss = loss_activation.forward(layer2.output,classfy)
    # Predicted class = index of the largest softmax probability in each row
    predictions = np.argmax(loss_activation.output,axis=1)
    # classfy holds integer class indices, so it compares directly with predictions
    accuracy = np.mean(predictions==classfy)
    # Report progress every 100 epochs
    if not epoch % 100:
        print(f'epoch:{epoch},'+
              f'acc:{accuracy},'+
              f'loss:{loss:.3f},'+
              f'lr:{optimizer.current_learning_rate}')
    # Backward pass, in reverse order of the forward pass
    loss_activation.backward(loss_activation.output,classfy)
    layer2.backward(loss_activation.dinputs)
    activation1.backward(layer2.dinputs)
    input_layer.backward(activation1.dinputs)
    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(input_layer)
    optimizer.update_params(layer2)
    optimizer.post_update()
    

epoch:0,acc:0.013724137931034483,loss:2.058,lr:0.02
epoch:100,acc:0.9209655172413793,loss:0.331,lr:0.02
epoch:200,acc:0.9257701149425287,loss:0.279,lr:0.02
epoch:300,acc:0.9164367816091954,loss:0.471,lr:0.02
epoch:400,acc:0.9533563218390805,loss:0.170,lr:0.02
epoch:500,acc:0.9637931034482758,loss:0.128,lr:0.02
epoch:600,acc:0.9677011494252874,loss:0.106,lr:0.02
epoch:700,acc:0.969816091954023,loss:0.090,lr:0.02
epoch:800,acc:0.9765057471264368,loss:0.074,lr:0.02
epoch:900,acc:0.9797011494252874,loss:0.065,lr:0.02
epoch:1000,acc:0.9810344827586207,loss:0.060,lr:0.02
epoch:1100,acc:0.9825747126436781,loss:0.052,lr:0.02
epoch:1200,acc:0.9833103448275862,loss:0.050,lr:0.02
epoch:1300,acc:0.9839080459770115,loss:0.047,lr:0.02
epoch:1400,acc:0.984735632183908,loss:0.043,lr:0.02
epoch:1500,acc:0.985264367816092,loss:0.039,lr:0.02
epoch:1600,acc:0.9857471264367816,loss:0.037,lr:0.02
epoch:1700,acc:0.986551724137931,loss:0.033,lr:0.02
epoch:1800,acc:0.9874942528735632,loss:0.030,lr:0.02
epoch:1

# Testing Model

In [54]:
# Open and read file
# Read the test file; the context manager closes it as soon as it is read
with open('shuttle.tst','r') as test_file:
    # Blank lines (e.g. a trailing newline at end of file) carry no sample
    lines_tst = [line for line in test_file if line.strip()]
batch_tst = np.zeros((len(lines_tst),n_input_params))
classfy_tst = np.zeros(len(lines_tst))

# Set up input as a 2d array: the first n_input_params fields of each row are
# features, the last field is the class label
for i, line in enumerate(lines_tst):
    # split() with no argument tolerates repeated spaces and the line ending
    temp_str = line.split()
    classfy_tst[i] = int(temp_str[-1])
    for j in range(n_input_params):
        batch_tst[i,j] = int(temp_str[j])

# Change category counting to start from 0 and convert to int
classfy_tst = np.asarray(classfy_tst, dtype = 'int') - 1
# Forward pass through the trained network
input_layer.forward(batch_tst)
activation1.forward(input_layer.output)
# Layer_FC exposes forward(); the previous call to "forwardx" raised AttributeError
layer2.forward(activation1.output)
# Calculate loss
loss = loss_activation.forward(layer2.output,classfy_tst)
# Predicted class = index of the largest softmax probability in each row
predictions = np.argmax(loss_activation.output,axis=1)
accuracy = np.mean(predictions==classfy_tst)
print(f'validation => acc:{accuracy:.3f},loss:{loss:.3f}')


validation,acc:0.998,loss:0.009
