In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
from utils.activations import softmax, sigmoid
from utils.data_parser import data_split_train_test
np.random.seed(1)  # fixed seed so the random parameter initialization is repeatable

# Model and training configuration
num_class = 4             # number of output classes
hidden_layer_units = 25   # number of units in the hidden layer
epochs = 200              # number of gradient-descent iterations
alpha = 1e-3              # learning rate


## DataSource
`X_train` and `Y_train` are parsed from CSV files and split into training and validation sets.

In [2]:
# Read the raw features and labels; neither CSV has a header row, so column
# names are assigned here.
x = pd.read_csv(
    os.path.join('data', 'train_data.csv'),
    header=None,
).add_prefix('Feature_')
y = pd.read_csv(
    os.path.join('data', 'train_labels.csv'),
    header=None,
    names=["Label_0", "Label_1", "Label_2", "Label_3"],
)

# Partition into training and validation frames using the project helper.
X, Y, X_v, Y_v = data_split_train_test(x, y)

# From here on the notebook works with plain NumPy arrays.
X_train, Y_train = X.values, Y.values              # training samples / labels
X_Validate, Y_Validate = X_v.values, Y_v.values    # validation samples / labels

m, n = X_train.shape  # samples x features


## Layer and Forward Propagation
We then implement a single dense layer (`Layer`) and forward propagation as follows:

In [3]:
def Layer(A_in, W, B, g):
    """
    Apply one fully connected layer: an affine map followed by an activation.

    :param A_in: input activations, compatible with ``np.dot(A_in, W)``
        (this notebook passes an (m, n_in) array, one sample per row)
    :param W: weight matrix ((n_in, units) in this notebook)
    :param B: bias added to ``np.dot(A_in, W)`` through broadcasting
        ((units,) in this notebook)
    :param g: activation function applied to the pre-activation
        (e.g. sigmoid, relu, softmax, ...)
    :return: tuple ``(Z, A_out)`` with ``Z = np.dot(A_in, W) + B`` and
        ``A_out = g(Z)``
    """
    linear_term = np.dot(A_in, W)
    pre_activation = linear_term + B
    return pre_activation, g(pre_activation)

In [4]:
def compute_forward_prop(x, W1, b1, W2, b2):
    """
    Run the two-layer network forward on ``x``.

    The hidden layer uses a sigmoid activation and the output layer a softmax.

    :return: ``(z1, a1, z2, a2)`` - pre-activation and activation of the
        hidden layer, followed by those of the output layer
    """
    hidden_pre, hidden_act = Layer(x, W1, b1, sigmoid)
    output_pre, output_act = Layer(hidden_act, W2, b2, softmax)
    return hidden_pre, hidden_act, output_pre, output_act

## Initialize model parameters
The model parameters are initialized randomly as follows:

In [5]:

def initialize_model_params():
    """
    Draw initial weights and biases from a standard normal distribution.

    Sizes come from the module-level ``n`` (input features),
    ``hidden_layer_units`` and ``num_class``.

    :return: ``(b1, W1, b2, W2)`` - note that each bias precedes its weights
    """
    # The draw order (W1, b1, W2, b2) determines which random numbers each
    # parameter receives under a fixed seed, so it is kept as is.
    hidden_weights = np.random.randn(n, hidden_layer_units)
    hidden_bias = np.random.randn(hidden_layer_units)
    output_weights = np.random.randn(hidden_layer_units, num_class)
    output_bias = np.random.randn(num_class)
    return hidden_bias, hidden_weights, output_bias, output_weights

## Back propagation

In [6]:
def sigmoid_derivative(x):
    """
    Derivative of the sigmoid evaluated at ``x``: ``s * (1 - s)`` with
    ``s = sigmoid(x)``.

    :param x: pre-activation value(s) accepted by ``sigmoid``
    :return: derivative, same shape as ``sigmoid(x)``
    """
    # Evaluate the sigmoid once and reuse it for both factors.
    s = sigmoid(x)
    return s * (1 - s)

def compute_backward_prop(Z1, A1, A2, W2, X, Y):
    """
    Back-propagate the error through the two-layer network.

    The output error term is ``A2 - Y``; for a softmax output this is the
    gradient of the cross-entropy loss with respect to the output
    pre-activation (assuming ``Y`` holds target distributions such as one-hot
    rows).

    :param Z1: hidden-layer pre-activation
    :param A1: hidden-layer activation
    :param A2: output-layer activation (network prediction)
    :param W2: output-layer weights
    :param X: input samples, one per row
    :param Y: targets, same shape as ``A2``
    :return: ``(db1, dW1, db2, dW2)``. The bias gradients ``db1`` and ``db2``
        are returned per sample (one row per sample, not reduced);
        ``update_model_params`` sums them over axis 0.
    """
    delta_out = A2 - Y
    dW2 = np.dot(A1.T, delta_out)
    db2 = delta_out

    # Push the error back to the hidden activations, then through the sigmoid.
    # This hidden-layer delta is computed once and shared by db1 and dW1.
    delta_hidden = np.dot(delta_out, W2.T) * sigmoid_derivative(Z1)
    db1 = delta_hidden
    dW1 = np.dot(X.T, delta_hidden)
    return db1, dW1, db2, dW2

## Gradient descent 

In [7]:
def update_model_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha_):
    """
    Take one gradient-descent step on every parameter.

    The bias gradients ``db1`` and ``db2`` are summed over axis 0 before the
    step is applied.

    :param alpha_: learning rate
    :return: ``(b1, W1, b2, W2)`` - updated parameters, each bias before its
        weights
    """
    def descend(param, grad):
        return param - alpha_ * grad

    bias_grad_1 = db1.sum(axis=0)
    bias_grad_2 = db2.sum(axis=0)
    return (
        descend(b1, bias_grad_1),
        descend(W1, dW1),
        descend(b2, bias_grad_2),
        descend(W2, dW2),
    )

## Cost function

In [8]:
def compute_cost(y_hat, y):
    """
    Mean squared error between predictions ``y_hat`` and targets ``y``,
    averaged over every element.

    NOTE(review): back-propagation in this notebook uses ``A2 - Y`` as the
    output error, which corresponds to softmax with cross-entropy, so this
    value is a monitoring metric rather than the loss being minimized.
    """
    squared_error = np.square(y_hat - y)
    return np.mean(squared_error)

## Prediction, accuracy and one-hot encode
To analyze the results, prediction, accuracy and one-hot encoding are implemented below:

In [9]:
def predict_from_output_layer(A2):
    """
    Turn output-layer activations into predicted class indices.

    :param A2: output activations with the class scores on the last axis
        (one row per sample in this notebook)
    :return: index of the highest-scoring class for each sample
    """
    # Reduce over the class axis. Reducing over axis 0 would instead pick, for
    # every class, the sample that scores highest for it.
    return np.argmax(A2, axis=-1)

def predict(X, B1, W1, B2, W2):
    """
    Predict class indices for the samples in ``X``.

    Note the argument order: each bias comes before its weight matrix,
    matching the tuple returned by ``training_data``.

    :param X: input samples, one per row
    :param B1: hidden-layer bias
    :param W1: hidden-layer weights
    :param B2: output-layer bias
    :param W2: output-layer weights
    :return: index of the highest-scoring class for each sample
    """
    # Reuse the shared forward pass so prediction cannot drift from training.
    _, _, _, A2 = compute_forward_prop(X, W1, B1, W2, B2)
    # Class scores are on the last axis; reducing over axis 0 would mix samples.
    return np.argmax(A2, axis=-1)

def compute_accuracy(Y_hat, Y):
    """
    Fraction of samples whose predicted label matches the true label.

    A sample counts as correct only if every entry of its row agrees, so this
    works for one-hot rows as well as for 1-D arrays of class indices.

    :param Y_hat: predicted labels
    :param Y: true labels, same shape as ``Y_hat``
    :return: accuracy in ``[0, 1]``; ``0.0`` when there are no samples
    :raises ValueError: if ``Y_hat`` and ``Y`` have different shapes
    """
    Y = np.asarray(Y)
    Y_hat = np.asarray(Y_hat)
    if Y.shape != Y_hat.shape:
        raise ValueError(
            f"shape mismatch: Y_hat has shape {Y_hat.shape}, Y has shape {Y.shape}"
        )
    if len(Y) == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # Flatten everything after the sample axis so a single vectorized
    # reduction decides whether each sample matches completely.
    row_correct = (Y == Y_hat).reshape(len(Y), -1).all(axis=1)
    return row_correct.mean()

def convert_to_one_hot(A):
    """
    Convert rows of class scores into one-hot rows.

    For every row of ``A`` the position of the largest value is set to 1 and
    all other positions to 0 (ties resolve to the first maximum, as with
    ``np.argmax``).

    :param A: 2-D array of scores, shape (samples, classes)
    :return: integer array of the same shape with exactly one 1 per row
    """
    num_samples, num_classes = A.shape[0], A.shape[1]
    one_hot_encoded = np.zeros((num_samples, num_classes), dtype=int)
    winners = np.argmax(A, axis=1)
    # Pair each row index with its winning column.
    one_hot_encoded[np.arange(num_samples), winners] = 1
    return one_hot_encoded

## Wrap everything up  

In [10]:
def training_data(X, Y, X_v, Y_v, epochs, alpha):
    """
    Train the two-layer network with full-batch gradient descent.

    Every 10th iteration the training cost and the validation accuracy are
    printed. The validation data is used for this progress report only; it
    does not influence the parameter updates.

    :param X: training samples, one per row
    :param Y: training labels
    :param X_v: validation samples
    :param Y_v: validation labels, compared row-wise against one-hot predictions
    :param epochs: number of gradient-descent iterations
    :param alpha: learning rate
    :return: ``(b1, W1, b2, W2)`` - the trained parameters
    """
    b1, W1, b2, W2 = initialize_model_params()
    for i in range(epochs):
        z1, a1, z2, a2 = compute_forward_prop(X, W1, b1, W2, b2)
        db1, dW1, db2, dW2 = compute_backward_prop(z1, a1, a2, W2, X, Y)
        b1, W1, b2, W2 = update_model_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)

        # Metrics are only reported every 10th iteration, so only compute them
        # then instead of running a validation pass on every epoch.
        if i % 10 == 0:
            # a2 was produced before this iteration's update, so the cost
            # reflects the parameters at the start of the iteration.
            cost = compute_cost(a2, Y)
            _, _, _, y_hat = compute_forward_prop(X_v, W1, b1, W2, b2)
            accuracy = compute_accuracy(convert_to_one_hot(y_hat), Y_v)
            print("Iteration: ", i)
            print(f"cost = {cost}  accuracy={accuracy * 100}")
    return b1, W1, b2, W2

Run the training process for the number of iterations set by `epochs` in the configuration cell:

In [14]:
# Train on the training split; the validation split is only used for the
# accuracy printed in the progress report.
b1, W1, b2, W2 = training_data(X_train, Y_train, X_Validate, Y_Validate, epochs, alpha)

Iteration:  0
cost = 0.3204854569629083  accuracy=40.537265198949704
Iteration:  10
cost = 0.37141426065680894  accuracy=23.20743284185013
Iteration:  20
cost = 0.2488774359580971  accuracy=44.29408200363562
Iteration:  30
cost = 0.18221521111925298  accuracy=70.63219551605737
Iteration:  40
cost = 0.12435261959095559  accuracy=70.6523934558675
Iteration:  50
cost = 0.13528077211669987  accuracy=71.1169460715007
Iteration:  60
cost = 0.030364072847876625  accuracy=91.55726115936174
Iteration:  70
cost = 0.03971068332376786  accuracy=92.68834578872955
Iteration:  80
cost = 0.02438133269940144  accuracy=94.64754595031307
Iteration:  90
cost = 0.025728821849999483  accuracy=93.49626338113512
Iteration:  100
cost = 0.02354410228287669  accuracy=93.75883659866695
Iteration:  110
cost = 0.022973567812959717  accuracy=93.83962835790749
Iteration:  120
cost = 0.022233967364524355  accuracy=94.04160775600889
Iteration:  130
cost = 0.019935447156864874  accuracy=95.55645324176933
Iteration:  140

Now we have trained parameters for our model. Let's try it on some unseen samples:

In [12]:
# X_test = np.random.rand(1, n)
# prediction = predict(X_test, b1, W1, b2, W2)
# print(f"prediction = {prediction}")
# print(f"W1 = {W1}")