In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
from utils.activations import softmax, sigmoid

np.random.seed(1)

#define constants
num_class = 4 # no of classes
hidden_layer_units = 25  #number of units in the hidden layer
epochs = 150
alpha = 0.001 #learning rate 


## DataSource
X_train, Y_train is parsed from csv files 

In [2]:
x = pd.read_csv(os.path.join('data','train_data.csv'), header=None).add_prefix('Feature_')
y = pd.read_csv(os.path.join('data', 'train_labels.csv'), header=None, names=["Label_0", "Label_1", "Label_2", "Label_3"])
X_train = x.values
Y_train = y.values
m, n = X_train.shape

## Layer and Forward Propagation
We then implement Layer and For-prop as follow

In [3]:
def Layer(A_in, W, B, g):
    """
    :param A_in: shape(m,n) - input data
    :param W: shape(feature,units) - weight matrix, n0 feature  x units, 
    :param b: shape(units,1) - bias vector, n0 units x 1
    :param g: activation function(e.g sigmoid, relu, softmax, ...)
    :return:
    A_out: shape(m, units): output data - m x units
    """
    Z = np.dot(A_in, W) + B
    A_out = g(Z)
    return Z,A_out

In [4]:
def compute_forward_prop(x, W1, b1, W2, b2):
    z1, a1 = Layer(x, W1, b1, sigmoid)  #hidden layer with sigmoid activation
    z2, a2 = Layer(a1, W2, b2, softmax) #output layer with softmax activation
    return z1, a1, z2, a2

## Initialize model parameters
The model parameters are intialized randomly as follow:

In [5]:

def initialize_model_params():
    W1 = np.random.randn(n, hidden_layer_units)
    b1 = np.random.randn(hidden_layer_units)
    W2 = np.random.randn(hidden_layer_units, num_class)
    b2 = np.random.randn(num_class)
    return b1, W1, b2, W2

## Back propagation

In [6]:
def sigmoid_derivative(x): 
    return sigmoid(x) * (1 - sigmoid(x))

def compute_backward_prop(Z1, A1, A2, W2,X, Y): 
    dz2 = A2 - Y
    dW2 = np.dot(A1.T,dz2)
    db2 = dz2
    dz1 = np.dot(dz2,W2.T)
    db1 = dz1 * sigmoid_derivative(Z1)
    dW1 = np.dot(X.T,dz1 * sigmoid_derivative(Z1))
    return db1, dW1, db2, dW2

## Gradient descent 

In [7]:
def update_model_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha_):
    if dW1.shape == (n, hidden_layer_units):
        W1 = W1 - alpha_ * dW1
    if dW2.shape == (hidden_layer_units, num_class):
        W2 = W2 - alpha_ * dW2
    if db1.shape == (m, hidden_layer_units):
        b1 = b1 - alpha_ * db1.sum(axis=0)
    if db2.shape == (m, num_class):
        b2 = b2 - alpha_ * db2.sum(axis=0)
    return b1, W1, b2, W2

## Cost function

In [8]:
def compute_cost(y_hat, y):
    return np.mean(np.square(y_hat - y))

## Prediction and accuracy
In order to analyze the results, the implementation of prediction and accuracy are described below:  

In [9]:
def predict_from_output_layer(A2):
    return np.argmax(A2, 0)

def predict(X, B1, W1, B2, W2):
    Z1 = np.dot(X, W1) + B1
    A1 = sigmoid(Z1)
    Z2 = np.dot(A1, W2) + B2
    A2 = softmax(Z2)
    prediction = np.argmax(A2, 0)
    return prediction

def accuracy(Y_hat, Y):
    # print(Y_hat, Y)
    return np.sum(Y_hat == Y) / Y.size

## Wrap everything up  

In [10]:
def training_data(X, Y, epochs, alpha):
    B1, W1, B2, W2 = initialize_model_params()
    for i in range(epochs):
        z1, a1, z2, a2 = compute_forward_prop(X, W1, B1, W2, B2)
        db1, dW1, db2, dW2 = compute_backward_prop(z1, a1, a2, W2, X, Y) 
        b1, W1, b2, W2 = update_model_params(W1, B1, W2, B2, dW1, db1, dW2, db2, alpha)

        cost = compute_cost(a2, Y)
        if i % 10 == 0:
            print("Iteration: ", i)
            print(f"cost = {cost}")
        # print(f"Accuracy: {accuracy(convert_to_one_hot(a2), Y)}")
    return b1, W1, b2, W2

Run the training process with $epochs=100$:

In [11]:
b1, W1, b2, W2 = training_data(X_train, Y_train, epochs, alpha)

Iteration:  0
0.32109911778100236
Iteration:  1
0.2752278437034565
Iteration:  2
0.37914989214566885
Iteration:  3
0.38036276965291066


  return 1 / (1 + np.exp(-X))


Iteration:  4
0.37616142828442234
Iteration:  5
0.35840069527685947
Iteration:  6
0.3654603285201286
Iteration:  7
0.38036262380675673
Iteration:  8
0.3742461564300916
Iteration:  9
0.3636277279110592
Iteration:  10
0.31072315764626157
Iteration:  11
0.2406317845753729
Iteration:  12
0.2668052067825854
Iteration:  13
0.3176878170303003
Iteration:  14
0.2558652744210048
Iteration:  15
0.25776751853551694
Iteration:  16
0.28244978181975217
Iteration:  17
0.2994792457295599
Iteration:  18
0.2486008393950285
Iteration:  19
0.2759140387739569
Iteration:  20
0.2066723040385362
Iteration:  21
0.27822122874207933
Iteration:  22
0.2416276490954042
Iteration:  23
0.36311546837731945
Iteration:  24
0.3457634742428219
Iteration:  25
0.27702203371190137
Iteration:  26
0.24708159181845168
Iteration:  27
0.27502806850224987
Iteration:  28
0.1373581805882618
Iteration:  29
0.17364967108947973
Iteration:  30
0.2270935094381853
Iteration:  31
0.1366159156549364
Iteration:  32
0.14253681948955121
Iterati

Now, we have the proper arguments for out the model. Let's try some unseen tests:

In [12]:
# X_test = np.random.rand(1, n)
# prediction = predict(X_test, b1, W1, b2, W2)
# print(f"prediction = {prediction}")
# print(f"W1 = {W1}")