In [15]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
from utils.activations import softmax, relu

sns.set()

#define constants
m = 24754  # no of examples
n = 784 # no of features 
num_class = 4 # no of classes
hidden_layer_units = 10 


## DataSource
X_train, Y_train is parsed from csv files 

In [14]:
x = pd.read_csv(os.path.join('data','train_data.csv'), header=None).add_prefix('Feature_')
y = pd.read_csv(os.path.join('data', 'train_labels.csv'), header=None, names=["Label_0", "Label_1", "Label_2", "Label_3"])
X_train = x.to_numpy()
Y_train = y.to_numpy()


## Layer and Forward Propagation
We then implement Layer and For-prop as follow

In [16]:
def Layer(A_in, W, B, g):
    """
    :param A_in: shape(m,n) - input data
    :param W: shape(feature,units) - weight matrix, n0 feature  x units, 
    :param b: shape(units,1) - bias vector, n0 units x 1
    :param g: activation function(e.g sigmoid, relu, softmax, ...)
    :return:
    A_out: shape(m, units): output data - m x units
    """
    Z = np.matmul(A_in, W) + B
    A_out = g(Z)
    return Z,A_out

In [17]:
def Sequence(x, W1, b1, W2, b2):
    z1, a1 = Layer(x, W1, b1, relu)  #hidden layer with relu activation
    z2, a2 = Layer(a1, W2, b2, softmax) #output layer with softmax activation
    return z1, a1, z2, a2

## Initialize model parameters
The model parameters are intialized randomly as follow:

In [18]:

def initialize_model_params():
    W1 = np.random.rand(n, hidden_layer_units)
    B1 = np.random.rand(1, hidden_layer_units)
    W2 = np.random.rand(hidden_layer_units, num_class)
    B2 = np.random.rand(1, num_class)
    return B1, W1, B2, W2

## Back propagation

In [29]:
def relu_derivative(x): 
    return np.where(x >= 0, 1, 0)

def compute_back_prop(Z1, Z2, A1, A2, W2,X, Y):
    m = Y.shape[0]
    dz2 = A2 - Y
    
    dW2 = (1/m) * np.matmul(A1.T,dz2)
    print(f"dW2 shape={dW2.shape}")
    db2 = (1/m)* np.sum(dz2, axis=0)
    print(f"db2 shape={db2.shape}")
    print(f"W2.T shape={W2.T.shape}")
    print(f"relu_derivative shape={relu_derivative(Z1).shape}")
    dz1 = np.matmul(dz2,W2.T) * relu_derivative(Z1)
    db1 = (1/m) * np.sum(dz1, axis=0)
    dW1 = (1/m) * np.matmul(X.T,dz1)
    return db1, dW1, db2, dW2

## Gradient descent 

In [20]:
def compute_gradient_descent(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha_):
    W1 = W1 - alpha_ * dW1
    b1 = b1 - alpha_ * db1
    W2 = W2 - alpha_ * dW2
    b2 = b2 - alpha_ * db2
    return b1, W1, b2, W2

## Wrap everything up  

In [21]:
def training_data(X, Y, epochs, alpha):
    B1, W1, B2, W2 = initialize_model_params()
    for i in range(epochs):
        z1, a1, z2, a2 = Sequence(X, W1, B1, W2, B2)   
        db1, dW1, db2, dW2 = compute_back_prop(z1, z2, a1, a2, W2, X, Y)
        b1, W1, b2, W2 = compute_gradient_descent(W1, B1, W2, B2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
    return b1, W1, b2, W2

Run the training process:

In [31]:
b1, W1, b2, W2 = training_data(X_train, Y_train, 100, 0.01)

dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
Iteration:  0
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
dW2 shape=(10, 4)
db2 shape=(4,)
W2.T shape=(4, 10)
relu_derivative shape=(24754, 10)
Iteration:  10
dW2 shape=(10, 4)
db2 sha