# Part B (Neural Network from Scratch)

You need to implement a neural network from scratch .This is a multiclass classification problem. No. of hidden layers depends on you but should be atleast 2.Remember to use activation function. You can add any other function of your choice.

In [1049]:
import numpy as np
import random
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [1050]:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
y = to_categorical(y)

In [1051]:
print(X.shape,y.shape)    # three classes to predict.

(150, 4) (150, 3)


In [1052]:
input_size = X.shape[1]
hidden_size1 = 16
hidden_size2 = 32
output_size = y.shape[1]

def parameters(input_size, hidden_size1, hidden_size2, output_size):
    # define the parameters of your nn initially using random lib.
  
    # between input layer and 1st hidden layer.
    w1 = np.random.randn(input_size, hidden_size1)    # 16 neurons in 1st hidden layer.
    b1 = np.zeros(hidden_size1)

    # between 1st hidden layer and 2nd hidden layer.
    w2 = np.random.randn(hidden_size1, hidden_size2)    # 32 neurons in 2nd hidden layer.
    b2 = np.zeros(hidden_size2)

    # between 2nd hidden layer and output layer.
    w3 = np.random.randn(hidden_size2, output_size)    # 3 neurons in output layer.
    b3 = np.zeros(output_size)
  
    return w1,b1,w2,b2,w3,b3

In [1053]:
#activation functions
def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def relu(z):
    return np.maximum(0.0, z)    #returns maximum of 0.0 and z.(whereas max returns maximum element of an array.)

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))        # we subtract max(z) to maintain 'numerical stability' as said on
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)         # stackoverflow as it eventually cancels out and does not change the answer.
                                                                # keepdims makes sure the result is broadcasted properly.(i.e correct dimensions)
def sigmoid_derivative(z):
    return z * (1 - z)   

def relu_derivative(z):
    return (z > 0).astype(float)       # 0 or 1                                                 

In [1054]:
def forward(X, w1,b1, w2,b2 ,w3,b3):
    # function for forward propagation

    # 1st hidden layer
    y1 = np.dot(X, w1) + b1   # input
    z1 = sigmoid(y1)          # output

    # 2nd hidden layer
    y2 = np.dot(z1, w2) + b2  # input
    z2 = relu(y2)             # output

    # output layer
    y3 = np.dot(z2, w3) + b3  # input
    z3 = softmax(y3)          # output
    
    initial_values = (y1, z1, y2, z2, y3, z3)

    return z3, initial_values


In [1055]:
def cost_funct(y_true, y_pred):
    # function for cost func if necessary

    # using categorical crossentropy as our output layer has softmax (ont hot encoded).
    m = y_true.shape[0]     # number of examples

    cost = -np.sum(y_true * np.log(y_pred)) / m
    return cost

# Source - https://neuralthreads.medium.com/categorical-cross-entropy-loss-the-most-important-loss-function-d3792151d05b

In [1056]:
def backward(X, y, values, w1, w2, w3):
    # function for backward propagation
    m = y.shape[0]
    y1, z1, y2, z2, y3, z3 = values

    dy3 = z3-y
    dw3 = np.dot(z2.T, dy3)/m
    db3 = np.sum(dy3, axis=0)/m

    dz2 = np.dot(dy3, w3.T)
    dy2 = dz2 * relu_derivative(y2)
    dw2 = np.dot(z1.T, dy2)/m
    db2 = np.sum(dy2, axis=0)/m

    dz1 = np.dot(dy2, w2.T)
    dy1 = dz1 * sigmoid_derivative(z1)
    dw1 = np.dot(X.T, dy1)/m
    db1 = np.sum(dy1, axis=0)/m

    gradients = (dw1, db1, dw2, db2, dw3, db3)

    return gradients

# Source - https://towardsdatascience.com/backpropagation-from-scratch-how-neural-networks-really-work-36ee4af202bf

In [1057]:
def update_parameters(w1, b1, w2, b2, w3, b3, gradients, learning_rate):
#FUNCTION TO UPDATE PARAMETERS USING GD
    dW1, db1, dW2, db2, dW3, db3 = gradients
    w1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    w2 -= learning_rate * dW2
    b2 -= learning_rate * db2
    w3 -= learning_rate * dW3
    b3 -= learning_rate * db3
    return w1, b1, w2, b2, w3, b3


In [1058]:
# use Gradient descent as of now as an optimizer
def gradient_descent(X, y, w1, b1, w2, b2, w3, b3, learning_rate, epochs):
    for i in range(epochs):
        z3, initial_values = forward(X, w1, b1, w2, b2, w3, b3)      # forward propagation.
        
        cost = cost_funct(y, z3)       # computing cost.
        
        gradients = backward(X, y, initial_values, w1, w2, w3)     # backward propagation.
        
        w1, b1, w2, b2, w3, b3 = update_parameters(w1, b1, w2, b2, w3, b3, gradients, learning_rate)  # updating weights and biases.

    return cost, w1, b1, w2, b2, w3, b3
    

In [1059]:
def model(X, y, W1, b1, W2, b2, W3, b3, learning_rate, epochs, cost):
    #function to train and build the whole model.
    for epoch in range(epochs):
        if epoch % 100 == 0:                      #printing cost after 100 epochs.
            print(f"Epoch {epoch}, Cost: {cost}")
          
    z3_pred, final_values = forward(X, W1, b1, W2, b2, W3, b3)

    y_pred = np.argmax(z3_pred,axis=1)          # can use indexes as our output is one hot encoded.
    y_true = np.argmax(y, axis=1)

    accuracy = np.mean(y_pred == y_true)
    F1_score = f1_score(y_true, y_pred, average='weighted')

    print("Accuracy :",round(accuracy*100, 2),"%")
    print("F1 Score (Weighted) :",round(F1_score, 2))
    
    return 

In [1060]:
W1, b1, W2, b2, W3, b3 = parameters(input_size, hidden_size1, hidden_size2, output_size)

In [1061]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [1062]:
#write down the predictions and the f1 score finally
learning_rate = 0.01
epochs = 2000  

cost, W1, b1, W2, b2, W3, b3 = gradient_descent(X_train, y_train, W1, b1, W2, b2, W3, b3, learning_rate, epochs)

In [1063]:
model(X_train, y_train, W1, b1, W2, b2, W3, b3, learning_rate, epochs, cost)   # training data accuracy

Epoch 0, Cost: 0.07285230288803324
Epoch 100, Cost: 0.07285230288803324
Epoch 200, Cost: 0.07285230288803324
Epoch 300, Cost: 0.07285230288803324
Epoch 400, Cost: 0.07285230288803324
Epoch 500, Cost: 0.07285230288803324
Epoch 600, Cost: 0.07285230288803324
Epoch 700, Cost: 0.07285230288803324
Epoch 800, Cost: 0.07285230288803324
Epoch 900, Cost: 0.07285230288803324
Epoch 1000, Cost: 0.07285230288803324
Epoch 1100, Cost: 0.07285230288803324
Epoch 1200, Cost: 0.07285230288803324
Epoch 1300, Cost: 0.07285230288803324
Epoch 1400, Cost: 0.07285230288803324
Epoch 1500, Cost: 0.07285230288803324
Epoch 1600, Cost: 0.07285230288803324
Epoch 1700, Cost: 0.07285230288803324
Epoch 1800, Cost: 0.07285230288803324
Epoch 1900, Cost: 0.07285230288803324
Accuracy : 98.33 %
F1 Score (Weighted) : 0.98


In [1064]:
model(X_test, y_test, W1, b1, W2, b2, W3, b3, learning_rate, epochs, cost)   # testing data accuracy


Epoch 0, Cost: 0.07285230288803324
Epoch 100, Cost: 0.07285230288803324
Epoch 200, Cost: 0.07285230288803324
Epoch 300, Cost: 0.07285230288803324
Epoch 400, Cost: 0.07285230288803324
Epoch 500, Cost: 0.07285230288803324
Epoch 600, Cost: 0.07285230288803324
Epoch 700, Cost: 0.07285230288803324
Epoch 800, Cost: 0.07285230288803324
Epoch 900, Cost: 0.07285230288803324
Epoch 1000, Cost: 0.07285230288803324
Epoch 1100, Cost: 0.07285230288803324
Epoch 1200, Cost: 0.07285230288803324
Epoch 1300, Cost: 0.07285230288803324
Epoch 1400, Cost: 0.07285230288803324
Epoch 1500, Cost: 0.07285230288803324
Epoch 1600, Cost: 0.07285230288803324
Epoch 1700, Cost: 0.07285230288803324
Epoch 1800, Cost: 0.07285230288803324
Epoch 1900, Cost: 0.07285230288803324
Accuracy : 100.0 %
F1 Score (Weighted) : 1.0
