In [63]:
"""
Notebook outline:

1. Pre-process the dataset
2. Define the architecture
3. Initialize the parameters
4. Forward propagation
5. Compute the loss
6. Backward propagation
7. Update the parameters
8. Repeat steps 4-7
9. Predict


Architecture (input-hidden-output units):
4-4-3  (nn_model builds 3 output units, one per one-hot class column)
"""

print("Layout for this notebook")

Layout for this notebook


In [64]:
import numpy as np
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Encoder for the class labels; sparse_output=False makes it return a dense
# NumPy array rather than a sparse matrix.
OHE = OneHotEncoder(sparse_output=False)
# Standardizer applied to the feature columns further down.
scaler = StandardScaler()

Loading the dataset

In [65]:
# Fetch dataset id 53 from the UCI ML repository and bind it to `iris`.
iris = fetch_ucirepo(id=53)

# Features and targets (pandas DataFrames).
X = iris.data.features
y = iris.data.targets

# Optional inspection helpers, left disabled:
# dataset-level metadata
# print(iris.metadata)

# per-variable information
# print(iris.variables)

Pre-Processing the dataset

In [66]:
# NOTE: this bare string is the last expression of the cell, so Jupyter echoes
# its repr as the cell output; a markdown cell would avoid that noise.
"""
1. Normalize X
2. One hot encode y
3. Create the train and test split
"""

'\n1. Normalize X\n2. One hot encode y\n3. Create the train and test split\n'

In [67]:
# 1. Normalize X
# fit_transform standardizes every feature column and rebinds X from the
# original DataFrame to a NumPy array.
# NOTE(review): the scaler is fitted on ALL rows before the train/test split
# performed later, so statistics of the test rows leak into preprocessing.
# Consider fitting on the training rows only and applying scaler.transform
# to the test rows.

X = scaler.fit_transform(X)
X[:5]  # preview the first five standardized rows

array([[-0.90068117,  1.03205722, -1.3412724 , -1.31297673],
       [-1.14301691, -0.1249576 , -1.3412724 , -1.31297673],
       [-1.38535265,  0.33784833, -1.39813811, -1.31297673],
       [-1.50652052,  0.10644536, -1.2844067 , -1.31297673],
       [-1.02184904,  1.26346019, -1.3412724 , -1.31297673]])

In [68]:
# 2. One hot encode y

# Take the 'class' column as an (n_samples, 1) column vector and replace y
# with the encoder's one-hot matrix (one column per distinct class label).
y = OHE.fit_transform(
    np.array(y['class']).reshape(-1, 1)
)

In [69]:
# Train-test split
# 80-20 (test_size=0.2); random_state=42 pins the shuffle so the split is
# the same on every run.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Helper Functions

In [70]:
# 1. Defining the architecture
# Layer sizes of the 4-4-3 network: the output layer is a softmax over the
# one-hot encoded classes, so it needs one unit per class (3), not 1.

n_x = 4 # no. of input variables
n_h = 4 # no. of hidden units
n_y = 3 # no. of output units (one per one-hot class column)

In [71]:
# 2. Initialization

def initialize_parameters(n_x, n_h, n_y):
    """Create the starting weights and biases of the two-layer network.

    Weights are sampled from a standard normal distribution (NumPy's global
    random state) and scaled by 0.01; biases start at zero.

    Args:
        n_x: number of input features.
        n_h: number of hidden units.
        n_y: number of output units.

    Returns:
        dict with keys "W1" (n_h, n_x), "b1" (n_h, 1), "W2" (n_y, n_h) and
        "b2" (n_y, 1).
    """
    weight_scale = 0.01

    # W1 is sampled before W2 so the sequence of random draws is unchanged.
    hidden_weights = weight_scale * np.random.randn(n_h, n_x)
    output_weights = weight_scale * np.random.randn(n_y, n_h)

    return {
        "W1": hidden_weights,
        "b1": np.zeros((n_h, 1)),
        "W2": output_weights,
        "b2": np.zeros((n_y, 1)),
    }

In [72]:
# Define the softmax function

def softmax(x):
    """Numerically stable softmax along axis 0.

    For a 2-D input of shape (n_classes, m) every column is normalized
    independently; for a 1-D input the whole vector is normalized.

    The per-column maximum is subtracted before exponentiating. Softmax is
    invariant to that shift, but it keeps ``np.exp`` from overflowing to
    ``inf`` (and the result from becoming NaN) for large logits.

    Args:
        x: array-like of logits.

    Returns:
        ndarray with the same shape as ``x`` whose entries along axis 0 sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; np.max would raise on an empty reduction.
        return np.exp(x)

    shifted = x - np.max(x, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)  # computed once and reused
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

In [73]:
# 3. Forward prop

def forward_propagation(X, parameters):
    """Run one forward pass: linear -> tanh -> linear -> softmax.

    Args:
        X: input matrix laid out as (n_x, m), i.e. one column per sample.
        parameters: dict holding "W1", "b1", "W2" and "b2".

    Returns:
        A tuple ``(A2, cache)`` where ``A2`` is the softmax output and
        ``cache`` is a dict with the intermediate values "Z1", "A1", "Z2"
        and "A2" needed by backward propagation.
    """
    W1, b1, W2, b2 = (parameters[key] for key in ("W1", "b1", "W2", "b2"))

    # Hidden layer
    hidden_linear = np.dot(W1, X) + b1
    hidden_activation = np.tanh(hidden_linear)

    # Output layer
    output_linear = np.dot(W2, hidden_activation) + b2
    output_probs = softmax(output_linear)

    cache = dict(
        Z1=hidden_linear,
        A1=hidden_activation,
        Z2=output_linear,
        A2=output_probs,
    )
    return output_probs, cache

In [74]:
# 4. Compute cost

def compute_cost(Y, A2):
    """Mean categorical cross-entropy between labels and predicted probabilities.

    Computes ``-(1/m) * sum(Y * log(A2))`` over all entries, where ``m`` is
    the number of columns of ``Y``.

    Entries whose label is 0 contribute exactly 0 to the sum, even when the
    predicted probability is 0. Evaluated naively, ``0 * log(0)`` is
    ``0 * -inf = NaN`` and would poison the whole cost once the softmax
    saturates.

    Args:
        Y: label matrix of shape (n_classes, m), e.g. one-hot columns.
        A2: predicted probabilities with the same shape as ``Y``.

    Returns:
        The scalar cost. It is ``inf`` if a non-zero label meets a predicted
        probability of exactly 0.
    """
    m = Y.shape[1]

    # log(0) and 0 * -inf are expected for masked entries, so silence those
    # floating-point warnings while the element-wise product is formed.
    with np.errstate(divide="ignore", invalid="ignore"):
        weighted_log = np.multiply(Y, np.log(A2))

    # Drop the terms with a zero label: their true contribution is 0.
    weighted_log = np.where(np.asarray(Y) != 0, weighted_log, 0.0)

    cost = -(1/m) * np.sum(weighted_log)
    return cost

In [75]:
# 5. Backward prop

def backward_propagation(parameters, cache, X, Y):
    """Compute the gradients of the cost for the two-layer network.

    Uses the softmax + cross-entropy shortcut ``dZ2 = A2 - Y`` for the output
    layer and the tanh derivative ``1 - A1**2`` for the hidden layer. All
    gradients are averaged over the ``m`` columns (samples) of ``Y``.

    Args:
        parameters: dict of network parameters; only "W2" is read here.
        cache: dict from the forward pass; "A1" and "A2" are read.
        X: input matrix of shape (n_x, m).
        Y: label matrix of shape (n_y, m).

    Returns:
        dict with keys "dW1", "db1", "dW2" and "db2", each shaped like the
        parameter it belongs to.
    """
    # W1 does not appear in any gradient expression, so it is not looked up.
    W2 = parameters['W2']

    A1 = cache['A1']
    A2 = cache['A2']

    m = Y.shape[1]

    # Output layer
    dZ2 = A2 - Y
    dW2 = 1/m * np.dot(dZ2, A1.T)
    db2 = 1/m * np.sum(dZ2, axis=1, keepdims=True)

    # Hidden layer: back-propagate through W2, then through tanh
    dZ1 = np.dot(W2.T, dZ2) * (1 - np.power(A1, 2))
    dW1 = 1/m * np.dot(dZ1, X.T)
    db1 = 1/m * np.sum(dZ1, axis=1, keepdims=True)

    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}

    return grads
    

In [76]:
# 6. Update parameters

def update_parameters(parameters, grads, learning_rate):
    """Apply one plain gradient-descent step to every parameter.

    Each parameter ``P`` is replaced by ``P - learning_rate * dP``, where
    ``dP`` is stored in ``grads`` under the key ``"d" + name``.

    Args:
        parameters: dict with keys "W1", "b1", "W2" and "b2".
        grads: dict with keys "dW1", "db1", "dW2" and "db2".
        learning_rate: step size of the update.

    Returns:
        A new dict with the updated "W1", "b1", "W2" and "b2"; the input
        dict is left untouched.
    """
    return {
        name: parameters[name] - learning_rate * grads["d" + name]
        for name in ("W1", "b1", "W2", "b2")
    }

Integrating everything

In [77]:
# 7. Repeat steps 3-6

def nn_model(X, Y, n_h, learning_rate, num_iterations=10000, print_cost=False):
    """Train the two-layer network with batch gradient descent.

    The input and output layer sizes are taken from the data (rows of ``X``
    and ``Y``) instead of being hard-coded, so the same function works for
    any number of features and classes.

    Args:
        X: input matrix of shape (n_x, m), one column per sample.
        Y: label matrix of shape (n_y, m), e.g. one-hot columns.
        n_h: number of hidden units.
        learning_rate: gradient-descent step size.
        num_iterations: index of the last iteration; the loop runs over
            ``0..num_iterations`` inclusive.
        print_cost: when True, print the cost every 1000 iterations.

    Returns:
        dict with the learned parameters "W1", "b1", "W2" and "b2".
    """
    n_x = X.shape[0]
    n_y = Y.shape[0]

    parameters = initialize_parameters(n_x, n_h, n_y)

    # The upper bound is inclusive so that the cost at iteration
    # `num_iterations` itself is reported when print_cost is on.
    for i in range(num_iterations + 1):
        A2, cache = forward_propagation(X, parameters)

        # The cost is only needed for reporting, so compute it on demand.
        # It is based on the pre-update forward pass.
        if print_cost and i % 1000 == 0:
            cost = compute_cost(Y, A2)
            print("The cost at iteration %i: %f" %(i, cost))

        grads = backward_propagation(parameters, cache, X, Y)
        parameters = update_parameters(parameters, grads, learning_rate)

    return parameters

In [123]:
# Train on the training split. Both arrays are transposed so that samples
# run along the columns, which is the layout nn_model works with.
parameters = nn_model(
    X_train.T,
    y_train.T,
    n_h=4,
    learning_rate=1.3,
    num_iterations=10000,
    print_cost=True,
)

# Show every learned parameter, one "<name> = <array>" entry each.
print("\n".join(key + " = " + str(parameters[key]) for key in ("W1", "b1", "W2", "b2")))

The cost at iteration 0: 1.098627
The cost at iteration 1000: 0.042644
The cost at iteration 2000: 0.037531
The cost at iteration 3000: 0.033430
The cost at iteration 4000: 0.029452
The cost at iteration 5000: 0.027663
The cost at iteration 6000: 0.026579
The cost at iteration 7000: 0.025805
The cost at iteration 8000: 0.025208
The cost at iteration 9000: 0.024726
The cost at iteration 10000: 0.024327
W1 = [[-1.42387912 -0.68658443  0.73778788 -6.82272974]
 [-2.265572    0.05452731  7.98809635  0.53323523]
 [-0.77271216  0.6228368  -1.50330811 -1.32618962]
 [-1.14269875 -5.46487212  3.92966822  1.48732248]]
b1 = [[ 5.42180995]
 [-3.55977204]
 [-1.33000135]
 [-3.30238793]]
W2 = [[ 1.89846014 -2.4741868   5.51984091 -3.06133006]
 [ 3.96673306 -2.88455483 -4.41426539 -2.11828805]
 [-5.87975426  5.36308956 -1.14012711  5.18148841]]
b2 = [[ 0.7523491 ]
 [ 0.02336949]
 [-0.77571859]]


Prediction & Post training analysis

In [103]:
# 8. Predict

def predict(parameters, X, y):
    """Print the classification accuracy of the trained network.

    Args:
        parameters: dict of trained parameters passed to the forward pass.
        X: input matrix with one column per sample.
        y: one-hot label matrix with one ROW per sample (not transposed,
            unlike ``X``).

    Returns:
        None. The accuracy is printed as a percentage with two decimals.
    """
    # The forward pass yields class probabilities with one column per sample;
    # the most probable row index is the predicted class.
    probabilities, _ = forward_propagation(X, parameters)
    predicted_classes = np.argmax(probabilities, axis=0)

    # Position of the 1 in each one-hot row is the true class.
    true_classes = np.argmax(y, axis=1)

    # Fraction of matching entries, expressed as a percentage.
    matches = predicted_classes == true_classes
    accuracy = np.mean(matches) * 100

    print(f"{accuracy:.2f}% accuracy")

In [124]:
# Accuracy on the training split. X is transposed to one column per sample,
# while y stays one row per sample because predict takes argmax along axis=1.
predict(parameters, X_train.T, y_train)

99.17% accuracy


In [125]:
# Accuracy on the held-out test split (same layout convention: X transposed,
# y left as one row per sample).
predict(parameters, X_test.T, y_test)

100.00% accuracy
