In [48]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Read the training CSV straight into a NumPy array. Per the slicing below,
# row 0 of each transposed split is used as the labels and the remaining
# n - 1 rows as the features.
data = np.array(pd.read_csv('./train.csv'))
m, n = data.shape
np.random.shuffle(data)  # in-place row shuffle, done before the dev/train split

# Dev split: the first 1000 shuffled rows, transposed so each column is one example.
data_dev = data[:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n] / 255.  # scale feature values by 1/255

# Training split: every remaining row, laid out the same way.
data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n] / 255.
_, m_train = X_train.shape


In [49]:
def init_params():
    """Create randomly initialised weights and biases for all four layers.

    Every entry comes from ``np.random.rand`` shifted down by 0.5, so values
    lie in [-0.5, 0.5). Layer widths are 784 -> 64 -> 32 -> 16 -> 10.

    Returns:
        tuple: ``(W1, b1, W2, b2, W3, b3, W4, b4)`` where ``Wk`` has shape
        ``(fan_out, fan_in)`` and ``bk`` has shape ``(fan_out, 1)``.
    """
    widths = (784, 64, 32, 16, 10)
    params = []
    # Draw each layer's weights before its bias, layer by layer, so the global
    # RNG is consumed in the order W1, b1, W2, b2, W3, b3, W4, b4.
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        params.append(np.random.rand(fan_out, fan_in) - 0.5)
        params.append(np.random.rand(fan_out, 1) - 0.5)
    return tuple(params)


In [50]:
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    The textbook form calls ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) once ``x`` is a large negative number. Here the
    exponential is only ever taken of a non-positive value, so it stays in
    [0, 1]:

    * ``x >= 0``:  ``1 / (1 + exp(-x))``
    * ``x <  0``:  ``exp(x) / (1 + exp(x))``

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Element-wise sigmoid with the same shape as ``x`` (a NumPy scalar
        when ``x`` is a scalar).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # exp of a non-positive number: never overflows
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays untouched.
    return result[()]

In [51]:
def ReLU(Z):
    """Rectified linear unit: element-wise maximum of ``Z`` and 0.

    Args:
        Z: Array of pre-activation values.

    Returns:
        Array shaped like ``Z`` with every negative entry replaced by 0.
    """
    rectified = np.maximum(Z, 0)
    return rectified

In [52]:
def softmax(Z):
    """Column-wise softmax, computed in a numerically stable way.

    Each column of ``Z`` is treated as one example's logits. Before
    exponentiating, the column maximum is subtracted; this leaves the result
    mathematically unchanged but keeps ``exp`` from overflowing to ``inf``
    (which would otherwise produce ``inf / inf = nan`` for large logits).
    The exponential is also computed once rather than twice.

    Args:
        Z: Array of logits; normalisation runs along axis 0.

    Returns:
        Array shaped like ``Z`` whose entries along axis 0 are non-negative
        and sum to 1.
    """
    Z = np.asarray(Z)
    shifted = Z - np.max(Z, axis=0, keepdims=True)  # largest entry per column becomes 0
    exp_shifted = np.exp(shifted)
    A = exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)
    return A


In [53]:
# Forward propagation
def forward_prop(W1, b1, W2, b2, W3, b3, W4, b4, X):
    """Run the four-layer network forward on ``X``.

    Each layer computes ``Z = W.dot(A_prev) + b`` and then applies its
    activation: ReLU, sigmoid, ReLU, and softmax for layers 1 to 4.

    Args:
        W1, b1, W2, b2, W3, b3, W4, b4: Weights and biases of the four layers.
        X: Input passed to the first layer.

    Returns:
        tuple: ``(Z1, A1, Z2, A2, Z3, A3, Z4, A4)``, the pre-activation and
        activation of every layer in order.
    """
    layers = (
        (W1, b1, ReLU),
        (W2, b2, sigmoid),
        (W3, b3, ReLU),
        (W4, b4, softmax),
    )
    cache = []
    activation = X
    for weights, bias, nonlinearity in layers:
        pre_activation = weights.dot(activation) + bias
        activation = nonlinearity(pre_activation)
        cache.extend((pre_activation, activation))
    return tuple(cache)


In [54]:
# Backward propagation
def ReLU_deriv(Z):
    """Derivative of ReLU, returned as a boolean mask.

    Args:
        Z: Array of pre-activation values.

    Returns:
        Boolean array shaped like ``Z``: True where ``Z`` is strictly
        positive, False elsewhere (including at exactly 0).
    """
    is_active = 0 < Z
    return is_active


In [55]:
def sigmoid_deriv(z):
    """Derivative of the sigmoid at ``z``: ``s * (1 - s)`` with ``s = sigmoid(z)``.

    Args:
        z: Scalar or array of pre-activation values.

    Returns:
        Element-wise derivative, shaped like ``sigmoid(z)``.
    """
    s = sigmoid(z)
    complement = 1 - s
    return s * complement

In [56]:
def one_hot(Y, num_classes=None):
    """One-hot encode integer labels, one column per example.

    Args:
        Y: 1-D array of non-negative integer class labels.
        num_classes: Number of rows (classes) in the result. When omitted it
            is inferred as ``Y.max() + 1``, which is only correct if the
            highest class actually occurs in ``Y``; pass it explicitly for
            batches that may not contain every class.

    Returns:
        Float array of shape ``(num_classes, Y.size)`` with a single 1 in
        each column, at the row given by that example's label.

    Raises:
        ValueError: If any label is negative or not less than ``num_classes``
            (negative labels would otherwise wrap around silently).
    """
    Y = np.asarray(Y)
    if num_classes is None:
        num_classes = Y.max() + 1
    if Y.size and (Y.min() < 0 or Y.max() >= num_classes):
        raise ValueError(
            f"labels must lie in [0, {num_classes}); got range [{Y.min()}, {Y.max()}]"
        )
    one_hot_Y = np.zeros((Y.size, num_classes))
    one_hot_Y[np.arange(Y.size), Y] = 1
    return one_hot_Y.T

In [57]:
def backward_prop(Z1, A1, Z2, A2, Z3, A3, Z4, A4, W1, W2, W3, W4, X, Y):
    """Back-propagate through the four layers and return parameter gradients.

    The output error term is ``A4 - one_hot(Y)`` (the form the gradient takes
    for a softmax output paired with cross-entropy loss). It is pushed back
    through each earlier layer by multiplying with the next layer's transposed
    weights and the local activation derivative (ReLU, sigmoid, ReLU for
    layers 3, 2, 1). All gradients are averaged over ``m = X.shape[1]``.

    Args:
        Z1, A1, Z2, A2, Z3, A3, Z4, A4: Values produced by ``forward_prop``.
            ``Z4`` is accepted but not used.
        W1, W2, W3, W4: Layer weights. ``W1`` is accepted but not used.
        X: Network input; its second dimension is the averaging count.
        Y: Integer class labels, passed to ``one_hot``.

    Returns:
        tuple: ``(dW1, db1, dW2, db2, dW3, db3, dW4, db4)``.
    """
    targets = one_hot(Y)
    inv_m = 1 / X.shape[1]

    def layer_grads(delta, layer_input):
        # Averaged weight and bias gradients for one layer, given its error
        # term and the activations that fed into it.
        weight_grad = inv_m * delta.dot(layer_input.T)
        bias_grad = inv_m * np.sum(delta, axis=1, keepdims=True)
        return weight_grad, bias_grad

    # Output layer
    dZ4 = A4 - targets
    dW4, db4 = layer_grads(dZ4, A3)

    # Third layer (ReLU)
    dZ3 = W4.T.dot(dZ4) * ReLU_deriv(Z3)
    dW3, db3 = layer_grads(dZ3, A2)

    # Second layer (sigmoid)
    dZ2 = W3.T.dot(dZ3) * sigmoid_deriv(Z2)
    dW2, db2 = layer_grads(dZ2, A1)

    # First layer (ReLU)
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1, db1 = layer_grads(dZ1, X)

    return dW1, db1, dW2, db2, dW3, db3, dW4, db4


In [58]:
# Parameter updates
def update_params(W1, b1, W2, b2, W3, b3, W4, b4,
                  dW1, db1, dW2, db2, dW3, db3, dW4, db4,
                  alpha):
    """Apply one gradient-descent step to every weight and bias.

    Each parameter ``p`` with gradient ``g`` is updated with
    ``p -= alpha * g``. For NumPy arrays this modifies the caller's array
    in place; the same objects are also returned.

    Args:
        W1, b1, W2, b2, W3, b3, W4, b4: Current parameters.
        dW1, db1, dW2, db2, dW3, db3, dW4, db4: Matching gradients.
        alpha: Learning rate (step size).

    Returns:
        tuple: ``(W1, b1, W2, b2, W3, b3, W4, b4)`` after the update.
    """
    parameters = (W1, b1, W2, b2, W3, b3, W4, b4)
    gradients = (dW1, db1, dW2, db2, dW3, db3, dW4, db4)
    updated = []
    for param, grad in zip(parameters, gradients):
        param -= alpha * grad
        updated.append(param)
    return tuple(updated)


In [59]:
# Prediction and accuracy
def get_predictions(A4):
    """Pick the predicted class for every example.

    Args:
        A4: Output-layer activations, one column per example.

    Returns:
        1-D integer array holding, for each column of ``A4``, the row index
        of its largest entry.
    """
    predicted_labels = np.argmax(A4, axis=0)
    return predicted_labels

In [60]:
def get_accuracy(predictions, Y):
    """Fraction of predictions that equal the true labels.

    Args:
        predictions: Array of predicted labels.
        Y: Array of true labels; its ``size`` is the denominator.

    Returns:
        Number of element-wise matches divided by ``Y.size``.
    """
    matches = predictions == Y
    n_correct = np.sum(matches)
    return n_correct / Y.size

In [61]:
# Training the model
def gradient_descent(X, Y, alpha, iterations):
    """Train the network with full-batch gradient descent.

    Starting from ``init_params()``, each iteration runs a forward pass over
    all of ``X``, back-propagates, and takes one ``update_params`` step.
    Every 10th iteration (starting with iteration 0) the accuracy on
    ``(X, Y)`` is printed; it is computed from that iteration's forward pass,
    i.e. from the parameters as they were before that iteration's update.

    Args:
        X: Training inputs, passed unchanged to ``forward_prop``.
        Y: Training labels.
        alpha: Learning rate.
        iterations: Number of update steps to perform.

    Returns:
        tuple: ``(W1, b1, W2, b2, W3, b3, W4, b4)`` after the final step.
    """
    params = init_params()
    for i in range(iterations):
        W1, b1, W2, b2, W3, b3, W4, b4 = params
        cache = forward_prop(*params, X)
        grads = backward_prop(*cache, W1, W2, W3, W4, X, Y)
        params = update_params(*params, *grads, alpha)
        if i % 10 == 0:
            # cache[-1] is A4, the output-layer activations.
            accuracy = get_accuracy(get_predictions(cache[-1]), Y)
            print(f"Iteration {i}, Accuracy: {accuracy * 100:.2f}%")
    return params

In [79]:
# Train on the training split: learning rate 0.25, 2000 gradient-descent iterations
W1, b1, W2, b2, W3, b3, W4, b4 = gradient_descent(
    X_train, Y_train, alpha=0.25, iterations=2000
)

Iteration 0, Accuracy: 9.89%
Iteration 10, Accuracy: 31.06%
Iteration 20, Accuracy: 41.70%
Iteration 30, Accuracy: 49.56%
Iteration 40, Accuracy: 56.65%
Iteration 50, Accuracy: 62.19%
Iteration 60, Accuracy: 67.27%
Iteration 70, Accuracy: 70.83%
Iteration 80, Accuracy: 73.79%
Iteration 90, Accuracy: 75.82%
Iteration 100, Accuracy: 77.49%
Iteration 110, Accuracy: 78.86%
Iteration 120, Accuracy: 80.05%
Iteration 130, Accuracy: 81.05%
Iteration 140, Accuracy: 81.97%
Iteration 150, Accuracy: 82.79%
Iteration 160, Accuracy: 83.49%
Iteration 170, Accuracy: 84.04%
Iteration 180, Accuracy: 84.55%
Iteration 190, Accuracy: 84.95%
Iteration 200, Accuracy: 85.39%
Iteration 210, Accuracy: 85.79%
Iteration 220, Accuracy: 86.17%
Iteration 230, Accuracy: 86.55%
Iteration 240, Accuracy: 86.80%
Iteration 250, Accuracy: 87.10%
Iteration 260, Accuracy: 87.38%
Iteration 270, Accuracy: 87.62%
Iteration 280, Accuracy: 87.87%
Iteration 290, Accuracy: 88.11%
Iteration 300, Accuracy: 88.29%
Iteration 310, Accur

In [80]:
def make_predictions(X, W1, b1, W2, b2, W3, b3, W4, b4):
    """Predict a class label for every example in ``X``.

    Args:
        X: Inputs, passed unchanged to ``forward_prop``.
        W1, b1, W2, b2, W3, b3, W4, b4: Network parameters.

    Returns:
        The result of ``get_predictions`` applied to the output-layer
        activations.
    """
    forward_cache = forward_prop(W1, b1, W2, b2, W3, b3, W4, b4, X)
    output_activations = forward_cache[-1]  # A4 is the last of the eight values
    return get_predictions(output_activations)


In [81]:
def get_dev_accuracy(X_dev, Y_dev, W1, b1, W2, b2, W3, b3, W4, b4):
    """Print the network's accuracy on the dev set as a percentage.

    Args:
        X_dev: Dev-set inputs.
        Y_dev: Dev-set labels.
        W1, b1, W2, b2, W3, b3, W4, b4: Network parameters.

    Returns:
        None. The accuracy is only printed, with two decimal places.
    """
    accuracy = get_accuracy(
        make_predictions(X_dev, W1, b1, W2, b2, W3, b3, W4, b4),
        Y_dev,
    )
    print(f"Dev set accuracy: {accuracy * 100:.2f}%")

In [82]:
# Print accuracy on the dev split (the first 1000 shuffled rows, which were not
# passed to gradient_descent) using the parameters returned by training
get_dev_accuracy(X_dev, Y_dev, W1, b1, W2, b2, W3, b3, W4, b4)

Dev set accuracy: 94.40%
