In [112]:
import os
import pickle

import numpy as np
import pandas as pd

In [113]:
from sklearn.datasets import fetch_openml

# Download (or read from the local scikit-learn cache) the 784-feature MNIST set.
mnist = fetch_openml('mnist_784', version=1)
# Features: divide by 255 so every pixel value lies between 0 and 1.
X = mnist["data"] / 255.0
# Labels: convert the target column to small integers.
y = mnist["target"].astype(np.int8)

In [114]:
# First 60000 rows are used for training, the remainder for testing.
# The feature matrices are transposed so that each column is one sample.
X_train = np.array(X[:60000]).T
X_test = np.array(X[60000:]).T
y_train = np.array(y[:60000])
y_test = np.array(y[60000:])

# n = number of features (rows), m = number of training samples (columns).
n, m = X_train.shape

print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print(f"Testing labels shape: {y_test.shape}")

Training data shape: (784, 60000)
Training labels shape: (60000,)
Testing data shape: (784, 10000)
Testing labels shape: (10000,)


In [118]:
def init_params():
    """Create fresh parameters for the 784 -> 16 -> 16 -> 10 network.

    Weights use He initialization (standard normal scaled by
    sqrt(2 / fan_in)), which suits ReLU activations; biases start at zero.

    Returns:
        Tuple ``(w1, b1, w2, b2, w3, b3)`` where each ``w`` has shape
        (fan_out, fan_in) and each ``b`` has shape (fan_out, 1).
    """
    # (fan_out, fan_in) per layer, listed in the order the weights are drawn.
    layer_shapes = [(16, 784), (16, 16), (10, 16)]
    params = []
    for fan_out, fan_in in layer_shapes:
        params.append(np.random.randn(fan_out, fan_in) * np.sqrt(2. / fan_in))
        params.append(np.zeros((fan_out, 1)))
    return tuple(params)

def load_params():
    """Load trained parameters from disk, or initialize new ones.

    Reads ``mnist_trained_params.pkl`` from the current working directory.
    The file is expected to hold a dict with the keys ``'w1'``, ``'b1'``,
    ``'w2'``, ``'b2'``, ``'w3'`` and ``'b3'``.

    If the file is missing, or cannot be read as such a dict, a message is
    printed and fresh parameters from ``init_params()`` are returned.
    Programming errors (for example a missing import) are no longer
    swallowed and misreported as "file not found".

    Returns:
        Tuple ``(w1, b1, w2, b2, w3, b3)``.
    """
    try:
        with open('mnist_trained_params.pkl', 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code. Only load a
            # parameter file that you created yourself.
            params = pickle.load(f)
        w1, b1 = params['w1'], params['b1']
        w2, b2 = params['w2'], params['b2']
        w3, b3 = params['w3'], params['b3']
    except FileNotFoundError:
        print("mnist_trained_params.pkl not found. Initializing new parameters.")
        return init_params()
    except (OSError, EOFError, pickle.UnpicklingError, KeyError, TypeError) as err:
        # The file exists but is unreadable, corrupt, or has the wrong layout.
        print(f"Could not load mnist_trained_params.pkl ({err!r}). Initializing new parameters.")
        return init_params()

    print("Parameters loaded from mnist_trained_params.pkl")
    return w1, b1, w2, b2, w3, b3

def ReLu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    rectified = np.maximum(0, x)
    return rectified

def d_ReLu(x):
    """Derivative of ReLU as a boolean mask.

    The result is True where ``x`` is strictly positive and False elsewhere
    (so the derivative at exactly 0 is taken to be 0). Multiplying by the
    mask acts like multiplying by 1/0.
    """
    positive_mask = x > 0
    return positive_mask

def one_hot(Y, num_classes=None):
    """One-hot encode integer labels, one column per sample.

    Args:
        Y: 1-D array of non-negative integer class labels.
        num_classes: Number of rows (classes) in the output. When omitted it
            is inferred as ``Y.max() + 1``, which is only correct if the
            highest class actually occurs in ``Y``. Pass it explicitly when
            encoding a subset of the data.

    Returns:
        Float array of shape (num_classes, Y.size) with a single 1 in each
        column, at the row given by that sample's label.
    """
    Y = np.asarray(Y)
    if num_classes is None:
        # Convert to a Python int first so that small integer dtypes such as
        # int8 cannot overflow when 1 is added.
        num_classes = int(Y.max()) + 1
    encoded = np.zeros((num_classes, Y.size))
    encoded[Y, np.arange(Y.size)] = 1
    return encoded

def softmax(Z):
    """Column-wise softmax.

    Args:
        Z: Array of scores; the softmax is taken along axis 0, so for a
            2-D input each column is normalised independently.

    Returns:
        Array of the same shape as ``Z`` whose entries along axis 0 are
        non-negative and sum to 1. ``Z`` itself is left unmodified.
    """
    # Subtract the per-column max for numerical stability. This is done
    # out-of-place so the caller's pre-activation array is not overwritten.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

def get_predictions(A):
    """Return, for each column of ``A``, the row index of its largest entry."""
    predicted_classes = np.argmax(A, axis=0)
    return predicted_classes

def get_accuracy(predictions, Y):
    """Fraction of entries in ``predictions`` that equal the labels ``Y``.

    Also prints both arrays before computing the score.
    """
    print(predictions, Y)
    n_correct = np.sum(predictions == Y)
    return n_correct / Y.size

In [119]:
def forward(w1,b1,w2,b2,w3,b3,x):
    """Run one forward pass through the three-layer network.

    Layers 1 and 2 apply an affine map followed by ReLU; layer 3 applies an
    affine map followed by softmax.

    Returns:
        Tuple ``(z1, a1, z2, a2, z3, a3)`` of the pre-activations ``z`` and
        activations ``a`` of each layer, in layer order.
    """
    # Hidden layer 1
    z1 = np.dot(w1, x) + b1
    a1 = ReLu(z1)

    # Hidden layer 2
    z2 = np.dot(w2, a1) + b2
    a2 = ReLu(z2)

    # Output layer
    z3 = np.dot(w3, a2) + b3
    a3 = softmax(z3)

    return z1, a1, z2, a2, z3, a3

def update_params(w1, b1, w2, b2, w3, b3, dw1, db1, dw2, db2, dw3, db3, alpha):
    """Take one gradient-descent step on every parameter.

    Each parameter ``p`` with gradient ``dp`` becomes ``p - alpha * dp``.
    The inputs are not modified; new values are returned.

    Returns:
        Tuple ``(w1, b1, w2, b2, w3, b3)`` of updated parameters.
    """
    def descend(param, grad):
        return param - alpha * grad

    return (
        descend(w1, dw1), descend(b1, db1),
        descend(w2, dw2), descend(b2, db2),
        descend(w3, dw3), descend(b3, db3),
    )

def backpropagation(z1, a1, z2, a2, z3, a3, w1, w2, w3, x, y):
    """Compute parameter gradients for one batch.

    The output-layer error is ``a3 - y`` (softmax output minus one-hot
    targets); it is propagated back through the two ReLU hidden layers.
    Gradients are averaged over the samples in ``x``.

    Args:
        z1, a1, z2, a2, z3, a3: Values returned by ``forward`` for ``x``.
        w1, w2, w3: Current weight matrices (``w1`` is accepted for
            signature symmetry but not needed by the computation).
        x: Input batch with one sample per column.
        y: One-hot targets with the same shape as ``a3``.

    Returns:
        Tuple ``(dw1, db1, dw2, db2, dw3, db3)``.
    """
    # Use the size of THIS batch rather than a notebook-level global, so the
    # gradients are scaled correctly for any input (full set, test set,
    # mini-batch) and the function does not depend on another cell's state.
    n_samples = x.shape[1]

    # Output layer gradients
    dL_z3 = a3 - y
    dL_w3 = 1/n_samples * dL_z3.dot(a2.T)
    dL_b3 = 1/n_samples * np.sum(dL_z3, axis=1, keepdims=True)

    # 2nd hidden layer gradients
    dL_z2 = w3.T.dot(dL_z3) * d_ReLu(z2)
    dL_w2 = 1/n_samples * dL_z2.dot(a1.T)
    dL_b2 = 1/n_samples * np.sum(dL_z2, axis=1, keepdims=True)

    # 1st hidden layer gradients
    dL_z1 = w2.T.dot(dL_z2) * d_ReLu(z1)
    dL_w1 = 1/n_samples * dL_z1.dot(x.T)
    dL_b1 = 1/n_samples * np.sum(dL_z1, axis=1, keepdims=True)

    return dL_w1, dL_b1, dL_w2, dL_b2, dL_w3, dL_b3

def gradient_descent(X, Y, alpha, iterations):
    """Train the network with full-batch gradient descent.

    Starts from the parameters returned by ``load_params()`` and performs
    ``iterations`` forward/backward/update steps on the whole of ``X``.
    Every 100th iteration (starting at 0) the iteration number and the
    accuracy of that iteration's forward pass are printed.

    Args:
        X: Input batch with one sample per column.
        Y: Integer labels, one per column of ``X``.
        alpha: Learning rate.
        iterations: Number of update steps.

    Returns:
        Tuple ``(w1, b1, w2, b2, w3, b3)`` of trained parameters.
    """
    targets = one_hot(Y)
    params = load_params()
    for step in range(iterations):
        activations = forward(*params, X)
        # backpropagation takes only the weights (not the biases): indices 0, 2, 4.
        grads = backpropagation(*activations, params[0], params[2], params[4], X, targets)
        params = update_params(*params, *grads, alpha)
        if step % 100 == 0:
            print("Iteration: ", step)
            # The accuracy reflects the parameters used BEFORE this step's update.
            predictions = get_predictions(activations[-1])
            print("Accuracy: ", get_accuracy(predictions, Y))
    return params


In [120]:
# Continue training (or start from scratch if no saved parameters are found)
# on the full training set.
# NOTE(review): nothing visible here writes the resulting parameters back to
# mnist_trained_params.pkl — confirm whether a later cell saves them.
w1, b1, w2, b2, w3, b3 = gradient_descent(
    X_train,
    y_train,
    alpha=0.001,
    iterations=500,
)

Parameters loaded from mnist_trained_params.pkl
Iteration:  0
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
Accuracy:  0.9695833333333334
Iteration:  100
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
Accuracy:  0.9696166666666667
Iteration:  200
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
Accuracy:  0.9696166666666667
Iteration:  300
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
Accuracy:  0.9696166666666667
Iteration:  400
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
Accuracy:  0.9696166666666667
