In [1]:
from sklearn.datasets import fetch_openml
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [2]:
# Fetching the MNIST dataset
mnist = fetch_openml('mnist_784')
X = mnist.data 
y = mnist.target

# scaling 
X = X/255

# one-hot encode labels
digits = 10
examples = y.shape[0]
y = y.reshape(1, examples)
Y_new = np.eye(digits)[y.astype('int32')]
Y_new = Y_new.T.reshape(digits, examples)

# Preparing the dataset by split, reshape in train and test
m = 60000
m_test = X.shape[0] - m
X_train, X_test = X[:m].T, X[m:].T
Y_train, Y_test = Y_new[:,:m], Y_new[:,m:]

# shuffing training set
shuffle_index = np.random.permutation(m)
X_train, Y_train = X_train[:, shuffle_index], Y_train[:, shuffle_index]

In [3]:
# Standard sigmoid function for activation and cross-entropy function for loss

def sigmoid(z):
    s = 1. / (1. + np.exp(-z))
    return s

def compute_loss(Y, Y_hat):

    L_sum = np.sum(np.multiply(Y, np.log(Y_hat)))
    m = Y.shape[1]
    L = -(1./m) * L_sum

    return L

In [4]:
# Declaring Feed forward neural network

def feed_forward(X, params):

    cache = {}

#     Z = (W.X)+b

#     Softmax: F(x) = exp(x[i])/sum(exp(x[j]))
    
    cache["Z1"] = np.matmul(params["W1"], X) + params["b1"]
    cache["A1"] = sigmoid(cache["Z1"])
    cache["Z2"] = np.matmul(params["W2"], cache["A1"]) + params["b2"]
    cache["A2"] = np.exp(cache["Z2"]) / np.sum(np.exp(cache["Z2"]), axis=0)

    return cache

In [5]:
# Declaring Back propogation using mini batches with Stochastic gradient descent(SGD)

def back_propagate(X, Y, params, cache):

#     Error handling
    dZ2 = cache["A2"] - Y
    
#     gradients at last layer
    dW2 = (1./m_batch) * np.matmul(dZ2, cache["A1"].T)
    db2 = (1./m_batch) * np.sum(dZ2, axis=1, keepdims=True)

#     Backprop in 1st layer
    dA1 = np.matmul(params["W2"].T, dZ2)
    dZ1 = dA1 * sigmoid(cache["Z1"]) * (1 - sigmoid(cache["Z1"]))
    
#     gradients at 1st layer
    dW1 = (1./m_batch) * np.matmul(dZ1, X.T)
    db1 = (1./m_batch) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return grads

In [9]:
# params["W1"] = params["W1"] - learning_rate * grads["dW1"]

# Both are same, but below beta is the SGD with momentum

# V_dW1 = (beta * V_dW1 + (1. - beta) * grads["dW1"])
# params["W1"] = params["W1"] - learning_rate * V_dW1

np.random.seed(138)

# hyperparameters
n_x = X_train.shape[0]
n_h = 64
learning_rate = 0.5
beta = .9  # momentum
batch_size = 128
epochs = 20
batches = -(-m // batch_size)

# Initialization
params = { "W1": np.random.randn(n_h, n_x) * np.sqrt(1. / n_x),
           "b1": np.zeros((n_h, 1)) * np.sqrt(1. / n_x),
           "W2": np.random.randn(digits, n_h) * np.sqrt(1. / n_h),
           "b2": np.zeros((digits, 1)) * np.sqrt(1. / n_h) }

V_dW1 = np.zeros(params["W1"].shape)
V_db1 = np.zeros(params["b1"].shape)
V_dW2 = np.zeros(params["W2"].shape)
V_db2 = np.zeros(params["b2"].shape)

In [10]:
# Network Training
for i in range(epochs):

#     shuffling training set
    permutation = np.random.permutation(X_train.shape[1])
    X_train_shuffled = X_train[:, permutation]
    Y_train_shuffled = Y_train[:, permutation]

    for j in range(batches):

#         randomly selected minibatch
        begin = j * batch_size
        end = min(begin + batch_size, X_train.shape[1] - 1)
        X = X_train_shuffled[:, begin:end]
        Y = Y_train_shuffled[:, begin:end]
        m_batch = end - begin

#         initializing forward and backward props
        cache = feed_forward(X, params)
        grads = back_propagate(X, Y, params, cache)

#         grads with momentum
        V_dW1 = (beta * V_dW1 + (1. - beta) * grads["dW1"])
        V_db1 = (beta * V_db1 + (1. - beta) * grads["db1"])
        V_dW2 = (beta * V_dW2 + (1. - beta) * grads["dW2"])
        V_db2 = (beta * V_db2 + (1. - beta) * grads["db2"])

#         SGD
        params["W1"] = params["W1"] - learning_rate * V_dW1
        params["b1"] = params["b1"] - learning_rate * V_db1
        params["W2"] = params["W2"] - learning_rate * V_dW2
        params["b2"] = params["b2"] - learning_rate * V_db2

#     forward pass on training set
    cache = feed_forward(X_train, params)
    train_loss = compute_loss(Y_train, cache["A2"])
    
#     forward pass on test set
    cache = feed_forward(X_test, params)
    test_loss = compute_loss(Y_test, cache["A2"])
    print("Epoch {}: training cost = {}, test cost = {}".format(i+1 ,train_loss, test_loss))

print("Done.")

Epoch 1: training cost = 0.3084223702853005, test cost = 0.2978109564364632
Epoch 2: training cost = 0.24327463082529338, test cost = 0.23978787281031289
Epoch 3: training cost = 0.209208451990476, test cost = 0.2106049360076865
Epoch 4: training cost = 0.18810927475568814, test cost = 0.19237124789096477
Epoch 5: training cost = 0.16608990732388398, test cost = 0.17274985092920264
Epoch 6: training cost = 0.1451922797303129, test cost = 0.15400834945856123
Epoch 7: training cost = 0.1323464361016679, test cost = 0.1433670465491995
Epoch 8: training cost = 0.12078889104633907, test cost = 0.13309185842906127
Epoch 9: training cost = 0.11168678291167111, test cost = 0.12498490972292169
Epoch 10: training cost = 0.10604296311955042, test cost = 0.12035072951968126
Epoch 11: training cost = 0.09677985477245561, test cost = 0.11402747997322173
Epoch 12: training cost = 0.09124800813217832, test cost = 0.11088962064979685
Epoch 13: training cost = 0.08627200664290781, test cost = 0.10753049

In [13]:
cache = feed_forward(X_test, params)
predictions = np.argmax(cache["A2"], axis=0)
labels = np.argmax(Y_test, axis=0)

print(classification_report(predictions, labels))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       997
           1       0.99      0.99      0.99      1136
           2       0.97      0.97      0.97      1036
           3       0.98      0.97      0.97      1019
           4       0.97      0.97      0.97       984
           5       0.95      0.98      0.96       870
           6       0.97      0.97      0.97       958
           7       0.97      0.97      0.97      1029
           8       0.97      0.96      0.97       980
           9       0.95      0.97      0.96       991

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000

