In [59]:
## repeat for MLP

In [60]:
%matplotlib inline
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

In [61]:
# load mnist
# as_frame=False asks fetch_openml for plain NumPy arrays. Newer scikit-learn
# releases otherwise return pandas objects for this dataset, and the
# preprocessing cell calls ndarray-only methods such as `.reshape` on `y`.
# version=1 pins the dataset version so the download is reproducible.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist.data, mnist.target
X.shape, y.shape

((70000, 784), (70000,))

In [62]:
# preprocessing
# Put one sample per column and divide the pixel values by 255.0
# (presumably the maximum raw pixel intensity -- confirm against the dataset).
X = X.T / 255.0

# One-hot encode the integer-cast labels, then transpose so that each class
# is a row and each sample is a column, matching the layout of X.
labels_column = y.astype('int32').reshape(-1, 1)
encoder = OneHotEncoder()
y = encoder.fit_transform(labels_column).toarray().T

X.shape, y.shape

((784, 70000), (10, 70000))

In [63]:
# make train/test split
# The first m columns (samples) form the training set, the remainder the test set.
m = 60000
train_cols, test_cols = slice(0, m), slice(m, None)
X_train, y_train = X[:, train_cols], y[:, train_cols]
X_test, y_test = X[:, test_cols], y[:, test_cols]

# shuffle
# Seed NumPy's global RNG, then apply one shared permutation to the columns of
# both arrays so every training sample stays paired with its label.
seed = 123456
np.random.seed(seed)
shuffle = np.random.permutation(m)
X_train = X_train[:, shuffle]
y_train = y_train[:, shuffle]
X_train.shape, y_train.shape

((784, 60000), (10, 60000))

In [64]:
# build MLP: one hidden layer between the input and the output layer
n_samples = 60000
input_dims, hidden_dims, output_dims = 784, 64, 10

# weights/bias
# Weights are drawn from a standard normal distribution (W1 first, then W2,
# so the random stream is consumed in a fixed order); biases start at zero.
W1 = np.random.randn(hidden_dims, input_dims)
W2 = np.random.randn(output_dims, hidden_dims)
b1 = np.zeros(shape=(hidden_dims, 1))
b2 = np.zeros(shape=(output_dims, 1))

# initial learning rate
lr = 1

In [120]:
# training
# Full-batch gradient descent for a one-hidden-layer MLP: sigmoid hidden layer,
# softmax output, mean cross-entropy loss.
# NOTE: W1/b1/W2/b2 are updated in place, so re-running this cell resumes
# training from the current weights instead of starting from the initial ones.
lr = 0.1
n_epochs = 1000
log_every = 100
n_train = X_train.shape[1]  # samples are stored as columns of X_train

for ep in range(n_epochs):

    # forward pass
    Z1 = W1 @ X_train + b1      # (hidden_dims, n_train)
    A1 = 1 / (1 + np.exp(-Z1))  # sigmoid activations, (hidden_dims, n_train)
    Z2 = W2 @ A1 + b2           # class logits, (output_dims, n_train)

    # Numerically stable log-softmax over the class axis. Subtracting the
    # per-column max keeps every exponent <= 0 so np.exp cannot overflow, and
    # working in log space avoids log(0) = -inf (and 0 * -inf = nan in the
    # loss) when a probability underflows.
    Z2_shifted = Z2 - Z2.max(axis=0, keepdims=True)
    log_A2 = Z2_shifted - np.log(np.exp(Z2_shifted).sum(axis=0, keepdims=True))
    A2 = np.exp(log_A2)         # class probabilities, (output_dims, n_train)

    # calculate loss: mean cross-entropy over the training samples (scalar)
    L = -np.sum(y_train * log_A2) / n_train

    # backward pass
    # The 1/n_train factor of the mean loss is applied when forming the
    # parameter gradients (dW*, db*), not on dZ2 itself.
    dZ2 = A2 - y_train                              # (output_dims, n_train), softmax + cross-entropy gradient
    dW2 = dZ2 @ A1.T / n_train                      # (output_dims, hidden_dims)
    db2 = dZ2.sum(axis=1, keepdims=True) / n_train  # (output_dims, 1)

    dA1 = W2.T @ dZ2                                # (hidden_dims, n_train)
    dZ1 = dA1 * A1 * (1 - A1)                       # sigmoid'(Z1) = A1 * (1 - A1)
    dW1 = dZ1 @ X_train.T / n_train                 # (hidden_dims, input_dims)
    db1 = dZ1.sum(axis=1, keepdims=True) / n_train  # (hidden_dims, 1)

    # update W/b
    W1 -= lr * dW1
    W2 -= lr * dW2
    b1 -= lr * db1
    b2 -= lr * db2

    # print the loss only on logging epochs, one line each, so that the other
    # epochs do not emit stray padding spaces into the cell output
    if ep % log_every == 0:
        print('The loss at epoch #%2d is %2.4f' % (ep, L))
    
# test: forward pass on the held-out samples with the current weights
Z1 = W1 @ X_test + b1
A1 = 1 / (1 + np.exp(-Z1))
Z2 = W2 @ A1 + b2
# Stabilize the softmax by subtracting each column's max, not its sum.
# Softmax is invariant to any per-column shift, but only the max shift
# guarantees every exponent is <= 0; shifting by the sum can make all
# exponents overflow or underflow and yield nan probabilities.
Z2 = Z2 - Z2.max(axis = 0, keepdims = True)
A2 = np.exp(Z2)
A2 = A2/A2.sum(axis = 0, keepdims = True)

# results
# Convert class probabilities and one-hot targets back to class indices
# (argmax over the class axis, i.e. per column).
preds = A2.argmax(axis=0)
truth = y_test.argmax(axis=0)

# Print each metric under its heading, in the same order as before.
for heading, metric in (
    ('\n\nclassification_report', classification_report),
    ('\n\naccuracy_score', accuracy_score),
    ('\n\nconfusion_matrix', confusion_matrix),
):
    print(heading)
    print(metric(truth, preds))



The loss at epoch # 0 is 0.0526                                                                                                    
The loss at epoch #100 is 0.0525                                                                                                    
The loss at epoch #200 is 0.0525                                                                                                    
The loss at epoch #300 is 0.0525                                                                                                    
The loss at epoch #400 is 0.0525                                                                                                    
The loss at epoch #500 is 0.0525                                                                                                    
The loss at epoch #600 is 0.0524                                                                                                    
The loss at epoch #700 is 0.0524                                     