# I am stupid enough to forget, so I repeat to remember

In [None]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

### 1. fetch mnist dataset

In [None]:
def fetch_mnist():
    """Download (once) and load the MNIST ``.mat`` file.

    The file is cached at ``./datasets/mnist-original.mat``. The network is
    contacted only when that file does not exist yet, so repeated calls work
    offline.

    Returns:
        A 3-tuple ``(mnist, data, target)``:
        ``mnist`` is a dict with the keys ``'data'``, ``'target'``,
        ``'COL_NAMES'`` and ``'DESCR'``; ``data`` is ``mnist['data']`` (the
        transposed ``'data'`` array of the file) and ``target`` is
        ``mnist['target']`` (row 0 of the file's ``'label'`` array).

    Raises:
        urllib.error.URLError: if the file is not cached and the download
            fails.
    """
    ## load libs
    import os
    import urllib.request
    from scipy.io import loadmat

    ## make dir
    os.makedirs('datasets', exist_ok=True)

    ## download mnist, but only when there is no cached copy
    mnist_path = "./datasets/mnist-original.mat"
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    if not os.path.exists(mnist_path):
        # Fetch the whole payload before creating the file, so that a failed
        # download cannot leave an empty or truncated cache file behind.
        with urllib.request.urlopen(mnist_alternative_url) as response:
            content = response.read()
        with open(mnist_path, 'wb') as f:
            f.write(content)

    ## load mnist
    mnist_raw = loadmat(mnist_path)
    mnist = {'data': mnist_raw['data'].T,
             'target': mnist_raw['label'][0],
             'COL_NAMES' : ['label', 'data'],
             'DESCR': 'mldata.org dataset: mnist-original'}

    ## return mnist
    return mnist, mnist['data'], mnist['target']


### 2. Load and preprocess

In [None]:
## step a: fetch the dataset and report the raw array shapes
print('a. Loading....: ', end='')
mnist, X, y = fetch_mnist()
print('Original data shape', X.shape, ', target shape', y.shape)

## step b: lay X out with 28 * 28 = 784 rows and turn y into a single row
print('b. Reshaping..: ', end='')
X = np.reshape(X.T, (28 * 28, -1))
y = np.reshape(y, (1, -1))
print('Re-shaped data shape:', X.shape, ', target shape', y.shape)

## step c: report the value range, divide X by 255, report the range again
print('c. Scaling....: ', end='')
print('Scale Data to [0, 1]:')
range_before = (X.max(), X.min(), y.max(), y.min())
print('\t\tOriginal Range: Xmax-%d, Xmin-%d, ymax-%d, y-min-%d' % range_before)
X = X / 255
range_after = (X.max(), X.min(), y.max(), y.min())
## note: %d truncates the (now float) X extremes to integers when printing
print('\t\tScaled Range: Xmax-%d, Xmin-%d, ymax-%d, y-min-%d' % range_after)

### 3. Prepare Input

In [None]:
from sklearn.preprocessing import OneHotEncoder

## inputs are passed through unchanged
X = X

## one-hot encode the labels: the encoder gets one int8 label per row, and the
## dense result is transposed so that each column belongs to one sample
label_column = y.reshape(-1, 1).astype('int8')
encoder = OneHotEncoder()
Y = encoder.fit_transform(label_column).toarray().T
X.shape, Y.shape

### 4. Make train/test splits

In [None]:
## the first m columns form the training set, the remaining columns the test set
m = 60000
X_train = X[:, :m]
X_test = X[:, m:]
Y_train = Y[:, :m]
Y_test = Y[:, m:]
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

### 5. Shuffle train set

In [None]:
## one shared random column order keeps every image aligned with its label
## (the RNG seed is only set in a later cell, so this order is not seeded here)
shuffle = np.random.permutation(m)
X_train = X_train[:, shuffle]
Y_train = Y_train[:, shuffle]
X_train.shape, Y_train.shape

### 6. Visual check

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

## seed the global NumPy RNG, then pick one training-sample index below m
seed = 123456
np.random.seed(seed)
idx = np.random.randint(m)

## pull out that sample: its column becomes a 28 x 28 image, and the position
## of the largest entry in its one-hot column is the label
image = np.reshape(np.squeeze(X_train[:, idx]), (28, 28))
label = Y_train[:, idx].argmax()

## draw the image with its index and label in the title
plt.figure(figsize=(5, 5))
plt.imshow(image, cmap='binary_r')
plt.title('Visual Check: image index %d, label %d' % (idx, label))
plt.show()

### 7. Config MLP

In [None]:
## params
## size of the index pool the training loop samples each mini-batch from
num_samples = 60000
## number of distinct samples drawn per training iteration
batch_size  = 3000
## layer widths: W1 is created as (hidden_dims, input_dims) and
## W2 as (output_dims, hidden_dims)
input_dims  = 784
hidden_dims = 32
output_dims = 10
## step size: every gradient is multiplied by lr before being subtracted
lr = 0.01
## number of training-loop iterations; each iteration uses one random
## mini-batch, not a full pass over the training set
epoches = 1000
## the loss is printed whenever the iteration index is a multiple of this
display_step = 100

In [None]:
## parameters: weights drawn from a standard normal, biases start at zero
## layer 1 maps input_dims -> hidden_dims
W1 = np.random.standard_normal((hidden_dims, input_dims))
b1 = np.zeros(shape=(hidden_dims, 1))
## layer 2 maps hidden_dims -> output_dims
W2 = np.random.standard_normal((output_dims, hidden_dims))
b2 = np.zeros(shape=(output_dims, 1))

### 8. Make/Train Net

In [None]:
## run `epoches` training iterations; each one uses a single random mini-batch
for epoch in range(epoches):

    ## batching: draw batch_size distinct sample indices out of num_samples
    samples = np.random.choice(num_samples, batch_size, replace=False)
    X_batch = X_train[:, samples]
    Y_batch = Y_train[:, samples]

    ## Forward
    Z1 = W1 @ X_batch + b1       ## (hidden_dims, input_dims) @ (input_dims, batch_size)
    A1 = 1 / (1 + np.exp(-Z1))   ## element-wise sigmoid, (hidden_dims, batch_size)
    Z2 = W2 @ A1 + b2            ## (output_dims, hidden_dims) @ (hidden_dims, batch_size)

    ## softmax over the class axis (axis 0), computed in log space.
    ## Subtracting each column's max does not change the softmax value, but it
    ## keeps np.exp from overflowing to inf (which would turn A2 into nan).
    Z2_shifted = Z2 - Z2.max(axis=0, keepdims=True)
    log_A2 = Z2_shifted - np.log(np.exp(Z2_shifted).sum(axis=0, keepdims=True))
    A2 = np.exp(log_A2)          ## (output_dims, batch_size)

    ## compute cross-entropy loss
    ## loss = -sum(y_true * log(y_pred)) / batch_size
    ## log_A2 is used directly, so an underflowed probability cannot produce
    ## 0 * log(0) = nan
    Loss = -np.sum(Y_batch * log_A2) / batch_size

    ## Compute gradient
    ## NOTE(review): dW1/dW2 are summed over the batch while db1/db2 are
    ## averaged, so the weights effectively move with step lr * batch_size.
    ## Left unchanged on purpose: averaging dW would require re-tuning lr.
    dZ2 = A2 - Y_batch           ## gradient of softmax + cross-entropy w.r.t. Z2
    dW2 = dZ2 @ A1.T             ## (output_dims, batch_size) @ (batch_size, hidden_dims)
    db2 = dZ2.sum(axis=1, keepdims=True) / batch_size

    dA1 = W2.T @ dZ2             ## (hidden_dims, batch_size)
    dZ1 = dA1 * A1 * (1 - A1)    ## sigmoid derivative is A1 * (1 - A1)
    dW1 = dZ1 @ X_batch.T        ## (hidden_dims, batch_size) @ (batch_size, input_dims)
    db1 = dZ1.sum(axis=1, keepdims=True) / batch_size

    ## gradient-descent update (in place)
    W1 -= dW1 * lr
    W2 -= dW2 * lr
    b1 -= db1 * lr
    b2 -= db2 * lr

    if epoch % display_step == 0:
        print('Epoch %2d, Loss %.4f'%(epoch, Loss))

### 9. Test Network

In [None]:
## forward pass over the test columns with the current parameters
Z1 = W1 @ X_test + b1
A1 = 1 / (1 + np.exp(-Z1))
Z2 = W2 @ A1 + b2
## softmax over the class axis; shifting each column by its max leaves the
## result unchanged but prevents np.exp from overflowing to inf, which would
## make A2 nan and corrupt the argmax below
Z2_shifted = Z2 - Z2.max(axis=0, keepdims=True)
exp_Z2 = np.exp(Z2_shifted)
A2 = exp_Z2 / exp_Z2.sum(axis=0)

## generate results: most probable class per column vs. the one-hot truth
preds = np.argmax(A2, axis = 0)
truth = np.argmax(Y_test, axis = 0)


### 10. Calculate performance

In [None]:
## import sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

In [None]:
## confusion matrix of the test predictions, called as (y_true, y_pred);
## per scikit-learn's convention rows are true classes, columns predicted ones
confusion_matrix(truth, preds)

In [None]:
## text report of scikit-learn's per-class metrics for the test predictions
print(classification_report(truth, preds))

In [None]:
## accuracy score of preds against truth on the test set
print(accuracy_score(truth, preds))