In [1]:
import pandas as pd
import numpy as np

In [2]:
cols = (list(range(-1, 784)))
def prepare_ds(path, split=False, ratio=0.0):
    data = pd.read_csv(filepath_or_buffer=path, names=cols)
    data = data.sample(frac=1)
    if split == True:    
        d1X = np.array(data[:int(-data.shape[0] * ratio)][cols[1:]]).T
        d1Y = np.array(pd.get_dummies(data[:int(-data.shape[0] * ratio)][-1])[:]).T
        d2X = np.array(data[int(-data.shape[0] * ratio):][cols[1:]]).T
        d2Y = np.array(pd.get_dummies(data[int(-data.shape[0] * ratio):][-1])[:]).T
        return d1X, d1Y, d2X, d2Y
    if split == False:
        d1X = np.array(data[:][cols[1:]]).T
        d1Y = np.array(pd.get_dummies(data[:][-1])[:]).T
        return d1X, d1Y

In [3]:
data_path = '../digit-recognizer/mnist_train.csv'
test_path = '../digit-recognizer/mnist_test.csv'
train_X, train_Y, valid_X, valid_Y = prepare_ds(data_path, split=True, ratio=0.2)
test_X, test_Y, = prepare_ds(test_path)

In [4]:
m_train = train_X.shape[1]
m_test = valid_X.shape[1]
train_X.shape, train_Y.shape, valid_X.shape, valid_Y.shape

((784, 48000), (10, 48000), (784, 12000), (10, 12000))

In [5]:
def normalize(X, Y):
    X = (X - np.mean(X)) / (np.std(X))
    return X, Y

In [6]:
def compute_LogLoss(Yp, Yg):
    cost = -(Yg * np.log(Yp) + (1 - Yg) * np.log(1 - Yp))
    return np.mean(cost)

In [7]:
def forward(W, b, X, eval=False):
    Z = np.dot(W.T, X) + b
    A = 1. / (1 + np.exp(-Z))
    if eval == False:
        return A
    if eval == True:
        return np.argmax(A, axis=0)

In [16]:
def backward_adam(Yp, Yg, W, b, X, VdW_prev, Vdb_prev, SdW_prev, Sdb_prev, learning_rate=0.001, beta=0.99):
    cost = compute_LogLoss(Yp.T, Yg.T)
    diff = Yp - Yg
    dW = np.dot(X, diff.T) / m_train
    db = np.sum(diff, axis=1, keepdims=True) / m_train

    VdW = beta * VdW_prev + (1 - beta) * dW
    Vdb = beta * Vdb_prev + (1 - beta) * db

    SdW = beta * SdW_prev + (1 - beta) * (dW ** 2)
    Sdb = beta * Sdb_prev + (1 - beta) * (db ** 2)
    
    W -= learning_rate * VdW / np.sqrt(SdW + 10**-8)
    b -= learning_rate * Vdb / np.sqrt(Sdb + 10**-8)

    return cost, W, b, VdW, Vdb, SdW, Sdb

In [17]:
def accuracy(y_pred, y_actu):
    y_actt = np.argmax(y_actu, axis=0)
    return np.mean((y_pred == y_actt))

In [19]:
W = np.zeros((784, 10))
b = np.zeros((10, 1))
mini_batch = 6000
prep_X, prep_Y = normalize(train_X, train_Y)
for i in range(1001):
    VdW_prev, Vdb_prev = 0, 0
    SdW_prev, Sdb_prev = 0, 0
    pred_Y = forward(W, b, prep_X)
    cost, W, b, VdW_prev, Vdb_prev, SdW_prev, Sdb_prev = backward_adam(pred_Y, prep_Y, W, b, prep_X, VdW_prev, Vdb_prev, SdW_prev, Sdb_prev)
    if i % 100 == 0:
        tr_X, tr_Y = prep_X, prep_Y
        vl_X, vl_Y = normalize(valid_X, valid_Y)
        tr_P = forward(W, b, tr_X, eval=True)
        vl_P = forward(W, b, vl_X, eval=True)

        print(i, cost, accuracy(tr_P, tr_Y), accuracy(vl_P, vl_Y))

0 0.6931471805599451 0.4043333333333333 0.40391666666666665
100 0.23895079686061724 0.7532083333333334 0.7551666666666667
200 0.15841948548062865 0.8145625 0.81725
300 0.121698255168277 0.848125 0.8488333333333333
400 0.1025564355843951 0.8685625 0.8720833333333333
500 0.09142674146749302 0.8802708333333333 0.8831666666666667
600 0.08441277273079045 0.8884791666666667 0.8903333333333333
700 0.07973887140525278 0.8945625 0.8964166666666666
800 0.0764889632246929 0.8993541666666667 0.8993333333333333
900 0.07414064162725889 0.9022916666666667 0.9021666666666667


In [20]:
val_prep_X, val_prep_Y = normalize(valid_X, valid_Y)
valid_preds = forward(W, b, val_prep_X, eval=True)
accuracy(valid_preds, val_prep_Y)

0.9043333333333333

In [21]:
test_prep_X, test_prep_Y = normalize(test_X, test_Y)
test_preds = forward(W, b, test_prep_X, eval=True)
accuracy(test_preds, test_prep_Y)

0.9094