In [1]:
import pandas as pd
import numpy as np

In [2]:
cols = (list(range(-1, 784)))
def prepare_ds(path, split=False, ratio=0.0):
    data = pd.read_csv(filepath_or_buffer=path, names=cols)
    data = data.sample(frac=1)
    if split == True:    
        d1X = np.array(data[:int(-data.shape[0] * ratio)][cols[1:]]).T
        d1Y = np.array(pd.get_dummies(data[:int(-data.shape[0] * ratio)][-1])[:]).T
        d2X = np.array(data[int(-data.shape[0] * ratio):][cols[1:]]).T
        d2Y = np.array(pd.get_dummies(data[int(-data.shape[0] * ratio):][-1])[:]).T
        return d1X, d1Y, d2X, d2Y
    if split == False:
        d1X = np.array(data[:][cols[1:]]).T
        d1Y = np.array(pd.get_dummies(data[:][-1])[:]).T
        return d1X, d1Y

In [3]:
data_path = '../digit-recognizer/mnist_train.csv'
test_path = '../digit-recognizer/mnist_test.csv'
train_X, train_Y, valid_X, valid_Y = prepare_ds(data_path, split=True, ratio=0.2)
test_X, test_Y, = prepare_ds(test_path)

In [4]:
m_train = train_X.shape[1]
m_test = valid_X.shape[1]
train_X.shape, train_Y.shape, valid_X.shape, valid_Y.shape

((784, 48000), (10, 48000), (784, 12000), (10, 12000))

In [5]:
def normalize(X, Y):
    X = (X - np.mean(X)) / (np.std(X))
    return X, Y

In [6]:
def compute_LogLoss(Yp, Yg):
    cost = -(Yg * np.log(Yp) + (1 - Yg) * np.log(1 - Yp))
    return np.mean(cost)

In [7]:
def forward(W, b, X, eval=False):
    Z = np.dot(W.T, X) + b
    A = 1. / (1 + np.exp(-Z))
    if eval == False:
        return A
    if eval == True:
        return np.argmax(A, axis=0)

In [38]:
def backward_mm(Yp, Yg, W, b, X, SdW_prev, Sdb_prev, learning_rate=0.001, beta=0.99):
    cost = compute_LogLoss(Yp.T, Yg.T)
    diff = Yp - Yg
    dW = np.dot(X, diff.T) / m_train
    db = np.sum(diff, axis=1, keepdims=True) / m_train

    SdW = beta * SdW_prev + (1 - beta) * (dW ** 2)
    Sdb = beta * Sdb_prev + (1 - beta) * (db ** 2)
    
    W -= learning_rate * dW / np.sqrt(SdW + 10**-8)
    b -= learning_rate * db / np.sqrt(Sdb + 10**-8)

    return cost, W, b, SdW, Sdb

In [39]:
def accuracy(y_pred, y_actu):
    y_actt = np.argmax(y_actu, axis=0)
    return np.mean((y_pred == y_actt))

In [40]:
W = np.zeros((784, 10))
b = np.zeros((10, 1))
mini_batch = 6000
prep_X, prep_Y = normalize(train_X, train_Y)   
for i in range(2000):
    SdW_prev, Sdb_prev = 0, 0
    pred_Y = forward(W, b, prep_X)
    cost, W, b, SdW_prev, Sdb_prev = backward_mm(pred_Y, prep_Y, W, b, prep_X, SdW_prev, Sdb_prev)
    if i % 100 == 0:
        tr_X, tr_Y = prep_X, prep_Y
        vl_X, vl_Y = normalize(valid_X, valid_Y)
        tr_P = forward(W, b, tr_X, eval=True)
        vl_P = forward(W, b, vl_X, eval=True)

        print(i, cost, accuracy(tr_P, tr_Y), accuracy(vl_P, vl_Y))

0 0.6931471805599451 0.4043541666666667 0.3993333333333333
100 0.18593674482618644 0.8394583333333333 0.83975
200 0.1821173713462767 0.8434791666666667 0.8448333333333333
300 0.17820278087718436 0.8465833333333334 0.8475
400 0.17469574680920383 0.8484583333333333 0.8496666666666667
500 0.1715803180515489 0.8506458333333333 0.8506666666666667
600 0.16879378577201415 0.8519791666666666 0.85225
700 0.166283974411588 0.85325 0.8545
800 0.16400949984299942 0.8544375 0.85625
900 0.16193710308443357 0.8555833333333334 0.8573333333333333
1000 0.1600393623928418 0.8565625 0.85875
1100 0.15829321721829964 0.8576666666666667 0.8596666666666667
1200 0.15667909634239158 0.8582916666666667 0.8605833333333334
1300 0.15518034713408738 0.8590625 0.8620833333333333
1400 0.15378280038624087 0.8599375 0.86275
1500 0.15247440978113067 0.8609375 0.863
1600 0.1512449394977835 0.8618541666666667 0.8635833333333334
1700 0.15008568705767047 0.8626041666666666 0.8639166666666667
1800 0.1489892395487013 0.863625 

In [15]:
val_prep_X, val_prep_Y = normalize(valid_X, valid_Y)
valid_preds = forward(W, b, val_prep_X, eval=True)
accuracy(valid_preds, val_prep_Y)

0.8925833333333333

In [16]:
test_prep_X, test_prep_Y = normalize(test_X, test_Y)
test_preds = forward(W, b, test_prep_X, eval=True)
accuracy(test_preds, test_prep_Y)

0.9006