In [20]:
from keras.datasets import mnist
import numpy as np

In [6]:
(X_train_orig, Y_train), (X_test_orig, Y_test) = mnist.load_data()

In [8]:
print(X_train_orig.shape)
print(Y_train.shape)

(60000, 28, 28)
(60000,)


In [36]:
# turn X_train and X_test into 2-dim matrix
X_train = X_train_orig.reshape(X_train_orig.shape[1] * X_train_orig.shape[2], X_train_orig.shape[0])
X_test = X_test_orig.reshape(X_test_orig.shape[1] * X_test_orig.shape[2], X_test_orig.shape[0])

print(X_train.shape)
print(X_test.shape)

# normalize data from 0-255 to 0-1
X_train_norm = X_train / 255
X_test_norm = X_test / 255

(784, 60000)
(784, 10000)


In [99]:
X_train_norm.shape

(784, 60000)

Each epoch will contain:

1) calculate Z (linear)

2) calculate A (sigmoid)

3) apply cost fuction (log-loss)

4) take derivative with respect to every w and b

5) apply corrections with learning rate


In [135]:
# initialize W and b to zeros
# b is just 0, W is shape (m, 1)

def initialize_W_b(X):
    W = np.zeros((X.shape[0], 1))
    b = 0
    
    return W, b

In [136]:
def propagate(X, Y, W, b):
    # m = number of cases
    m = X.shape[1]
    
    # 1) calculate Z (linear)
    # 1.1) Z = W^T dot X + b
    Z = np.dot(np.transpose(W), X) + b
    
    # 2) calcualte A (sigmoid)
    A = 1/(1 + np.exp(Z))
    
    # 3) apply cost function (log-loss)
    # 3.1) L = -y(log(a)) - (1-y)log(1-a)
    # 3.2) cost = -(1/m) sum(y(log(a)) + (1-y)log(1-a))
    cost = -(1 / m) * np.sum((Y * np.log(A)) + ((1 - Y) * np.log(1 - A)))
    
    # 4) take derivative with respec to every w and b
    # 4.1) d_w = (1/m) * x(a - y); d_b = (1/m) * (a-y)
    d_W = (1 / m) * np.dot(X, np.transpose(A - Y))
    d_b = (1 / m) * np.sum(A - Y)
    
    return d_W, d_b, cost

In [137]:
def optimize(X, Y, W, b, learning_rate=0.01, epochs=100):
    
    for i in range(epochs):
        d_W, d_b, cost = propagate(X, Y, W, b)
        
        # 5) apply corrections with learning rate
        W = W - (learning_rate * d_W)
        b = b - (learning_rate * d_b)
        
        if((i+1)%10 == 0):
            print(f'After epoch {i + 1}, cost = {cost}')
    
    return W, b

In [138]:
W, b = initialize_W_b(X_train_norm)

In [139]:
d_W, d_b = optimize(X_train_norm, Y_train, W, b, learning_rate=0.001, epochs=100)

After epoch 10, cost = 96.82477916362095
After epoch 20, cost = 62.264034165609026
After epoch 30, cost = 40.75149577276785
After epoch 40, cost = 28.915731517642893
After epoch 50, cost = 23.384961138338426
After epoch 60, cost = 21.61584617214714
After epoch 70, cost = 21.97334665533013
After epoch 80, cost = 23.497031548772878
After epoch 90, cost = 25.649935637986662
After epoch 100, cost = 28.139608375583062
