In [20]:
from keras.datasets import mnist
import numpy as np

In [6]:
(X_train_orig, Y_train), (X_test_orig, Y_test) = mnist.load_data()

In [8]:
print(X_train_orig.shape)
print(Y_train.shape)

(60000, 28, 28)
(60000,)


In [168]:
# Y_train array of only zero digits
Y_train_zeros = Y_train
for i, digit in enumerate(Y_train):
    if digit == 0:
        Y_train_zeros[i] = 1
    else:
        Y_train_zeros[i] = 0

In [169]:
Y_train_zeros

array([0, 1, 0, ..., 0, 0, 0], dtype=uint8)

In [36]:
# turn X_train and X_test into 2-dim matrix
X_train = X_train_orig.reshape(X_train_orig.shape[1] * X_train_orig.shape[2], X_train_orig.shape[0])
X_test = X_test_orig.reshape(X_test_orig.shape[1] * X_test_orig.shape[2], X_test_orig.shape[0])

print(X_train.shape)
print(X_test.shape)

# normalize data from 0-255 to 0-1
X_train_norm = X_train / 255
X_test_norm = X_test / 255

(784, 60000)
(784, 10000)


In [99]:
X_train_norm.shape

(784, 60000)

Each epoch will contain:

1) calculate Z (linear)

2) calculate A (sigmoid)

3) apply cost fuction (log-loss)

4) take derivative with respect to every w and b

5) apply corrections with learning rate


In [170]:
# initialize W and b to zeros
# b is just 0, W is shape (m, 1)

def initialize_W_b(X):
    W = np.zeros((X.shape[0], 1))
    b = 0
    
    return W, b

In [171]:
def propagate(X, Y, W, b):
    # m = number of cases
    m = X.shape[1]
    
    # 1) calculate Z (linear)
    # 1.1) Z = W^T dot X + b
    Z = np.dot(np.transpose(W), X) + b
    
    # 2) calcualte A (sigmoid)
    A = 1/(1 + np.exp(Z))
    # 3) apply cost function (log-loss)
    # 3.1) L = -y(log(a)) - (1-y)log(1-a)
    # 3.2) cost = -(1/m) sum(y(log(a)) + (1-y)log(1-a))
    cost = -(1 / m) * np.sum((Y * np.log(A)) + ((1 - Y) * np.log(1 - A)))
    
    # 4) take derivative with respec to every w and b
    # 4.1) d_w = (1/m) * x(a - y); d_b = (1/m) * (a-y)
    d_W = (1 / m) * np.dot(X, np.transpose(A - Y))
    d_b = (1 / m) * np.sum(A - Y)
    
    return d_W, d_b, cost

In [172]:
def optimize(X, Y, W, b, learning_rate=0.01, epochs=100):
    
    for i in range(epochs):
        d_W, d_b, cost = propagate(X, Y, W, b)
        
        # 5) apply corrections with learning rate
        W = W - (learning_rate * d_W)
        b = b - (learning_rate * d_b)
        
        if((i+1)%10 == 0):
            print(f'After epoch {i + 1}, cost = {cost}')
    
    return W, b

In [178]:
W, b = initialize_W_b(X_train_norm)

In [179]:
d_W, d_b = optimize(X_train_norm, Y_train_zeros, W, b, learning_rate=0.0001, epochs=1000)

After epoch 10, cost = 0.6952533412432762
After epoch 20, cost = 0.6976097451190808
After epoch 30, cost = 0.6999833484005908
After epoch 40, cost = 0.7023742759107157
After epoch 50, cost = 0.7047826530613372
After epoch 60, cost = 0.707208605847334
After epoch 70, cost = 0.709652260840356
After epoch 80, cost = 0.712113745182357
After epoch 90, cost = 0.7145931865788722
After epoch 100, cost = 0.7170907132920399
After epoch 110, cost = 0.7196064541333662
After epoch 120, cost = 0.7221405384562256
After epoch 130, cost = 0.7246930961480958
After epoch 140, cost = 0.7272642576225224
After epoch 150, cost = 0.7298541538108119
After epoch 160, cost = 0.732462916153445
After epoch 170, cost = 0.7350906765912139
After epoch 180, cost = 0.7377375675560718
After epoch 190, cost = 0.7404037219616997
After epoch 200, cost = 0.7430892731937799
After epoch 210, cost = 0.7457943550999792
After epoch 220, cost = 0.7485191019796327
After epoch 230, cost = 0.7512636485731315
After epoch 240, cost = 

In [151]:
a = np.array([[1, 2, 3]])
x = np.array([[10, 100, 1000],
             [20, 200, 2000],
             [30, 300, 3000]])
print(np.dot(x, np.transpose(a)))

[[3210]
 [6420]
 [9630]]
