In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the training set from a CSV in the current working directory.
# Per the preview printed by the next cell, it holds a `label` column
# followed by pixel0 ... pixel783.
data = pd.read_csv('train.csv')

In [3]:
# Preview the first rows to confirm the column layout before converting to NumPy.
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Convert the DataFrame into a plain NumPy array (one row per example) and
# record its dimensions: m rows, n columns.
data = np.array(data)
m, n = data.shape

# Seed NumPy's global RNG so the in-place row shuffle below -- and therefore
# the dev/train split taken from it, plus any later np.random draws -- is the
# same on every Restart & Run All.
np.random.seed(42)
np.random.shuffle(data)
print(f'Examples: {m} Features & Target: {n}')

Examples: 42000 Features & Target: 785


In [5]:
# Hold out the first 1000 (shuffled) rows as the dev set and transpose so
# that each column is one example: row 0 is the label, rows 1..n-1 the pixels.
data_val = data[0:1000, :].T
X_dev = data_val[1:n, :]
# Scale the dev pixels by 255 exactly as X_train is scaled; without this the
# dev inputs would be on a different scale than the data the model is fit on.
X_dev = X_dev / 255.0
Y_dev = data_val[0, :]

In [6]:
# Everything after the first 1000 rows is the training split; transpose so
# that each column is one example.
data_train = data[1000:m, :].T

# Row 0 carries the labels; the remaining rows are pixels, divided by 255.
Y_train = data_train[0, :]
X_train = data_train[1:n, :] / 255.0

In [7]:
# Sanity-check the split: feature matrices are (features, examples) and the
# label vectors have one entry per example. Y_dev holds the dev-split labels.
print(
    f'Train data: {X_train.shape} Train labels: {Y_train.shape}',
    f'Dev set data: {X_dev.shape} Test labels: {Y_dev.shape}',
    sep='\n',
)

Train data: (784, 41000) Train labels: (41000,)
Dev set data: (784, 1000) Test labels: (1000,)


In [8]:
def ReLU(Z):
    """Rectified linear unit: element-wise maximum of ``Z`` and zero."""
    floor = 0
    return np.maximum(Z, floor)

def derivative_ReLU(Z):
    """Gradient mask of ReLU: True where ``Z`` is strictly positive, else False.

    The boolean result acts as 1/0 when multiplied with a numeric array.
    """
    positive_mask = Z > 0
    return positive_mask

def softmax(Z):
    """Softmax along axis 0 (each column is normalised independently).

    Parameters
    ----------
    Z : ndarray
        Scores; for a 2-D array every column is treated as one example.

    Returns
    -------
    ndarray
        Array of the same shape as ``Z`` whose entries along axis 0 sum to 1.
    """
    # Subtract each column's own maximum, not the global maximum: softmax is
    # invariant to a per-column shift, and this guarantees every column has
    # at least one exp(0) == 1 term. With a global shift, a column far below
    # the global max underflows to all zeros and the division yields NaN.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=0, keepdims=True)

def init_params(size):
    """Draw initial weights and biases uniformly from [-0.5, 0.5).

    ``size`` is the number of input features. Both layers have 10 units, so
    the result is ``W1`` (10, size), ``b1`` (10, 1), ``W2`` (10, 10) and
    ``b2`` (10, 1), returned in that order.
    """
    hidden_units = output_units = 10

    def centered_uniform(rows, cols):
        # np.random.rand samples [0, 1); shift it to be centred on zero.
        return np.random.rand(rows, cols) - 0.5

    W1 = centered_uniform(hidden_units, size)
    b1 = centered_uniform(hidden_units, 1)
    W2 = centered_uniform(output_units, hidden_units)
    b2 = centered_uniform(output_units, 1)
    return W1, b1, W2, b2

def forward_propagation(X, W1, b1, W2, b2):
    """Run one forward pass through the two-layer network.

    With examples stored as the columns of ``X``, returns the tuple
    ``(Z1, A1, Z2, A2)``: hidden pre-activation, hidden ReLU activation,
    output pre-activation and output softmax, each with one column per example.
    """
    hidden_pre = np.dot(W1, X) + b1        # bias broadcasts across columns
    hidden_act = ReLU(hidden_pre)
    output_pre = np.dot(W2, hidden_act) + b2
    output_prob = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_prob

def one_hot(Y, num_classes=None):
    """One-hot encode integer labels, one column per example.

    Parameters
    ----------
    Y : array-like of int
        Class indices, one per example.
    num_classes : int, optional
        Number of rows (classes) in the result. Defaults to ``Y.max() + 1``,
        which under-counts when the highest class is absent from ``Y``; pass
        it explicitly when the result must match a fixed number of outputs.

    Returns
    -------
    ndarray of shape (num_classes, Y.size)
        Float matrix with a single 1 in each column, at row ``Y[j]``.

    Raises
    ------
    ValueError
        If ``Y`` is empty and ``num_classes`` is not given (no max exists).
    """
    Y = np.asarray(Y)
    if num_classes is None:
        num_classes = Y.max() + 1
    one_hot_Y = np.zeros((num_classes, Y.size))
    # Fancy indexing pairs row Y[j] with column j for every example j.
    one_hot_Y[Y, np.arange(Y.size)] = 1
    return one_hot_Y

def backward_propagation(X, Y, A1, A2, W2, Z1, m):
    """Compute the parameter gradients for one batch.

    ``Y`` holds integer labels, ``A1``/``A2``/``Z1`` are the values returned
    by the forward pass, and ``m`` is the number by which the summed
    gradients are divided. Returns ``(dW1, db1, dW2, db2)``; the weight
    gradients have the shape of their weight matrix, while the bias
    gradients are row sums (1-D when the inputs are 2-D).
    """
    scale = 1/m
    targets = one_hot(Y)

    # Output layer: for softmax with categorical cross-entropy the gradient
    # with respect to Z2 reduces to (A2 - targets).
    dZ2 = A2 - targets
    dW2 = scale * dZ2.dot(A1.T)
    db2 = scale * dZ2.sum(axis=1)

    # Hidden layer: push the error back through W2 and gate it by the ReLU mask.
    dZ1 = W2.T.dot(dZ2) * derivative_ReLU(Z1)
    dW1 = scale * dZ1.dot(X.T)
    db1 = scale * dZ1.sum(axis=1)

    return dW1, db1, dW2, db2

def update_params(alpha, W1, b1, W2, b2, dW1, db1, dW2, db2):
    """Apply one gradient-descent step and return the updated parameters.

    Parameters
    ----------
    alpha : float
        Step size multiplied into every gradient.
    W1, b1, W2, b2 : ndarray
        Current parameters. They are updated in place and also returned.
    dW1, db1, dW2, db2 : ndarray
        Gradients. ``db1``/``db2`` may be 1-D or column vectors; they are
        reshaped to the shape of the bias they update.

    Returns
    -------
    tuple of ndarray
        ``(W1, b1, W2, b2)`` after the step.
    """
    W1 -= alpha * dW1
    # Reshape to the bias's own shape rather than a hard-coded (10, 1), so
    # the update works for any layer width.
    b1 -= alpha * np.reshape(db1, b1.shape)
    W2 -= alpha * dW2
    b2 -= alpha * np.reshape(db2, b2.shape)

    return W1, b1, W2, b2

def get_predictions(A2):
    """Return, for each column of ``A2``, the row index of its largest entry."""
    predicted_classes = np.argmax(A2, axis=0)
    return predicted_classes

def get_accuracy(predictions, Y):
    """Fraction of entries in ``predictions`` that equal the matching label in ``Y``."""
    hits = predictions == Y
    return np.sum(hits) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Parameters
    ----------
    X : ndarray of shape (features, examples)
        Inputs, one example per column.
    Y : ndarray of int
        Label for each column of ``X``.
    alpha : float
        Step size passed to ``update_params``.
    iterations : int
        Number of gradient steps to take.

    Returns
    -------
    tuple of ndarray
        The trained parameters ``(W1, b1, W2, b2)``.

    Notes
    -----
    Progress is printed roughly ten times over the run. The accuracy shown
    comes from the forward pass made *before* that iteration's update.
    """
    size, m = X.shape

    # Report about every tenth of the run, but at least every iteration:
    # int(iterations / 10) is 0 for fewer than 10 iterations, which would
    # make the modulo below raise ZeroDivisionError.
    report_every = max(1, iterations // 10)

    W1, b1, W2, b2 = init_params(size)
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_propagation(X, W1, b1, W2, b2)

        dW1, db1, dW2, db2 = backward_propagation(X, Y, A1, A2, W2, Z1, m)

        W1, b1, W2, b2 = update_params(alpha, W1, b1, W2, b2, dW1, db1, dW2, db2)

        if (i + 1) % report_every == 0:
            print(f"Iteration: {i+1} / {iterations}")
            prediction = get_predictions(A2)
            print(f'{get_accuracy(prediction, Y):.3%}')
    return W1, b1, W2, b2


In [12]:
# Fit the network on the training split: step size (alpha) 0.15 for 500
# iterations; gradient_descent prints the training accuracy as it goes.
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.15, 500)

Iteration: 50 / 500
55.968%
Iteration: 100 / 500
72.832%
Iteration: 150 / 500
78.580%
Iteration: 200 / 500
81.227%
Iteration: 250 / 500
83.041%
Iteration: 300 / 500
84.200%
Iteration: 350 / 500
85.100%
Iteration: 400 / 500
85.841%
Iteration: 450 / 500
86.439%
Iteration: 500 / 500
87.024%
