In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
## Those are the libraries that we will use to create our model and train it.


In [19]:
##First, we will import the dataset.
data = pd.read_csv('train.csv')

#Must convert pandas dataframe to numpy array for matrix algebra.

data = data.to_numpy()
m,n = data.shape
print(m,n)

42000 785


In [25]:
#Now that we have the data, we will split it into the training and testing data.
np.random.shuffle(data)
train_data = data[:int(0.95*m),:]
test_data = data[int(0.95*m):,:]
# Using 95% of the data in training and 5% in testing. skewed split towards training since this is a well known dataset.

test_data.shape
# Gives us 2100 samples to test performance of our model.

(2100, 785)

In [64]:
#Now we will split the data into the input and output.
X_train = train_data[:,1:].T
Y_train = train_data[:,0].T
X_test = test_data[:,1:].T
Y_test = test_data[:,0].T

print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)


(784, 39900) (39900,)
(784, 2100) (2100,)


In [65]:
#One hot encoding the output.
from tkinter.tix import Y_REGION


def one_hot(Y):
    Y = Y.astype(int)
    Y_one_hot = np.zeros((Y.shape[0],10))
    for i in range(Y.shape[0]):
        Y_one_hot[i,Y[i]] = 1
    return Y_one_hot.T
one_hot(Y_train).shape


(10, 39900)

In [92]:
#  Model architecture
##  Input layer: 784 neurons (28x28 pixels) (784 inputs)
##  Hidden layer: 100 neurons (ReLU activation)
##  Output layer: 10 neurons (0-9) (Softmax activation) 



# Implementation:
#  step 1: Initialize weights and biases
#  step 2: Forward propagation
#  step 3: Backpropagation
#  step 4: Update weights and biases
 
 
def init_params():   # step 1  
    W1 = np.random.rand(10,784)
    b1 = np.zeros((10,1))
    W2 = np.random.rand(10,10)
    b2 = np.zeros((10,1))
    return W1,b1,W2,b2

def forward_prop(X_train, W1, b1, W2, b2): #step 2
    Z1 = np.dot(W1,X_train) + b1
    A1 = np.maximum(0,Z1) #ReLU activation: if Z1 is positive, else 0.
    Z2 = np.dot(W2,A1) + b2
    A2 = np.exp(Z2)/np.sum(np.exp(Z2),axis=0) #Softmax activation: derived from the softmax formuula (e^Z2/sum(e^Z2))
    return Z1,A1, Z2, A2

def backprop(X_train, Y_train, Z1,Z2,A1,A2, W2): #step 3
    one_hot_y = one_hot(Y_train)
    dZ2 = A2 - one_hot_y
    dW2 = 1/Y_train.size * np.dot(dZ2,A1.T)
    db2 = 1/Y_train.size * np.sum(dZ2,axis=1,keepdims=True)
    dZ1 = np.dot(W2.T,dZ2) * (A1 > 0) #ReLU derivative: if A1 is positive, else 0.
    dW1 = 1/Y_train.size * np.dot(dZ1,X_train.T)
    db1 = 1/Y_train.size * np.sum(dZ1,axis=1,keepdims=True)
    return dW1,db1,dW2,db2

def update_params(W1,b1,W2,b2,dW1,db1,dW2,db2,alpha): #step 4
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    return W1,b1,W2,b2
    


In [104]:
  #now implementing gradient descent.
def gradient_descent(X_train, Y_train, alpha, iter):
    W1,b1,W2,b2 = init_params()
    for i in range(iter):
      Z1,A1,Z2,A2 = forward_prop(X_train,W1,b1,W2,b2)
      dW1,db1,dW2,db2 = backprop(X_train, Y_train, Z1, Z2, A1, A2, W2)
      W1,b1,W2,b2 = update_params(W1,b1,W2,b2,dW1,db1,dW2,db2,alpha)
      ## Output logging to help debug and see the training progress.  
      if i % 5 == 0:
        print("Iteration: ",i) # prints iteration number every 10 iterations.
        print('prediction: ', np.argmax(A2,0).sum()) # prints prediction every 10 iterations.
    return W1,b1,W2,b2


In [105]:
W1,b1, W2,b2 = gradient_descent(X_train, Y_train, .1, 200)

  A2 = np.exp(Z2)/np.sum(np.exp(Z2),axis=0) #Softmax activation: derived from the softmax formuula (e^Z2/sum(e^Z2))
  A2 = np.exp(Z2)/np.sum(np.exp(Z2),axis=0) #Softmax activation: derived from the softmax formuula (e^Z2/sum(e^Z2))


Iteration:  0
prediction:  0
Iteration:  5
prediction:  0
Iteration:  10
prediction:  0
Iteration:  15
prediction:  0
Iteration:  20
prediction:  0
Iteration:  25
prediction:  0
Iteration:  30
prediction:  0
Iteration:  35
prediction:  0
Iteration:  40
prediction:  0
Iteration:  45
prediction:  0
Iteration:  50
prediction:  0
Iteration:  55
prediction:  0
Iteration:  60
prediction:  0
Iteration:  65
prediction:  0
Iteration:  70
prediction:  0
Iteration:  75
prediction:  0
Iteration:  80
prediction:  0
Iteration:  85
prediction:  0
Iteration:  90
prediction:  0
Iteration:  95
prediction:  0
Iteration:  100
prediction:  0
Iteration:  105
prediction:  0
Iteration:  110
prediction:  0
Iteration:  115
prediction:  0
Iteration:  120
prediction:  0
Iteration:  125
prediction:  0
Iteration:  130
prediction:  0
Iteration:  135
prediction:  0
Iteration:  140
prediction:  0
Iteration:  145
prediction:  0
Iteration:  150
prediction:  0
Iteration:  155
prediction:  0
Iteration:  160
prediction:  0

In [98]:
print (W1,b1,W2,b2)

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]] [[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]] [[nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]] [[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]
