In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
## Load the CSV and view it as a plain NumPy array.
data = pd.read_csv('./fashion-mnist_test.csv')
x = data.values

## Column 0 is used as the label; every remaining column is a feature.
y = x[:, 0]
X = x[:, 1:] / 255      ## scale features (presumably 0-255 pixel values -- confirm)

print(X.shape, y.shape)

(10000, 784) (10000,)


In [4]:
## Sequential hold-out split: the first 8000 rows are used for training,
## everything after them for testing (no shuffling is done here).

X_train, X_test = X[:8000], X[8000:]
y_train, y_test = y[:8000], y[8000:]

for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(8000, 784) (8000,)
(2000, 784) (2000,)


In [5]:
## Sorted array of the distinct label values found in y.
classes = np.unique(y)
print(classes)

[0 1 2 3 4 5 6 7 8 9]


In [6]:
Input_layer = 784                    ## number of input features per example
H1_size = 256                        ## width of the 1st hidden layer
H2_size = 64                         ## width of the 2nd hidden layer
Out = 10                             ## number of output units: one per class to be predicted

Batch_size = 200                     ## number of examples taken from the data at once (one batch)
Epoch = 10                           ## number of times the training data is passed over
LR = 0.004                           ## learning rate applied to every gradient step in back_prop

In [7]:
## initialising the weights and biases randomly

def __init__():
    """Build a freshly initialised parameter dictionary for the 3-layer network.

    Layer widths are read from the module-level constants ``Input_layer``,
    ``H1_size``, ``H2_size`` and ``Out``.  NumPy's global random generator is
    re-seeded with 0 first, so repeated calls return identical weights.

    Returns:
        dict: keys ``"w1"``, ``"b1"``, ``"w2"``, ``"b2"``, ``"w3"``, ``"b3"``.
        Each ``w<k>`` has shape (fan_in, fan_out) and holds standard-normal
        draws divided by sqrt(fan_in); each ``b<k>`` is a (1, fan_out) array
        of zeros.
    """
    np.random.seed(0)

    layer_dims = (Input_layer, H1_size, H2_size, Out)
    params = {}

    # Walk over consecutive (fan_in, fan_out) pairs: layer 1, then 2, then 3.
    for idx, (fan_in, fan_out) in enumerate(zip(layer_dims[:-1], layer_dims[1:]), start=1):
        # Rule of thumb: shrink the random weights by sqrt(number of inputs).
        params["w" + str(idx)] = np.random.randn(fan_in, fan_out) / np.sqrt(fan_in)
        params["b" + str(idx)] = np.zeros((1, fan_out))

    return params

In [8]:
def forward_prop(model,X):
    """Run a forward pass through the two tanh hidden layers and the softmax output.

    Args:
        model: dict holding the parameters ``"w1"``, ``"b1"``, ``"w2"``,
            ``"b2"``, ``"w3"``, ``"b3"``.
        X: 2-D array of shape (n_samples, n_features); n_features must equal
            the number of rows of ``model["w1"]``.

    Returns:
        tuple ``(a1, a2, y_out)``: the activations of the first and second
        hidden layers and the softmax class probabilities.  Every row of
        ``y_out`` sums to 1.
    """
    # Each layer is an affine map (dot product first, THEN add the bias)
    # and must consume the previous layer's activations, not the raw input.
    z1 = X.dot(model["w1"]) + model["b1"]
    a1 = np.tanh(z1)
    z2 = a1.dot(model["w2"]) + model["b2"]
    a2 = np.tanh(z2)
    z3 = a2.dot(model["w3"]) + model["b3"]

    ## taking softmax of the z3
    # Subtracting the per-row maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing to inf (and the result from becoming nan).
    shifted = z3 - np.max(z3, axis=1, keepdims=True)
    h_x = np.exp(shifted)
    y_out = h_x / np.sum(h_x, axis=1, keepdims=True)  ## normalise each row (sum over the class axis)

    return a1,a2,y_out

In [None]:
def back_prop(model, x, a1, a2, y_out, y_act, lr=None):
    """Back-propagate the softmax cross-entropy error and take one gradient step.

    Gradients are summed (not averaged) over the rows of the batch, and the
    parameter arrays stored in ``model`` are updated in place.

    Args:
        model: dict holding ``"w1"``, ``"b1"``, ``"w2"``, ``"b2"``, ``"w3"``,
            ``"b3"`` (lower-case keys).
        x: input batch, shape (n_samples, n_features).
        a1: first hidden layer tanh activations for ``x``.
        a2: second hidden layer tanh activations for ``x``.
        y_out: softmax probabilities for ``x``, shape (n_samples, n_classes).
            This array is NOT modified.
        y_act: true targets, either a 1-D array of class indices of length
            n_samples or a 2-D one-hot array with the same shape as ``y_out``.
        lr: step size; when None the module-level ``LR`` is used.

    Returns:
        dict: the same ``model`` object, after the update.
    """
    if lr is None:
        lr = LR

    y_act = np.asarray(y_act)

    # Work on a copy: the caller usually still needs y_out (loss, accuracy).
    delta3 = np.array(y_out, dtype=float)
    if y_act.ndim == 2:
        # One-hot targets: dL/dz3 = softmax - target.
        delta3 -= y_act
    else:
        # Integer labels: subtract 1 at the true-class column of every row.
        rows = np.arange(y_act.shape[0])
        delta3[rows, y_act.astype(np.intp)] -= 1

    # All deltas are computed from the CURRENT weights, before any update.
    dw3 = (a2.T).dot(delta3)
    db3 = np.sum(delta3, axis=0, keepdims=True)
    delta2 = (1 - np.square(a2)) * delta3.dot(model["w3"].T)   # tanh'(z) = 1 - a^2
    dw2 = (a1.T).dot(delta2)
    db2 = np.sum(delta2, axis=0, keepdims=True)
    delta1 = (1 - np.square(a1)) * delta2.dot(model["w2"].T)
    dw1 = (x.T).dot(delta1)
    db1 = np.sum(delta1, axis=0, keepdims=True)

    model["w3"] -= lr * dw3
    model["b3"] -= lr * db3
    model["w2"] -= lr * dw2
    model["b2"] -= lr * db2
    model["w1"] -= lr * dw1
    model["b1"] -= lr * db1

    return model