In [1]:
import struct
import numpy as np
import matplotlib.pyplot as plt
import os
from datetime import datetime


**Define Data loader & Preprocessing Steps**

In [2]:
path = os.path.join(os.path.expanduser('~'), 'Documents', 'OR 610')
def read_idx(filename):
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.fromstring(f.read(), dtype=np.uint8).reshape(shape)
    
def oneHotEncoding(label):
    n = np.max(label)+1
    v = np.eye(n)[label]
    return v.T


def imageProcess(data):
    data = data/255
    data = data.reshape(data.shape[0],data.shape[1]*data.shape[2])
    return data.T

**Define activation functions for forward pass**

In [3]:
def softMax(X):
    e = np.exp(X)
    p = e/np.sum(e, axis=0)
    return p

def ReLU(z):
    return np.maximum(0,z)


def sigmoid(z):
    return 1./(1.+np.exp(-z))


def tanh(z):
    return np.tanh(z)



**Define Activation functions for backward pass i.e. first derivative of the forward pass activation function**

In [4]:
def dReLU(z):
    return (z > 0) * 1

def dSigmoid(z):
    return sigmoid(z) *(1-sigmoid (z))

def dTanh(z):
    return 1/(np.cosh(z)**2)

Multi label cross entropy with L2 regularization

**Model Procedures**

*Forward Pass:*

\\(Z_i = W_i \bullet x^T + b_i \\)

\\(A_i = \sigma(Z_i)\\)

\\(\hat{y} = A_i\\)

where \\(\sigma\\) is a nonlinear transformation

*Loss Function* with regularization

\\[L(y,\hat{y}) = -\frac{1}{m} \Sigma_j \Sigma_i y_i log(\hat{y_i}) + \frac{\lambda}{2*m} * (
\Sigma_w w^2)\\]

*Back propagation: here we use differental equations and use the chain rule first starting with the cost function and work backwards until we get to weights since we want to learn the weights that give a better fit*

\\[\frac{\delta L}{\delta w_i} = \frac{\delta L}{\delta \hat{y}} * \frac{\delta \hat{y}}{\delta z} * \frac{\delta z}{\delta w_i}\\]

*Update weights*

\\[w_i = w_i - \eta * \delta w_i - \frac {(w_i * \lambda * \eta)}{m}\\]

where \\(\eta\\) is the learning rate

In [5]:
def crossEntropyR2(y, y_hat, lamda, params):
    m = y.shape[1]
    cost = -(1/m) * np.sum(y*np.log(y_hat)) + lamda/(2*m) * (np.sum(params['W1']**2) + np.sum(params['W2']**2))
    return cost

def forward(X,params,activation):

    forwardPass = {}
    forwardPass['Z1'] = np.matmul(params['W1'], X) + params['b1']
    forwardPass['A1'] = activation(forwardPass['Z1'])
    forwardPass['Z2'] = np.matmul(params['W2'],forwardPass['A1']) + params['b2']
    forwardPass['A2'] = softMax(forwardPass['Z2'])
    return forwardPass


def back(X, y,forwardPass, params,dActivation):
    m = X.shape[1]
    gradient = {}
    gradient['dZ2'] = forwardPass['A2'] - y
    gradient['dW2'] = (1./m) * np.matmul(gradient['dZ2'], forwardPass['A1'].T)
    gradient['db2'] = (1./m) * np.sum(gradient['dZ2'], axis=1, keepdims=True)
    gradient['dA1'] = np.matmul(params['W2'].T, gradient['dZ2'])
    gradient['dZ1'] = gradient['dA1'] * dActivation(forwardPass['Z1'])
    gradient['dW1'] = (1./m) * np.matmul(gradient['dZ1'], X.T)
    gradient['db1'] = (1./m) * np.sum(gradient['dZ1'])
    return gradient

def updater(params,grad,eta,lamda,m):
    updatedParams = {}
    updatedParams['W2'] = params['W2'] - eta * grad['dW2'] - (params['W2']*lamda*eta)/m
    updatedParams['b2'] = params['b2'] - eta * grad['db2']
    updatedParams['W1'] = params['W1'] - eta * grad['dW1'] - (params['W1']*lamda*eta)/m
    updatedParams['b1'] = params['b1'] - eta * grad['db1']
    return updatedParams

def classifer(X, params,activation):
    Z1 = np.matmul(params['W1'], X) + params['b1']
    A1 = activation(Z1)
    Z2 = np.matmul(params['W2'],A1) + params['b2']
    A2 = softMax(Z2)
    pred = np.argmax(A2, axis=0)
    return pred


Load Data to memory and define hyper params

In [6]:

X_train = imageProcess(read_idx(path+'/train-images.idx3-ubyte'))
y_train = oneHotEncoding(read_idx(path+'/train-labels-idx1-ubyte'))
X_test = imageProcess(read_idx(path+'/t10k-images-idx3-ubyte'))
y_test = read_idx(path+'/t10k-labels-idx1-ubyte')

#### General Hyperparameters
m=10000 #batch size
n_x = X_train.shape[0]
n_h = 100
eta = 1
lamda = 2
np.random.seed(7)
epoch = 1000


  


Sigmoid - Activation function

In [7]:
#m = X_train.shape[1]
#Initializing weightss
sigmoidParams = {'W1': np.random.randn(n_h, n_x)* np.sqrt(1. / n_x),
                 'b1': np.zeros((n_h, 1)),
                 'W2': np.random.randn(10, n_h)* np.sqrt(1. / n_h),
                 'b2': np.zeros((10, 1))
                 }

start = datetime.now()
for i in range(epoch):
    #shuffle batch index
    idx = np.random.permutation(X_train.shape[1])[:m]
    X=X_train[:,idx]
    y=y_train[:,idx]
    #forward pass
    forwardPass = forward(X,sigmoidParams,sigmoid)
    #cost
    cost = crossEntropyR2(y, forwardPass['A2'], lamda, sigmoidParams)
    #back Prop
    gradient = back(X, y, forwardPass, sigmoidParams,dSigmoid)
    #updating weights
    sigmoidParams=updater(sigmoidParams,gradient,eta,lamda,m)
difference = datetime.now() - start
print("Final cost:", cost)
print('time to train:', difference)

y_hat = classifer(X_test, sigmoidParams, sigmoid)


print('Accuracy:',sum(y_hat==y_test)*1/len(y_test))


Final cost: 0.2308652789490603
time to train: 0:00:51.425122
Accuracy: 0.9478


ReLU Activation Function

In [8]:
#######RELU SECTION ############
reluParams = {'W1': np.random.randn(n_h, n_x)* np.sqrt(2. / n_x),
                 'b1': np.zeros((n_h, 1)),
                 'W2': np.random.randn(10, n_h)* np.sqrt(2. / n_h),
                 'b2': np.zeros((10, 1))
                 }

start = datetime.now()
for i in range(epoch):
    #shuffle batch index
    idx = np.random.permutation(X_train.shape[1])[:m]
    X=X_train[:,idx]
    y=y_train[:,idx]
    #forward pass
    forwardPass = forward(X,reluParams,ReLU)
    #cost
    cost = crossEntropyR2(y, forwardPass['A2'], lamda, reluParams)
    #back Prop
    gradient = back(X, y, forwardPass, reluParams,dReLU)
    #updating weights
    reluParams=updater(reluParams,gradient,eta,lamda,m)
difference = datetime.now() - start
print("Final cost:", cost)
print('time to train:', difference)

y_hat = classifer(X_test, reluParams, ReLU)


print('Accuracy:',sum(y_hat==y_test)*1/len(y_test))



Final cost: 0.10742655805231126
time to train: 0:00:40.840828
Accuracy: 0.9721


Tanh Activation Function

In [9]:
#######tanh SECTION ############
tanhParams = {'W1': np.random.randn(n_h, n_x)* np.sqrt(1. / n_x),
                 'b1': np.zeros((n_h, 1)),
                 'W2': np.random.randn(10, n_h)* np.sqrt(1. / n_h),
                 'b2': np.zeros((10, 1))
                 }

start = datetime.now()
for i in range(epoch):
    #shuffle batch index
    idx = np.random.permutation(X_train.shape[1])[:m]
    X=X_train[:,idx]
    y=y_train[:,idx]
    #forward pass
    forwardPass = forward(X,tanhParams,tanh)
    #cost
    cost = crossEntropyR2(y, forwardPass['A2'], lamda, tanhParams)
    #back Prop
    gradient = back(X, y, forwardPass, tanhParams,dTanh)
    #updating weights
    tanhParams=updater(tanhParams,gradient,eta,lamda,m)
difference = datetime.now() - start
print("Final cost:", cost)
print('time to train:', difference)

y_hat = classifer(X_test, tanhParams, tanh)


print('Accuracy:',sum(y_hat==y_test)*1/len(y_test))

Final cost: 0.12754923340771634
time to train: 0:00:39.966550
Accuracy: 0.968
