In [57]:
import os
import math
import pandas as pd
import numpy as np

### Read in Data

In [58]:
# Locate the data directory: it is the "data" folder that sits next to the
# directory the notebook is running from. Each intermediate location is
# printed so the resolution can be checked by eye.
code_dir = os.getcwd()
print(code_dir)
project_dir = os.path.normpath(os.path.join(code_dir, os.pardir))
print(project_dir)
# os.path.join picks the platform's separator instead of a hard-coded '/'.
path = os.path.join(project_dir, 'data')
print(path)

/Users/arminbazarjani/Desktop/CurrentClasses/csci561/homework/hw3/code
/Users/arminbazarjani/Desktop/CurrentClasses/csci561/homework/hw3
/Users/arminbazarjani/Desktop/CurrentClasses/csci561/homework/hw3/data


In [59]:
# Load the four header-less CSV files (train/test images and labels) from the
# data directory, in the same order as the names on the left-hand side.
trainX_in, trainY_in, testX_in, testY_in = (
    pd.read_csv(path + csv_name, header=None)
    for csv_name in (
        '/train_image.csv',
        '/train_label.csv',
        '/test_image.csv',
        '/test_label.csv',
    )
)

In [60]:
# Report the (rows, columns) of every raw frame, one per line.
print(
    f'trainX shape: {trainX_in.shape} ',
    f'trainY shape: {trainY_in.shape} ',
    f'testX shape: {testX_in.shape} ',
    f'testY shape: {testY_in.shape}',
    sep='\n',
)


trainX shape: (60000, 784) 
trainY shape: (60000, 1) 
testX shape: (10000, 784) 
testY shape: (10000, 1)


In [61]:
# Flip every frame so that each example occupies one column instead of one row;
# the network code multiplies weight matrices from the left and needs this layout.
trainX, trainY = trainX_in.transpose(), trainY_in.transpose()
testX, testY = testX_in.transpose(), testY_in.transpose()

In [62]:
# Convert the (transposed) training frames to NumPy arrays.
# The arrays are derived from the raw *_in frames rather than from
# trainX/trainY themselves, so re-running this cell never operates on an
# already-converted array. to_numpy() matches the conversion used for the
# test data.
trainX = trainX_in.T.to_numpy()
trainY = trainY_in.T.to_numpy()

In [63]:
# Convert the (transposed) test frames to NumPy arrays.
# Derived from the raw *_in frames so the cell can be re-run safely; calling
# .to_numpy() on an already-converted ndarray would raise AttributeError.
testX = testX_in.T.to_numpy()
testY = testY_in.T.to_numpy()

In [64]:
# One-hot encode the training labels: pick, for every label, the matching row
# of an identity matrix, then transpose so each column is one example.
# The number of classes is taken to be the largest label + 1.
onehotY = np.eye(trainY.max() + 1)[trainY.ravel()].T

In [65]:
# Check the layout after the transpose: rows are classes, columns are examples.
print(onehotY.shape)

(10, 60000)


In [66]:
# One-hot encode the test labels the same way as the training labels: look up
# identity-matrix rows by label, then transpose to one example per column.
# The number of classes is taken to be the largest test label + 1.
onehotY_test = np.eye(testY.max() + 1)[testY.ravel()].T

In [67]:
# Check the layout after the transpose: rows are classes, columns are examples.
print(onehotY_test.shape)

(10, 10000)


In [68]:
# Keep only the first NUM_TRAIN_SAMPLES columns (examples) of the inputs and
# of the one-hot targets. One named constant guarantees both arrays are cut
# at the same position.
NUM_TRAIN_SAMPLES = 10000
trainX = trainX[:, :NUM_TRAIN_SAMPLES]
onehotY = onehotY[:, :NUM_TRAIN_SAMPLES]

In [69]:
# Both arrays should now have the same number of columns.
print(trainX.shape)
print(onehotY.shape)

(784, 10000)
(10, 10000)


In [70]:
# trainX[1] is a single row, so its length is the number of columns in trainX.
print(len(trainX[1]))

10000


In [71]:
# Draw one random ordering of the column indices and apply it to both arrays,
# so every example stays paired with its one-hot label.
random = np.random.permutation(len(trainX[1]))
trainX_shuffle, onehotY_shuffle = trainX[:, random], onehotY[:, random]

In [72]:
# Reordering columns must leave the shapes unchanged.
print(trainX_shuffle.shape)
print(onehotY_shuffle.shape)

(784, 10000)
(10, 10000)


### Sanity Check using Matplotlib

The shapes of the datasets check out, so the next step is to define the neural network and initialize its weights.

### Make The Neural Network and initialize the weights

In [85]:
class MLP:
    """Fully connected classifier with layer sizes 784 -> 512 -> 256 -> 10.

    The two hidden layers use a sigmoid activation and the output layer uses
    softmax. Every data matrix stores one example per column: inputs are
    (784, m) and one-hot targets are (10, m). Training is plain mini-batch
    gradient descent with a fixed learning rate.
    """

    def __init__(self, batch_size, learning_rate, num_epochs, seed=None):
        """Store the hyper-parameters and draw the initial weights.

        Args:
            batch_size: number of examples (columns) per mini-batch.
            learning_rate: step size applied to every gradient update.
            num_epochs: number of full passes over the training data.
            seed: optional integer seed. When given, weight initialisation
                and per-epoch shuffling use a private generator, so runs are
                repeatable. When omitted, NumPy's global generator is used.
        """
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

        # Without a seed, keep drawing from the global generator so callers
        # that rely on np.random.seed(...) are unaffected.
        self._rng = np.random if seed is None else np.random.RandomState(seed)
        randn = self._rng.randn

        # Weights and biases of each layer are standard-normal draws scaled by
        # sqrt(2 / fan_in), where fan_in is the layer's input width.
        self.params = {
            'W1': randn(512, 784) / np.sqrt(784 / 2.0),
            'b1': randn(512, 1) / np.sqrt(784 / 2.0),
            'W2': randn(256, 512) / np.sqrt(512 / 2.0),
            'b2': randn(256, 1) / np.sqrt(512 / 2.0),
            'W3': randn(10, 256) / np.sqrt(256 / 2.0),
            'b3': randn(10, 1) / np.sqrt(256 / 2.0),
        }

    def sigmoid(self, x):
        """Element-wise logistic function 1 / (1 + exp(-x)).

        Evaluated as exp(-log(1 + exp(-x))) through np.logaddexp, which is
        the same function but does not overflow for large negative inputs.
        """
        return np.exp(-np.logaddexp(0.0, -x))

    def sigmoid_backward(self, dA, Z):
        """Gradient w.r.t. the pre-activation Z, given the gradient dA w.r.t.
        the sigmoid output: dA * sigmoid(Z) * (1 - sigmoid(Z))."""
        sig = self.sigmoid(Z)
        return dA * sig * (1 - sig)

    def softmax(self, x):
        """Column-wise softmax: every column of the result sums to 1.

        Each column is shifted by its own maximum before exponentiation. The
        shift does not change the result mathematically, but it keeps at
        least one term equal to exp(0) = 1 in every column, so no column can
        underflow to an all-zero denominator.
        """
        shifted = x - np.max(x, axis=0, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def cross_entropy_loss(self, Y, out):
        """Average cost over the m columns of Y.

        The cost is the element-wise binary cross-entropy
        -(Y*log(out) + (1-Y)*log(1-out)) summed over all entries and divided
        by m = Y.shape[1].

        NOTE(review): backward_pass uses the gradient A3 - Y, which is the
        gradient of the categorical cross-entropy -sum(Y*log(out)); this
        function is a monitoring metric and is not differentiated anywhere
        in this class.

        Args:
            Y: one-hot targets, one example per column.
            out: predicted probabilities with the same shape as Y.
        """
        m = Y.shape[1]

        # Keep probabilities strictly inside (0, 1): log(0) is -inf and
        # 0 * -inf is NaN, which would poison the whole sum.
        eps = np.finfo(float).eps
        out = np.clip(out, eps, 1.0 - eps)

        cost = (-1 / m) * np.sum(np.multiply(Y, np.log(out)) + np.multiply(1 - Y, np.log(1 - out)))
        cost = np.squeeze(cost)

        return cost

    def get_accuracy(self, X, y):
        """Fraction of columns of X whose most probable class equals the
        arg-max of the corresponding column of the one-hot matrix y."""
        probabilities = self.forward_pass(X)['A3']
        predicted = np.argmax(probabilities, axis=0)
        expected = np.argmax(y, axis=0)
        return np.mean(predicted == expected)

    def get_mini_batches(self, X, y, batch_size):
        """Split X and y column-wise into consecutive mini-batches.

        Args:
            X: inputs, one example per column.
            y: targets with the same number of columns as X.
            batch_size: positive number of columns per batch.

        Returns:
            A list of (X_batch, y_batch) tuples in column order. The last
            batch is shorter when the column count is not a multiple of
            batch_size.

        Raises:
            ValueError: if batch_size is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

        m = X.shape[1]
        # Slices that run past the last column are clipped by NumPy, which
        # yields the shorter final batch without a special case.
        return [
            (X[:, start:start + batch_size], y[:, start:start + batch_size])
            for start in range(0, m, batch_size)
        ]

    def forward_pass(self, X):
        """Run X through the network.

        Returns:
            dict with the pre-activations 'Z1'..'Z3' and activations
            'A1'..'A3'; 'A3' holds the softmax class probabilities.
        """
        cache = dict()

        cache['Z1'] = np.dot(self.params['W1'], X) + self.params['b1']
        cache['A1'] = self.sigmoid(cache['Z1'])
        cache['Z2'] = np.dot(self.params['W2'], cache['A1']) + self.params['b2']
        cache['A2'] = self.sigmoid(cache['Z2'])
        cache['Z3'] = np.dot(self.params['W3'], cache['A2']) + self.params['b3']
        cache['A3'] = self.softmax(cache['Z3'])

        return cache

    def backward_pass(self, X, Y, cache):
        """Back-propagate through the three layers.

        Args:
            X: the inputs that produced `cache`, one example per column.
            Y: one-hot targets for those inputs.
            cache: the dict returned by forward_pass(X).

        Returns:
            dict with keys 'dW1', 'db1', 'dW2', 'db2', 'dW3', 'db3'; every
            gradient is averaged over the m = X.shape[1] examples.
        """
        m = X.shape[1]

        # error at the output layer
        dZ3 = cache['A3'] - Y

        # gradients of the output layer
        dW3 = np.dot(dZ3, cache['A2'].T) / m
        db3 = np.sum(dZ3, axis=1, keepdims=True) / m

        # back propagate through the output layer into the second hidden layer
        dA2 = np.dot(self.params['W3'].T, dZ3)
        dZ2 = self.sigmoid_backward(dA2, cache['Z2'])

        # gradients of the second hidden layer
        dW2 = np.dot(dZ2, cache['A1'].T) / m
        db2 = np.sum(dZ2, axis=1, keepdims=True) / m

        # back propagate through the second hidden layer into the first
        dA1 = np.dot(self.params['W2'].T, dZ2)
        dZ1 = self.sigmoid_backward(dA1, cache['Z1'])

        # gradients of the first hidden layer
        dW1 = np.dot(dZ1, X.T) / m
        db1 = np.sum(dZ1, axis=1, keepdims=True) / m

        grads = {'dW3': dW3, 'db3': db3, 'dW2': dW2, 'db2': db2, 'dW1': dW1, 'db1': db1}

        return grads

    def train(self, X, y):
        """Train with mini-batch gradient descent for self.num_epochs epochs.

        The columns of X and y are reshuffled together at the start of every
        epoch. After each epoch the accuracy on the full (X, y) is printed.

        Args:
            X: training inputs, one example per column.
            y: one-hot targets with the same number of columns as X.
        """
        num_samples = X.shape[1]

        for epoch in range(self.num_epochs):
            # one shared permutation keeps every example paired with its label
            order = self._rng.permutation(num_samples)
            X_shuffle = X[:, order]
            y_shuffle = y[:, order]

            mini_batches = self.get_mini_batches(X_shuffle, y_shuffle, self.batch_size)

            for mb_x, mb_y in mini_batches:
                cache = self.forward_pass(mb_x)
                grads = self.backward_pass(mb_x, mb_y, cache)

                # gradient-descent step on every parameter
                for name in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
                    self.params[name] = self.params[name] - (self.learning_rate * grads['d' + name])

            # calculate train accuracy after each epoch
            train_accuracy = self.get_accuracy(X, y)
            print(f'Training accuracy for epoch {epoch + 1}: {train_accuracy}')

In [86]:
# Build the network: mini-batches of 64 examples, learning rate 0.01, 80 epochs.
NeuralNetwork = MLP(batch_size=64, learning_rate=0.01, num_epochs=80)

In [87]:
# Fit on the training subset; train() prints the training accuracy after every epoch.
NeuralNetwork.train(trainX, onehotY)



Training accuracy for epoch 1: 0.6647
Training accuracy for epoch 2: 0.7583
Training accuracy for epoch 3: 0.8066
Training accuracy for epoch 4: 0.8436
Training accuracy for epoch 5: 0.8553
Training accuracy for epoch 6: 0.875
Training accuracy for epoch 7: 0.8895
Training accuracy for epoch 8: 0.8974
Training accuracy for epoch 9: 0.9042
Training accuracy for epoch 10: 0.9124
Training accuracy for epoch 11: 0.9148
Training accuracy for epoch 12: 0.9208
Training accuracy for epoch 13: 0.924
Training accuracy for epoch 14: 0.9307
Training accuracy for epoch 15: 0.9322
Training accuracy for epoch 16: 0.9348
Training accuracy for epoch 17: 0.9374
Training accuracy for epoch 18: 0.9394
Training accuracy for epoch 19: 0.942
Training accuracy for epoch 20: 0.9426
Training accuracy for epoch 21: 0.9462
Training accuracy for epoch 22: 0.9478
Training accuracy for epoch 23: 0.9494
Training accuracy for epoch 24: 0.9516
Training accuracy for epoch 25: 0.9527
Training accuracy for epoch 26: 0.954

In [None]:
# Evaluate on the test set: fraction of test columns whose predicted class
# matches the one-hot label.
test_accuracy = NeuralNetwork.get_accuracy(testX, onehotY_test)
print(test_accuracy)