In [8]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.io import arff

# Load the ARFF file into a DataFrame and make its text columns usable.
df, meta = arff.loadarff('./adult-small.arff')
df = pd.DataFrame(df)
# The object columns hold bytes values here; decode them column by column so
# they compare equal to plain str literals such as '?' or '>50K'.
# (Per-column decoding also works when the frame has no object columns.)
str_df = df.select_dtypes([object]).apply(lambda column: column.str.decode('utf-8'))
for col in str_df:
    df[col] = str_df[col]
# Treat '?' entries as missing. Use np.nan: the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
df = df.replace('?', np.nan)

# Attribute kinds keyed by column name: 0 = numeric, 1 = categorical.
attributeTypeDict = {
    # numeric attributes
    **dict.fromkeys(
        (
            'age',
            'fnlwgt',
            'education-num',
            'capital-gain',
            'capital-loss',
            'hours-per-week',
        ),
        0,
    ),
    # categorical attributes
    **dict.fromkeys(
        (
            'workclass',
            'education',
            'marital-status',
            'occupation',
            'relationship',
            'race',
            'sex',
            'native-country',
        ),
        1,
    ),
}

# Snapshot of the column labels taken right after loading (the 'class' column
# is still present at this point); replaceMissing() iterates over this
# module-level name.
columns = df.columns

Replace Missing Values

In [9]:
def replaceMissing(dataframe, attributeTypes=None):
    """Fill missing values column by column and return the same frame.

    Categorical columns (kind 1) are filled with their most frequent value,
    numeric columns with their mean. The 'class' column is never touched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to fill. Columns are reassigned on this object.
    attributeTypes : dict, optional
        Maps column name -> 0 (numeric) or 1 (categorical). Defaults to the
        module-level ``attributeTypeDict``.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` with missing values replaced.
    """
    if attributeTypes is None:
        attributeTypes = attributeTypeDict

    # Walk the frame's own columns instead of the module-level `columns`
    # snapshot, so frames that lack some of the original columns do not
    # raise KeyError.
    for col in dataframe.columns:
        if col == 'class':
            continue
        kind = attributeTypes.get(col)
        if kind == 1:
            # this is a categorical variable, so replace with the mode
            modes = dataframe[col].mode()
            # mode() is empty when every value is missing; leave such a
            # column as-is instead of failing on modes[0].
            if not modes.empty:
                dataframe[col] = dataframe[col].fillna(modes.iloc[0])
        elif kind == 0 or pd.api.types.is_numeric_dtype(dataframe[col]):
            # this is a numeric variable, so replace with the mean
            dataframe[col] = dataframe[col].fillna(dataframe[col].mean())
        # unlisted, non-numeric columns have no meaningful mean: left untouched
    return dataframe

Z Score Normalization and One Hot Encoding

In [10]:
def zScoreNormalize(dataframe, attributeTypes=None):
    """Z-score normalise every numeric column and return the same frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to normalise. Columns are reassigned on this object.
    attributeTypes : dict, optional
        Maps column name -> 0 (numeric) or 1 (categorical). Defaults to the
        module-level ``attributeTypeDict``. Only kind-0 columns are changed.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` with numeric columns replaced by their z-scores.
    """
    if attributeTypes is None:
        attributeTypes = attributeTypeDict

    for column in dataframe.columns:
        if attributeTypes.get(column) != 0:
            continue
        values = dataframe[column]
        if values.nunique(dropna=False) == 1:
            # A constant column has zero standard deviation, so the z-score
            # would be NaN for every row. Every value equals the mean, so
            # the normalised value is 0.
            dataframe[column] = 0.0
        else:
            dataframe[column] = stats.zscore(values)
    return dataframe

# one hot encoding
def oneHotPreprocess(dataframe, attributeTypes=None):
    """Replace every categorical column with one indicator column per category.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified, a new frame is returned.
    attributeTypes : dict, optional
        Maps column name -> 0 (numeric) or 1 (categorical). Defaults to the
        module-level ``attributeTypeDict``. Only kind-1 columns are encoded.

    Returns
    -------
    pandas.DataFrame
        Frame in which each categorical column is dropped and its indicator
        columns (named after the category values) are appended.
    """
    if attributeTypes is None:
        attributeTypes = attributeTypeDict

    for column in dataframe.columns:
        if attributeTypes.get(column) != 1:
            continue
        # Ask for float indicators explicitly: with the default dtype (bool on
        # recent pandas) a frame mixing bool and float columns converts to an
        # object-dtype array in to_numpy().
        one_hot = pd.get_dummies(dataframe[column], dtype=float)
        dataframe = dataframe.drop(column, axis=1)
        # join() raises on overlapping column labels, which happens when two
        # attributes share a category value. Prefix only the clashing labels
        # so non-clashing names stay exactly as before.
        taken = set(dataframe.columns)
        clashes = {
            label: '{}_{}'.format(column, label)
            for label in one_hot.columns
            if label in taken
        }
        if clashes:
            one_hot = one_hot.rename(columns=clashes)
        dataframe = dataframe.join(one_hot)
    return dataframe

Activation Functions

In [11]:
def ReLU(data):
    """Rectified linear unit: element-wise maximum of 0 and ``data``."""
    floor = 0
    rectified = np.maximum(floor, data)
    return rectified

def sigmoid(data):
    """Logistic sigmoid 1 / (1 + exp(-x)), evaluated without overflow.

    The direct form exp(x) / (exp(x) + 1) overflows to inf / inf = NaN once x
    exceeds roughly 709. Here only exp(-|x|) is evaluated, which always lies
    in (0, 1]:

        x >= 0:  1 / (1 + exp(-x))
        x <  0:  exp(x) / (1 + exp(x))

    Parameters
    ----------
    data : scalar or array-like of numbers

    Returns
    -------
    Element-wise sigmoid of ``data``.
    """
    decay = np.exp(-np.abs(data))
    numerator = np.where(np.greater_equal(data, 0), 1.0, decay)
    return numerator / (1.0 + decay)

Feed Forward and Backprop

In [12]:
def init_params(nodeCounts, seed=None):
    """Create random weight and bias matrices for a fully connected network.

    Parameters
    ----------
    nodeCounts : sequence of int
        Number of nodes in each layer, input layer first.
    seed : int, optional
        When given, values are drawn from a private
        ``np.random.RandomState(seed)`` so the initialisation is reproducible.
        When omitted, NumPy's global random state is used (and advanced).

    Returns
    -------
    (weights, biases) : tuple of lists
        ``weights[i]`` has shape (nodeCounts[i+1], nodeCounts[i]) and
        ``biases[i]`` has shape (nodeCounts[i+1], 1). Both lists are empty
        when fewer than two layers are given.
    """
    draw = np.random.randn if seed is None else np.random.RandomState(seed).randn

    weights = []
    biases = []
    # Pair each layer with its successor. Weights and biases are drawn
    # alternately, layer by layer, so the sequence of random draws is stable.
    for currentLayerNodeCount, nextLayerNodeCount in zip(nodeCounts[:-1], nodeCounts[1:]):
        weights.append(draw(nextLayerNodeCount, currentLayerNodeCount))
        biases.append(draw(nextLayerNodeCount, 1))
    return weights, biases

def feed_forward(weightArray, biasArray, data):
    """Propagate ``data`` through every layer using the sigmoid activation.

    Returns ``(activatedData, unactivatedData)``: two lists with one entry per
    layer, holding the layer's sigmoid output and its raw weighted sum
    (W.dot(input) + b) respectively.
    """
    activatedData, unactivatedData = [], []
    layerInput = data
    for layerIndex, weightMatrix in enumerate(weightArray):
        biasMatrix = biasArray[layerIndex]
        weightedSum = weightMatrix.dot(layerInput) + biasMatrix
        # the raw weighted sum is recorded before the float cast
        unactivatedData.append(weightedSum)
        layerOutput = sigmoid(weightedSum.astype(float))
        activatedData.append(layerOutput)
        # this layer's output feeds the next layer
        layerInput = layerOutput
    return activatedData, unactivatedData

# takes the class label column
def oneHotEncode(testData):
    """Convert class labels to a float array of 0.0 / 1.0.

    '>50K' becomes 1.0 and '<=50K' becomes 0.0. Values that are already
    numeric (including NaN) pass through unchanged. The input is not modified.

    Returning a float array (instead of an object array holding Python ints)
    keeps the arithmetic done on the labels in float64.

    Parameters
    ----------
    testData : array-like
        Class labels, e.g. the 'class' column of the data frame.

    Returns
    -------
    numpy.ndarray
        float64 array of the encoded labels, transposed.

    Raises
    ------
    ValueError
        If a label is neither one of the two known strings nor convertible to
        a number.
    """
    # np.array copies by default, so the caller's data is left untouched
    encoded = np.array(testData, dtype=object)
    # compute both masks before writing so the writes cannot affect them
    isHigh = encoded == '>50K'
    isLow = encoded == '<=50K'
    encoded[isHigh] = 1.0
    encoded[isLow] = 0.0
    try:
        return encoded.astype(float).T
    except (TypeError, ValueError) as exc:
        raise ValueError(
            "class labels must be '>50K', '<=50K' or numeric"
        ) from exc

def sigmoid_derivative(data):
    """Derivative of the sigmoid at ``data``: s * (1 - s) with s = sigmoid(data)."""
    activation = sigmoid(data)
    return activation * (1 - activation)

def backpropagation(inputData, activatedData, unactivatedData, weights, classLabels):
    """Compute weight and bias gradients for every layer.

    Parameters
    ----------
    inputData : numpy.ndarray
        Network input, as passed to feed_forward.
    activatedData, unactivatedData : list of numpy.ndarray
        Per-layer outputs of feed_forward (sigmoid outputs and raw weighted
        sums).
    weights : list of numpy.ndarray
        Weight matrix of each layer.
    classLabels : array-like
        Raw class labels; they are encoded with oneHotEncode.

    Returns
    -------
    (weightDeltas, biasDeltas) : tuple of lists
        One gradient per layer, in the same order as ``weights``. Each bias
        gradient is reshaped to a column vector.
    """
    layerCount = len(weights)
    entryCount = classLabels.size
    # Force float so the error terms and gradients are float64 arrays rather
    # than object arrays.
    expected = np.asarray(oneHotEncode(classLabels), dtype=float)

    weightDeltas = [None] * layerCount
    biasDeltas = [None] * layerCount
    error = None
    for i in reversed(range(layerCount)):
        if i == layerCount - 1:
            # output layer: the error is the difference between actual and
            # expected outcomes
            error = activatedData[i] - expected
        else:
            # earlier layer: push the following layer's error back through its
            # weights and scale by the sigmoid derivative of this layer's
            # weighted sum
            preActivation = np.asarray(unactivatedData[i], dtype=float)
            error = weights[i + 1].T.dot(error) * sigmoid_derivative(preActivation)

        # The first layer is fed by the network input, every other layer by
        # the previous layer's activation. This must also hold when the first
        # layer IS the output layer (single weight matrix); indexing
        # activatedData[i - 1] there would wrap around to the output itself.
        layerInput = inputData if i == 0 else activatedData[i - 1]

        weightDeltas[i] = 1 / entryCount * error.dot(layerInput.T)
        biasDeltas[i] = (1 / entryCount * np.sum(error, 1)).reshape(-1, 1)
    return weightDeltas, biasDeltas

def updateParams(weights, biases, dW, dB, learningRate):
    """Apply one gradient-descent step to every layer.

    Each entry of ``weights`` / ``biases`` is replaced (the lists themselves
    are updated in place) by the old value minus the matching gradient scaled
    by ``learningRate``. The same two lists are returned.
    """
    for layer in range(len(weights)):
        weightStep = dW[layer] * learningRate
        biasStep = dB[layer] * learningRate
        weights[layer] = weights[layer] - weightStep
        biases[layer] = biases[layer] - biasStep
    return weights, biases

Run Network

In [13]:
# randomize the row order (no seed is set, so the order differs between runs)
df = df.sample(frac=1)

# separate the data and class labels
classLabel = df['class']
df = df.drop('class', axis=1)

# get the true outputs to check against
npClassLabels = oneHotEncode(classLabel)

# replace missing values
df = replaceMissing(df)

# z score normalize
df = zScoreNormalize(df)

# one hot encode
df = oneHotPreprocess(df)

# get the input data as a (features x samples) matrix.
# Request float explicitly: a frame that mixes float columns with bool
# indicator columns would otherwise convert to an object-dtype array.
numpyData = df.to_numpy(dtype=float).T
# NOTE(review): testingData is a copy of the full input, so the accuracy
# computed from it further down is measured on the training data itself.
testingData = numpyData.copy()

# create the topology of the network:
# one input node per feature row, 100 hidden nodes, 1 output node
rowCount, columnCount = numpyData.shape
nodeCountArr = [rowCount, 100, 1]

def getAccuracy(outputs, classLabels):
    """Fraction of entries whose rounded output matches the expected label.

    Only the first row of ``outputs`` is compared against ``classLabels``; the
    mismatch count is divided by ``outputs.size``.
    """
    predictions = np.rint(outputs)[0]
    expected = np.array(classLabels)
    mismatchCount = np.count_nonzero(predictions - expected)
    return 1 - mismatchCount / outputs.size

def run_network(iterations, learningRate, nodeCountArr, inputData, classLabels):
    """Train the network with full-batch gradient descent.

    Parameters
    ----------
    iterations : int
        Number of feed-forward / backpropagation / update rounds.
    learningRate : number
        Step size handed to updateParams.
    nodeCountArr : sequence of int
        Node count of each layer, handed to init_params.
    inputData : numpy.ndarray
        Network input, handed to feed_forward.
    classLabels : array-like
        Raw class labels for ``inputData``.

    Returns
    -------
    (weights, biases) : the trained parameters.

    Progress (iteration number and accuracy) is printed every 20 iterations.
    """
    # get initial weights and biases
    weights, biases = init_params(nodeCountArr)
    # Score progress against the labels that were passed in, not against the
    # module-level npClassLabels, so the printed accuracy always matches the
    # data this call is training on.
    expectedLabels = oneHotEncode(classLabels)
    for i in range(iterations):
        activatedData, unactivatedData = feed_forward(weights, biases, inputData)
        dW, dB = backpropagation(inputData, activatedData, unactivatedData, weights, classLabels)
        weights, biases = updateParams(weights, biases, dW, dB, learningRate)
        if i % 20 == 0:
            # the accuracy shown is that of the outputs computed before this
            # iteration's parameter update
            print('iteration number', i)
            print('accuracy is', getAccuracy(activatedData[-1], expectedLabels), '\n')
    return weights, biases

# train for 100 iterations with a learning rate of 2
w, b = run_network(
    iterations=100,
    learningRate=2,
    nodeCountArr=nodeCountArr,
    inputData=testingData,
    classLabels=classLabel,
)

# run the trained network once more and report the accuracy of its output layer
activatedData, unactivatedData = feed_forward(w, b, testingData)
print(getAccuracy(activatedData[-1], npClassLabels))

iteration number 0
accuracy is 0.7623762376237624 

iteration number 20
accuracy is 0.9108910891089109 

iteration number 40
accuracy is 1.0 

iteration number 60
accuracy is 1.0 

iteration number 80
accuracy is 1.0 

1.0
