# Read Data

In [1]:
import pandas as pd
import numpy as np
import csv
from timeit import default_timer as timer 


df = pd.read_csv('bank_data_train.csv', skiprows=1)
trainArray = df.to_numpy()


file = open('bank_data_train.csv')
reader = csv.reader(file);
file.close()

df = pd.read_csv('bank_data_test.csv', skiprows=1)
finalTestingData = df.to_numpy()


In [2]:
def change_str_to_index(dataset, columnIndex):
    unique = list()
    for row in dataset:
        s = str(row[columnIndex])
        if (s == 'nan'):
            continue
        s = s.lower()
        if (s not in unique):
            unique.append(s)
        row[columnIndex] = unique.index(s)

def change_columns_to_int(dataset, columns):
    for column in columns:
        change_str_to_index(dataset, column)


# Naiv Classifier

In [9]:
#separate data by class
#this function assumes that the last column of the table is the class
from math import sqrt
from math import exp
from math import pi

def separate_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated


#count non nan elements
def len_nan(numbers):
    return (np.count_nonzero(~np.isnan(numbers)))


#claculate the mean of a list of numbers ignoring nan elements
def mean_nan(numbers, size):
    return np.nansum(numbers)/ size


#claculate the standard deviation of a list of numbers ignoring nan elements
def stdev_nan(numbers, mean, size):
    variance = np.nansum(np.array([(x - mean)**2 for x in numbers], dtype=np.float64)) / size
    return sqrt(variance)


#claculate the size of each column ignoring nan values
def get_header_size(classes):
    sizes = [0] * len(classes[0][0])
    for blob in classes:
        for i in range(len(classes[0][0])):
            sizes[i] += len_nan(get_column(classes[blob], i))
    return sizes

# summarise the data for each metric in the dataset
# claculate the mean, dev and count for each column
def summarize_dataset(dataset):
    out = []
    for column in zip(*dataset):
        arr = np.array(column, dtype=np.float64)
        size = len_nan(arr)
        mean = mean_nan(arr, size)
        dev = stdev_nan(arr, mean, size)
        out.append([mean, dev, size])
    return out

#split the dataset in classes based on the last column
#then substetute each class with the summarized values
def summarize_by_class(separated):
	summaries = dict()
	for class_value, rows in separated.items():
		summaries[class_value] = summarize_dataset(rows)
	return summaries


In [4]:
#P(class=0|X1,X2) = P(X1|class=0) * P(X2|class=0) * P(class=0)
# Calculate the probabilities of predicting each class for a given row

from math import sqrt
from math import pi
from math import exp
import scipy.stats

def calculate_class_probabilities(summaries, row):
    probabilities = dict()
    total_rows = sum([summaries[label][0][2] for label in summaries])
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(1,len(class_summaries), 1):
            mean, stdev, count = class_summaries[i]
            if (np.isnan(row[i])):
                proba = 1
            else:
                proba = scipy.stats.norm(mean, stdev).pdf(row[i])
            if (np.isnan(proba)):
                proba = 1
            #print(i, '=> ', proba)
            probabilities[class_value] *= proba
    return probabilities

def predict_naive_class(summaries, row):
    res = calculate_class_probabilities(summaries, row)
    if (res[0] >= res[1]):
        return (0)
    else:
        return (1)

# Calculate accuracy percentage
def calculate_accuracy(testSet, classifier):
    correct = 0
    length = len(testSet) / 10
    i = 0
    for row in testSet:
        if (predict_naive_class(classifier, row) == row[-1]):
            correct += 1
        i += 1
        if (i >= length):
            print('=', flush=True, end="")
            i = 0
    return correct / float(len(testSet)) * 100.0

# Tie All Together

In [5]:
#prep data
change_columns_to_int(trainArray, [13, 19, 24, 25, 27, 28, 30, 36, 39, 42, 53, 66, 88])

In [6]:
#split data to classes
classes = separate_by_class(trainArray)

In [7]:
#split int 80-20 for training-testing
import random
testSet = []

for index in classes:
    size = int(len(classes[index]) * 0.2)
    print(len(classes[index]), size)
    random.shuffle(classes[index])
    testSet.extend(classes[index][:size])
    classes[index] = classes[index][size:]
    print(len(classes[index]), size)
random.shuffle(testSet)

326264 65252
261012 65252
28925 5785
23140 5785


In [10]:
start = timer()
baseline = summarize_by_class(classes)
print("time:", timer()-start)  

time: 25.82422489998862


In [11]:
from timeit import default_timer as timer 

start = timer()
acc = calculate_accuracy(testSet[:100], baseline)
print("without GPU:", timer()-start)
print("accuracy = ", acc)

==

  x = np.asarray((x - loc)/scale, dtype=dtyp)
  x = np.asarray((x - loc)/scale, dtype=dtyp)


accuracy =  80.0
