# Read Data

In [1]:
import pandas as pd
import numpy as np
import csv
from timeit import default_timer as timer
from numba import jit, njit
from numba.typed import List


df = pd.read_csv('bank_data_train.csv', skiprows=1)
trainArray = df.to_numpy()

file = open('bank_data_train.csv')
reader = csv.reader(file);
file.close()

df = pd.read_csv('bank_data_test.csv', skiprows=1)
finalTestingData = df.to_numpy()


In [2]:
def change_str_to_index(dataset, columnIndex):
    unique = list()
    for row in dataset:
        s = str(row[columnIndex])
        if (s == 'nan'):
            continue
        s = s.lower()
        if (s not in unique):
            unique.append(s)
        row[columnIndex] = unique.index(s)

def change_columns_to_int(dataset, columns):
    for column in columns:
        change_str_to_index(dataset, column)


# Naiv Classifier

In [3]:
#separate data by class
#this function assumes that the last column of the table is the class
from math import sqrt
from math import exp
from math import pi
from numba import jit, njit

#separate the data into classes according to the last value in each row
def separate_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated


#count non nan elements
@njit
def len_nan(numbers):
    return (np.count_nonzero(~np.isnan(numbers)))


#claculate the mean of a list of numbers ignoring nan elements
@njit
def mean_nan(numbers, size):
    return np.nansum(numbers)/ size


#claculate the standard deviation of a list of numbers ignoring nan elements
@njit
def stdev_nan(numbers, mean, size):
    variance = np.nansum(np.array([(x - mean)**2 for x in numbers], dtype=np.float64)) / size
    return sqrt(variance)

#get column of 2d array
@jit
def get_column(matrix, i):
    return [row[i] for row in matrix]


# Calculate the Gaussian probability distribution function for x
@njit
def normal(mean, stdev, val):
    exponent = exp(-((val-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent


# summarise the data for each metric in the dataset
# claculate the mean, dev and count for each column
def summarize_dataset(dataset, n):
    i = 0
    sizes = np.zeros(n, dtype = np.float64)
    means = np.zeros(n, dtype = np.float64)
    devs = np.zeros(n, dtype = np.float64)
    for column in zip(*dataset):
        arr = np.array(column, dtype=np.float64)
        size = len_nan(arr)
        mean = mean_nan(arr, size)
        dev = stdev_nan(arr, mean, size)
        sizes[i] = size
        means[i] = mean
        devs[i] = dev
        i += 1
    return (sizes, means, devs)

#substitute each class with the summarized values
def summarize_each_class(separated):
    summaries = dict()
    n = len(separated[0][0])
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows, n)
    return summaries


In [20]:
#P(class=0|X1,X2) = P(X1|class=0) * P(X2|class=0) * P(class=0)
# Calculate the probabilities of row belonging to a class
@njit
def calculate_class_probability(row, sizes, means, devs):
    proba = 1.0
    for i in range(1, len(row), 1):
        if (devs[i] == 0):
            continue
        tmp = normal(means[i], devs[i], row[i])
        if (np.isnan(tmp)):
            tmp = 1.0
        proba *= float(tmp)
    return proba

#Calculate the porbabilities of a row belonging to each class
#then return the class with the bigge
def estimate_class(baseline, row):
    arow = np.array(row, dtype=np.float64)
    p1 = calculate_class_probability(arow, baseline[0][0], baseline[0][1], baseline[0][2])
    p2 = calculate_class_probability(arow, baseline[1][0], baseline[1][1], baseline[1][2])
    total_rows = baseline[0,0,0] + baseline[0,1,0]
    p1 *= (baseline[0,0,0] / total_rows)
    p2 *= (baseline[0,1,0] / total_rows)
    if (p1 >= p2):
        return 0
    else:
        return 1
    
def calculate_accuracy(baseline, rows):
    n = len(rows)
    v = 0
    for row in rows:
        c = estimate_class(baseline, row)
        if (c == row[-1]):
            v += 1
    return (float(v/n))

# Tie All Together
## Prep the Data

In [5]:
#prep data
change_columns_to_int(trainArray, [13, 19, 24, 25, 27, 28, 30, 36, 39, 42, 53, 66, 88])

In [6]:
#split data to classes
classes = separate_by_class(trainArray)

## Split the Data into 80-20 from each class
use the 20% as a test for calculating accuracy 

In [12]:
#split int 80-20 for training-testing
import random
testSet = []

for index in classes:
    size = int(len(classes[index]) * 0.2)
    print(len(classes[index]), size)
    random.shuffle(classes[index])
    testSet.extend(classes[index][:size])
    classes[index] = classes[index][size:]
    print(len(classes[index]), size)
random.shuffle(testSet)
print(len(testSet))

133639 26727
106912 26727
11848 2369
9479 2369
29096


## Calculate the BaseLine classifier

In [13]:
start = timer()
baseline = summarize_each_class(classes)
print("time:", timer()-start)  

time: 7.125244199996814


## Test the accuracy of the classifier

In [21]:
start = timer()
acc = calculate_accuracy(baseline, testSet)
print("time: ", timer()-start)
print("accuracy: ", acc)

2


TypeError: len() of unsized object

In [17]:
print(len(testSet))

29096


In [18]:
print(testSet[0])

[442778 0 0.0 0.0 nan 0.0 0.0 0.0 nan nan 0.0 0.0 nan nan 0.0 0.0 0.0 nan
 0.0 nan 0.0 0 18789.707666666694 nan nan nan 0.0 6691 nan 0.0 nan nan nan
 0.0 0.0 0.0 nan 0.0 1 nan 0 nan nan 0.0 0.666666666666667
 0.9882966183942192 0.0 384 nan 0 nan 0.0 0.0 nan nan nan nan 0 nan nan
 nan nan 0.120003719057329 0.0 nan nan nan 0.0 0.579194218615039 nan
 0.0002730060175267 0.0 nan 0.0 1.0 nan 0.0 1.0 0.0 nan 45.0 nan 45.0 nan
 0.0 nan 0.0 0.986207705776588 6 nan nan 6.02301181053365 0.0 0.0 nan 0.0
 nan 0.9882966183942192 nan 0.666666666666667 nan 0.0 1.0 0.0 nan
 0.0138050043140638 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0]
