In [8]:
# Abbreviations:
# DS: data set
# df: data frame
import csv, random, math
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing

# Map string attributes to integer values
def map_stringAttributes(df, attr_hash, class_column):
    df_new = df.copy()
    classes = df_new[class_column].unique()
    for i in range (len(df.columns)):
        attr_name = df.columns[i]                      # attribute name
        value = df.iloc[0][attr_name]                  # first value in column
        if (isinstance(value, str) == True):           # check if value is a string
            attr_values = df_new[attr_name].unique()   # unique values for attribute
            # map integers for each unique attribute value
            map_to_int = {name: n for n, name in enumerate(attr_values)}
            # replace data in dataset with its respective int
            df_new[attr_name] = df_new[attr_name].replace(map_to_int)
            # store list of attribute values and their int mapping to attribute name
            attr_hash[attr_name] = list(enumerate(attr_values))
    return (df_new, attr_hash, classes)

# Split dataset into training and testing sets
def splitDataset(df, splitRatio):
    trainingSize = int(len(df.index) * splitRatio)
    trainingSet = []
    testingSet = list(df.index)  # shallow copy of dataframe
    while len(trainingSet) < trainingSize:
        # get a random row from testingSet
        index = random.randint(1, len(testingSet))-1
        # remove index row from copy and
        # add it to training set
        trainingSet.append(testingSet.pop(index))
    return (trainingSet, testingSet)

# Separate dataset by their class labels
def separateByClass(dataset):
    separated = {}
    for i in range (len(dataset)):
        row = dataset[i]
       # print("row:", df.iloc[row])
        # check if class label is in dictionary
        if (row[-1] not in separated):
            # empty entry for class label
            separated[row[-1]] = []
        # add row to its class label dataset
        separated[row[-1]].append(row)
    return separated

# Return mean of values
def mean(values):
    return sum(values)/float(len(values))

# Return standard deviation of values
def std_dev(values):
    avg = mean(values)
    variance = 0
    # calculate variance
    for i in range (0, len(values), 1):
        variance = values[i]**2
    variance = ( (variance/float((len(values)-1))) - (avg**2) )
    return math.sqrt(variance)

# Summarize dataset by calculating mean and
# standard deviation for each attribute
def summarize(df):
    import numpy as np
    summaries = [(np.mean(column), np.std(column)) for column in zip(*df)]
    #delete class label from summaries
    del summaries[-1]
    return summaries

# Summarize dataset by class labels
def summarizeByClass(df):
    separated = separateByClass(df)
    summaries = {}
    # separate summaries by class labels
    for classLabel, instance in separated.items():
        summaries[classLabel] = summarize(instance)
    return summaries

def convertStr(listX):
    for i in range(len(listX)):
        listX[i] = [float(x) for x in listX[i]]
    return listX

#calculate guassian probalbility
def calculate_guassian(value, mean , std):
    import math
    exp = math.exp(-(math.pow(value-mean,2)/(2*math.pow(std,2))))
    return (1/(math.sqrt(2*math.pi)* std)) * exp

# a dictionary of summaries  is passed,along with a list values
#for every class_summaries 
#make sure that probability is initialized with one to avoid probalilities of zero
#for every  value class summary 
#get the mean and std of their repective column 
#get their value for that column 
#calculate the guassian probability and iteratively multiply for each column for a tuple.
def class_probability(summaries, inputVector):
    probabilities = {}
    for classValue , classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, std = classSummaries[i]
            value = inputVector[i]
            probabilities[classValue] *= calculate_guassian(value,mean, std)
    return probabilities

#find class probabilities given class summaries and input vector
#find the class probability for each tuple
#pick the maxi-probability class for that tuple
def predict_class(summaries, inputVectors):
    probabilities = class_probability(summaries,inputVectors)
    bestLabel, bestProb = None, -1;
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

#Transform the data back to it normal form after splitD
def Transform(index, data):
    superList = []
    for i in range(0, len(data), 1):
        superList.append(list(data.iloc[i]))
    return superList
#get the class values predicted for
def getClass(class_summ, testSet):
    predictions = []
    for i in range(0,len(testSet), 1):
        predictX = predict_class(class_summ, testSet[i])
        predictions.append(predictX)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

def MakeUnseen(dataset):
    for i in range(0, len(dataset), 1):
        dataset[i][-1] = '?'
    return dataset   

# Data frames for their respective data set
adult_df = pd.read_csv("Adult_DS.csv")
bupa_df = pd.read_csv("bupa_DS.csv")

# Naive Bayes Classifier
# Create list of attribtues for each DS
adult_attributes = list(adult_df.columns[:14])
bupa_attributes = list(bupa_df.columns[:6])

adult_mapping = {}        # hash table for adult attributes
adult_df2, adult_mapping, adult_classes = map_stringAttributes(adult_df, adult_mapping, "annual-income")
bupa_classes = list(bupa_df["selector"].unique())    

# Test
splitRatio = 0.67
adult_trainingSet, adult_testingSet = splitDataset(adult_df2, splitRatio)
bupa_trainingSet, bupa_testingSet = splitDataset(bupa_df, splitRatio)

# training set data
adult_training = Transform(adult_trainingSet,adult_df2)
bupa_training = Transform(bupa_trainingSet, bupa_df)

#testing set data 
adult_testingWC = Transform(adult_testingSet,adult_df2)
bupa_testingWC = Transform(bupa_testingSet, bupa_df)

#testing set data >>>> this hold the data,to avoid variable name conflict 
adult_trans2 = Transform(adult_testingSet,adult_df2)
bupa_trans2 = Transform(bupa_testingSet, bupa_df)

#tesing set without a class label ; can print them to see what they are.
adult_testingNC = MakeUnseen(adult_trans2)
bupa_testingNC = MakeUnseen(bupa_trans2)

# Get class summaries for each class label
adult_class_sum = summarizeByClass(adult_training)
bupa_class_sum = summarizeByClass(bupa_training)

# Compute accuracies for datasets
adult_r = getClass(adult_class_sum, adult_testingNC)
adult_accuracy =  getAccuracy(adult_testingWC, adult_r)
bupa_r = getClass(bupa_class_sum, bupa_testingNC)
bupa_accuracy = getAccuracy(bupa_testingWC, bupa_r)


print("For Adult Dataset")
print("Accuracy from implementation: {}%".format(adult_accuracy), end="\n\n")

print("For BUPA Dataset")
print("Accuracy from implementation: {}%".format(bupa_accuracy), end="\n\n")


For Adult Dataset
Accuracy from implementation: 82.3039832928964%

For BUPA Dataset
Accuracy from implementation: 51.30434782608696%

