In [5]:
from numpy import log2 as log
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
import pprint

dataset = pd.read_csv('Bank_dataset.csv',usecols=['job', 'marital', 'education', 'housing', 'y'], delimiter=';')
dataPercentage = int(input("Enter the percentage of data to be read: "))
# calculate the number of rows to be read
rows = int((dataPercentage/100)*len(dataset))
# read only the required number of rows
dataset = dataset[:rows]
# divide the dataset into training and testing sets in 80:20 ratio
train, test = train_test_split(dataset, test_size=0.2)


In [6]:
# Calculate the prior probabilities
def priorProbabilities(train):
    yes = train[train['y'] == 'yes']
    no = train[train['y'] == 'no']
    yesProb = len(yes) / len(train)
    noProb = len(no) / len(train)
    return yesProb, noProb

#create a dictionary of conditional probabilities
def conditionalProbabilities(train):
    # get unique values for each feature
    features = {}
    for feature in train:
        features[feature] = train[feature].unique()

    # initialize conditional probabilities dictionary
    conditionalProb = {}
    for feature in features:
        conditionalProb[feature] = {}
        for value in features[feature]:
            conditionalProb[feature][value] = {"yes": {"count": 0, "prob": 0}, "no": {"count": 0, "prob": 0}}

    # calculate the conditional probabilities
    for feature in features:
        for value in features[feature]:
            yes = train[train['y'] == 'yes']
            no = train[train['y'] == 'no']
            for i in range(len(yes)):
                if yes[feature].iloc[i] == value:
                    conditionalProb[feature][value]["yes"]["count"] += 1
            for i in range(len(no)):
                if no[feature].iloc[i] == value:
                    conditionalProb[feature][value]["no"]["count"] += 1

    # calculate probabilities of each value given the output class
    for feature in features:
        for value in features[feature]:
            yes_count = sum([conditionalProb[feature][v]["yes"]["count"] for v in features[feature]])
            no_count = sum([conditionalProb[feature][v]["no"]["count"] for v in features[feature]])
            conditionalProb[feature][value]["yes"]["prob"] = conditionalProb[feature][value]["yes"]["count"] / yes_count
            conditionalProb[feature][value]["no"]["prob"] = conditionalProb[feature][value]["no"]["count"] / no_count
    return conditionalProb

# predict the output class for the test set
def naive_bayes():
    yesProb, noProb = priorProbabilities(train)
    condProb = conditionalProbabilities(train)
    predictions = []
    for i in range(len(test)):
        testFeatures = test.iloc[i, :-1]  # get the feature values for the test example
        yesPosterior = yesProb  # start with the prior probability for 'yes'
        noPosterior = noProb    # start with the prior probability for 'no'
        for feature, value in zip(testFeatures.index, testFeatures.values):
            yesPosterior *= condProb[feature][value]['yes']['prob']  # multiply by the conditional probability for 'yes'
            noPosterior *= condProb[feature][value]['no']['prob']    # multiply by the conditional probability for 'no'
        if yesPosterior > noPosterior:
            predictions.append('yes')
        else:
            predictions.append('no')
    return predictions

NB_Predictions = naive_bayes()
# print(NB_Predictions)


In [7]:
def find_entropy(dataset):
# calculate entropy of the dataset as a whole
    Class = dataset.keys()[-1] #output class -> yes or no
    entropy = 0
    values = dataset[Class].unique()
    for value in values:
        fraction = dataset[Class].value_counts()[value]/len(dataset[Class])
        entropy += -fraction*np.log2(fraction)
    return entropy

# calculate entropy of a specific attribute
def find_entropy_attribute(dataset, attribute):
    Class = dataset.keys()[-1] #output class 
    target_variables = dataset[Class].unique()  # This gives all 'Yes' and 'No'
    # This gives different features in that attribute (like 'Hot','Cold' in Temperature)
    variables = dataset[attribute].unique()
    entropy2 = 0
    # calculate entropy for each value in the attribute
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(dataset[attribute][dataset[attribute] == variable][dataset[Class] == target_variable])
            den = len(dataset[attribute][dataset[attribute] == variable])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps) #enropy for one feature value
        fraction2 = den/len(dataset)
        entropy2 += -fraction2*entropy #entropy for the whole attribute
    return abs(entropy2)


def find_winner(dataset):
    IG = []
    for key in dataset.keys()[:-1]:
        #         Entropy_att.append(find_entropy_attribute(dataset,key))
        IG.append(find_entropy(dataset) - find_entropy_attribute(dataset, key))
    return dataset.keys()[:-1][np.argmax(IG)]

#subtable is the subset of the data where the attribute value is the same
def get_subtable(dataset, node, value):
    return dataset[dataset[node] == value].reset_index(drop=True)


def buildTree(dataset, tree=None, max_depth=2):
    Class = dataset.keys()[-1]

    # Here we build our decision tree

    # Check if all instances in the subtable have the same class label
    if len(dataset[Class].unique()) == 1:
        return dataset[Class].iloc[0]

    # Check if maximum depth is reached
    if max_depth == 0:
        # Determine the majority class label in the current subset
        majority_class = dataset[Class].value_counts().idxmax()
        return majority_class

    # Get attribute with maximum information gain
    node = find_winner(dataset)

    # Get distinct values of that attribute
    attValues = np.unique(dataset[node])

    # Create an empty dictionary to represent the tree
    if tree is None:
        tree = {}
        tree[node] = {}  # This is the root node

    # Iterate over the attribute values and recursively build the tree
    for value in attValues:
        # Get subtable for the current value of the attribute
        subtable = get_subtable(dataset, node, value)
        # Check if the subset is empty
        if subtable.empty:
            # Determine the majority class label in the current subset
            majority_class = dataset[Class].value_counts().idxmax()
            tree[node][value] = majority_class
        else:
            # Decrement the maximum depth by 1
            sub_max_depth = max_depth - 1
            # Recursively build the tree using the subtable
            tree[node][value] = buildTree(subtable, max_depth=sub_max_depth)

    return tree

def predict(inst,tree):
    #This function is used to predict for any input variable     
    #Recursively we go through the tree that we built earlier
    
    for nodes in tree.keys(): # job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome  
        value = inst[nodes] #if node is job, value could be admin
        tree = tree[nodes][value] #get the decision or sub-tree that corresponds to that value of the node
        prediction = 0

        if type(tree) is dict:
            prediction = predict(inst, tree)
        else:
            prediction = tree
            break;                            
        
    return prediction

#build our tree and call predict to test it
tree = buildTree(dataset, max_depth=5)
# print the tree
# pprint.pprint(tree)

#predict for the whole test data
DS_Predictions = dataset.apply(predict, args=(tree,), axis=1)
print(DS_Predictions)
print("_________________")



0       no
1       no
2       no
3       no
4       no
        ..
4063    no
4064    no
4065    no
4066    no
4067    no
Length: 4068, dtype: object
_________________


In [8]:
# calculate the accuracy for each model 
def get_accuracy(predictions):
    correct = 0
    for i in range(len(test)):
        if test['y'].iloc[i] == predictions[i]:
            correct += 1
    accuracy = correct / len(test)
    return accuracy
# print the accuracy percentage
print("Model Accuraccy when Reading", dataPercentage,"% of the DataSet")
print("Naive Bayse Accuracy: ", get_accuracy(NB_Predictions)*100,"%")
print("Decision Tree Accuracy: ", get_accuracy(DS_Predictions)*100,"%")



Model Accuraccy when Reading 90 % of the DataSet
Naive Bayse Accuracy:  88.6977886977887 %
Decision Tree Accuracy:  87.83783783783784 %
