In [1]:
import pandas as pd
import numpy as np
import csv
import os

In [2]:
inputFileName = "input.csv"
# The rewritten copy sits next to the input and gets a "_modified" suffix.
input_stem, _ = os.path.splitext(inputFileName)
outputFileName = input_stem + "_modified.csv"

# Standardised header: an index column, three predictor columns, then the response.
NEW_HEADER = ['index', 'predictor1', 'predictor2', 'predictor3', 'response']

with open(inputFileName, newline='') as source_file, open(outputFileName, 'w', newline='') as target_file:
    reader = csv.reader(source_file)
    writer = csv.writer(target_file)

    next(reader, None)           # drop the original header line (no-op on an empty file)
    writer.writerow(NEW_HEADER)  # write the standardised header in its place
    writer.writerows(reader)     # copy every remaining data row unchanged

In [3]:
# The input file must list all labelled (training) rows first, followed by the unlabelled (test) rows

In [4]:
# Load the whole modified file; outputFileName is reused so this cell keeps
# working when inputFileName is changed.
df = pd.read_csv(outputFileName, header = 0)
len_df = len(df.index)  #Length of full dataset

# Rows with a missing response are the test rows. The count is a single
# vectorised call; no loop over the rows is needed, and the name is defined
# even when the file has no data rows.
test_index = df["response"].isnull().sum()
test_index

10

In [5]:
# Number of training rows: every row of the full dataset that is not a test row
# (a test row is one whose response is missing).
train_index = len_df - test_index   # train index = total length - test index
print(train_index)

10


In [6]:
# Training set: the first train_index data rows of the modified file (the
# remaining rows are the test set). outputFileName is reused so the path
# cannot drift from the file written earlier.
data = pd.read_csv(outputFileName, header = 0, nrows = train_index)
data

Unnamed: 0,index,predictor1,predictor2,predictor3,response
0,1,Yes,Single,125k,No
1,2,No,Married,100k,No
2,3,No,Single,70k,No
3,4,Yes,Married,120k,No
4,5,No,Divorced,95k,Yes
5,6,No,Married,60k,No
6,7,Yes,Divorced,220k,No
7,8,No,Single,85k,Yes
8,9,No,Married,75k,No
9,10,No,Single,90k,Yes


In [7]:
# Distinct levels of the first predictor, in order of first appearance in the training data
predictor1_unique_temp = data["predictor1"].unique()    # numpy array of the levels
predictor1_unique = predictor1_unique_temp.tolist()     # same levels as a plain list
len_pred1 = len(predictor1_unique)                      # number of levels, used as a loop bound later
print(predictor1_unique)

['Yes', 'No']


In [8]:
# Distinct levels of the second predictor, in order of first appearance in the training data
predictor2_unique_temp = data["predictor2"].unique()    # numpy array of the levels
predictor2_unique = predictor2_unique_temp.tolist()     # same levels as a plain list
len_pred2 = len(predictor2_unique)                      # number of levels, used as a loop bound later
print(predictor2_unique)

['Single', 'Married', 'Divorced']


In [9]:
# Distinct levels of the third predictor, in order of first appearance in the training data
predictor3_unique_temp = data["predictor3"].unique()    # numpy array of the levels
predictor3_unique = predictor3_unique_temp.tolist()     # same levels as a plain list
len_pred3 = len(predictor3_unique)                      # number of levels, used as a loop bound later
print(predictor3_unique)

['125k', '100k', '70k', '120k', '95k', '60k', '220k', '85k', '75k', '90k']


In [10]:
# Frequency table for predictor 1: one row per level, one column per response
# class, plus the "All" margins holding the row and column totals.
predictor1_table = pd.crosstab(data["predictor1"], data["response"], margins = True)
predictor1_table

response,No,Yes,All
predictor1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,4,3,7
Yes,3,0,3
All,7,3,10


In [11]:
# Frequency table for predictor 2: one row per level, one column per response
# class, plus the "All" margins holding the row and column totals.
predictor2_table = pd.crosstab(data["predictor2"], data["response"], margins = True)
predictor2_table

response,No,Yes,All
predictor2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Divorced,1,1,2
Married,4,0,4
Single,2,2,4
All,7,3,10


In [12]:
# Frequency table for predictor 3: one row per level, one column per response
# class, plus the "All" margins holding the row and column totals.
predictor3_table = pd.crosstab(data["predictor3"], data["response"], margins = True)
predictor3_table

response,No,Yes,All
predictor3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100k,1,0,1
120k,1,0,1
125k,1,0,1
220k,1,0,1
60k,1,0,1
70k,1,0,1
75k,1,0,1
85k,0,1,1
90k,0,1,1
95k,0,1,1


In [13]:
#All the calculations below are based on these frequency tables
# .loc looks up a value in a table by its row and column labels (not by position)

In [14]:
# Class priors, read from the "All" margin row of the frequency table
# (the margin row is the same in every predictor table, so predictor 1's is used).
class_totals = predictor1_table.loc["All"]
P_predictor_Yes = class_totals["Yes"] / class_totals["All"]   #Probability of yes
P_predictor_No = class_totals["No"] / class_totals["All"]     #Probability of no

In [15]:
# Marginal probability P(level) of every predictor 1 level, in the same order
# as predictor1_unique so positions line up with the other lookup tuples.
# The level is used directly as the row label (no slicing), so non-string
# levels such as numbers work too.
predictor1_values = tuple(
    predictor1_table.loc[level, "All"] / predictor1_table.loc["All", "All"]
    for level in predictor1_unique
)

predictor1_values

(0.3, 0.7)

In [16]:
# Class-conditional likelihoods of every predictor 1 level, in the same order
# as predictor1_unique. The level is used directly as the row label (no
# slicing), so non-string levels such as numbers work too.

# P(level | Yes): count of the level among "Yes" rows / total "Yes" rows
predictor1_yes_values = tuple(
    predictor1_table.loc[level, "Yes"] / predictor1_table.loc["All", "Yes"]
    for level in predictor1_unique
)
# P(level | No): count of the level among "No" rows / total "No" rows
predictor1_no_values = tuple(
    predictor1_table.loc[level, "No"] / predictor1_table.loc["All", "No"]
    for level in predictor1_unique
)

print(predictor1_yes_values, predictor1_no_values)

(0.0, 1.0) (0.42857142857142855, 0.5714285714285714)


In [17]:
# Bayes' theorem per predictor 1 level, in the order of predictor1_unique:
#   P(class | level) = P(level | class) * P(class) / P(level)

# P(Yes | level)
yes_predictor1_values = tuple(
    p_level_given_yes * P_predictor_Yes / p_level
    for p_level_given_yes, p_level in zip(predictor1_yes_values, predictor1_values)
)
# P(No | level)
no_predictor1_values = tuple(
    p_level_given_no * P_predictor_No / p_level
    for p_level_given_no, p_level in zip(predictor1_no_values, predictor1_values)
)

print(yes_predictor1_values, no_predictor1_values)

(0.0, 0.4285714285714286) (1.0, 0.5714285714285714)


In [18]:
# Marginal probability P(level) of every predictor 2 level, in the same order
# as predictor2_unique so positions line up with the other lookup tuples.
# The level is used directly as the row label (no slicing), so non-string
# levels such as numbers work too.
predictor2_values = tuple(
    predictor2_table.loc[level, "All"] / predictor2_table.loc["All", "All"]
    for level in predictor2_unique
)

predictor2_values

(0.4, 0.4, 0.2)

In [19]:
# Class-conditional likelihoods of every predictor 2 level, in the same order
# as predictor2_unique. The level is used directly as the row label (no
# slicing), so non-string levels such as numbers work too.

# P(level | Yes): count of the level among "Yes" rows / total "Yes" rows
predictor2_yes_values = tuple(
    predictor2_table.loc[level, "Yes"] / predictor2_table.loc["All", "Yes"]
    for level in predictor2_unique
)
# P(level | No): count of the level among "No" rows / total "No" rows
predictor2_no_values = tuple(
    predictor2_table.loc[level, "No"] / predictor2_table.loc["All", "No"]
    for level in predictor2_unique
)

print(predictor2_yes_values, predictor2_no_values)

(0.6666666666666666, 0.0, 0.3333333333333333) (0.2857142857142857, 0.5714285714285714, 0.14285714285714285)


In [20]:
# Bayes' theorem per predictor 2 level, in the order of predictor2_unique:
#   P(class | level) = P(level | class) * P(class) / P(level)

# P(Yes | level)
yes_predictor2_values = tuple(
    p_level_given_yes * P_predictor_Yes / p_level
    for p_level_given_yes, p_level in zip(predictor2_yes_values, predictor2_values)
)
# P(No | level)
no_predictor2_values = tuple(
    p_level_given_no * P_predictor_No / p_level
    for p_level_given_no, p_level in zip(predictor2_no_values, predictor2_values)
)

print(yes_predictor2_values, no_predictor2_values)

(0.49999999999999994, 0.0, 0.49999999999999994) (0.49999999999999994, 0.9999999999999999, 0.49999999999999994)


In [21]:
# Marginal probability P(level) of every predictor 3 level, in the same order
# as predictor3_unique so positions line up with the other lookup tuples.
# The level is used directly as the row label (no slicing), so non-string
# levels such as numbers work too.
predictor3_values = tuple(
    predictor3_table.loc[level, "All"] / predictor3_table.loc["All", "All"]
    for level in predictor3_unique
)

predictor3_values

(0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1)

In [22]:
# Class-conditional likelihoods of every predictor 3 level, in the same order
# as predictor3_unique. The level is used directly as the row label (no
# slicing), so non-string levels such as numbers work too.

# P(level | Yes): count of the level among "Yes" rows / total "Yes" rows
predictor3_yes_values = tuple(
    predictor3_table.loc[level, "Yes"] / predictor3_table.loc["All", "Yes"]
    for level in predictor3_unique
)
# P(level | No): count of the level among "No" rows / total "No" rows
predictor3_no_values = tuple(
    predictor3_table.loc[level, "No"] / predictor3_table.loc["All", "No"]
    for level in predictor3_unique
)

print(predictor3_yes_values, predictor3_no_values)

(0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0, 0.3333333333333333, 0.0, 0.3333333333333333) (0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.0, 0.14285714285714285, 0.14285714285714285, 0.0, 0.14285714285714285, 0.0)


In [23]:
# Bayes' theorem per predictor 3 level, in the order of predictor3_unique:
#   P(class | level) = P(level | class) * P(class) / P(level)

# P(Yes | level)
yes_predictor3_values = tuple(
    p_level_given_yes * P_predictor_Yes / p_level
    for p_level_given_yes, p_level in zip(predictor3_yes_values, predictor3_values)
)
# P(No | level)
no_predictor3_values = tuple(
    p_level_given_no * P_predictor_No / p_level
    for p_level_given_no, p_level in zip(predictor3_no_values, predictor3_values)
)

print(yes_predictor3_values, no_predictor3_values)

(0.0, 0.0, 0.0, 0.0, 0.9999999999999999, 0.0, 0.0, 0.9999999999999999, 0.0, 0.9999999999999999) (0.9999999999999999, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, 0.0, 0.9999999999999999, 0.9999999999999999, 0.0, 0.9999999999999999, 0.0)


In [24]:
# Test set: skip the header line plus the train_index training rows; whatever
# remains in the file is read as test data. outputFileName is reused so the
# path cannot drift from the file written earlier.
test = pd.read_csv(outputFileName, header = None, skiprows = (train_index + 1))
# The header line was skipped, so restore the column names used by the training frame
test.columns = ['index', 'predictor1', 'predictor2', 'predictor3', 'response']
test

Unnamed: 0,index,predictor1,predictor2,predictor3,response
0,11,Yes,Single,125k,
1,12,No,Married,100k,
2,13,No,Single,70k,
3,14,Yes,Married,120k,
4,15,No,Divorced,95k,
5,16,No,Married,60k,
6,17,Yes,Divorced,220k,
7,18,No,Single,85k,
8,19,No,Married,75k,
9,20,No,Single,90k,


In [25]:
# Number of rows in the testing dataset
rows = len(test)
rows

10

In [26]:
# Likelihood lookups per predictor column: the levels seen in training and the
# matching P(level | Yes) / P(level | No) tuples, which all share one ordering.
likelihood_lookups = (
    ("predictor1", predictor1_unique, predictor1_yes_values, predictor1_no_values),
    ("predictor2", predictor2_unique, predictor2_yes_values, predictor2_no_values),
    ("predictor3", predictor3_unique, predictor3_yes_values, predictor3_no_values),
)

for i in range(rows):
    #Running a loop through each and every row and calculating the prediction by Naive Bayes
    row = test.iloc[i]

    # Product of the per-predictor likelihoods for each class
    likelihood_yes = 1.0
    likelihood_no = 1.0
    for column, levels, yes_values, no_values in likelihood_lookups:
        try:
            position = levels.index(row[column])
        except ValueError:
            # Level never seen in training: there is no likelihood estimate for
            # it, so this predictor is left out of the product instead of
            # aborting the whole loop.
            continue
        likelihood_yes = likelihood_yes * yes_values[position]
        likelihood_no = likelihood_no * no_values[position]

    #Naive Bayes in case of yes
    test1_yes = likelihood_yes * P_predictor_Yes

    #Naive Bayes in case of no
    test1_no = likelihood_no * P_predictor_No

    evidence = test1_yes + test1_no
    if evidence > 0:
        y = test1_yes / evidence    #Prediction of yes in a row
        n = test1_no / evidence     #Prediction of no in a row
    else:
        # Both class scores are zero (no smoothing is applied to the counts),
        # so normalising would be 0/0; fall back to the class priors.
        y = P_predictor_Yes
        n = P_predictor_No

    print(y, n)

    #Modifying the dataframe and inserting 1 or 0 in place of NaN values
    # (an exact tie matches neither branch and leaves the row's response untouched)
    if y > n:
        test.at[i, "response"] = 1  # 1 means yes
    elif n > y:
        test.at[i, "response"] = 0 # 0 means no

0.0 1.0
0.0 1.0
0.0 1.0
0.0 1.0
1.0 0.0
0.0 1.0
0.0 1.0
1.0 0.0
0.0 1.0
1.0 0.0


In [27]:
# Display the test frame, whose response column now holds the predictions (1 = yes, 0 = no)
test

Unnamed: 0,index,predictor1,predictor2,predictor3,response
0,11,Yes,Single,125k,0.0
1,12,No,Married,100k,0.0
2,13,No,Single,70k,0.0
3,14,Yes,Married,120k,0.0
4,15,No,Divorced,95k,1.0
5,16,No,Married,60k,0.0
6,17,Yes,Divorced,220k,0.0
7,18,No,Single,85k,1.0
8,19,No,Married,75k,0.0
9,20,No,Single,90k,1.0


In [28]:
# The test rows are the training rows without the actual answer, so the
# predictions can be compared position by position with the known training
# labels to measure accuracy.

# Training labels encoded as 1 ("Yes") / 0 (anything else)
train_list = tuple(
    1 if data.response[k] == "Yes" else 0
    for k in range(train_index)
)

# Predictions encoded as 1 (predicted yes) / 0 (anything else)
test_list = tuple(
    1 if test.response[k] == 1.0 else 0
    for k in range(test_index)
)

print("Train list - ", train_list)
print("Test list - ", test_list)

Train list -  (0, 0, 0, 0, 1, 0, 0, 1, 0, 1)
Test list -  (0, 0, 0, 0, 1, 0, 0, 1, 0, 1)


In [29]:
# Comparing the data of test and train to find the accuracy
correct = 0
for i in range(len(test_list)):
	if test_list[i] == train_list[i]:
		correct += 1

accuracy = correct / len(test_list) * 100
print("Accuracy =", accuracy)

Accuracy = 100.0
