In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [18]:
data_iris = load_iris()
df_iris = pd.DataFrame(data_iris.data, columns=data_iris.feature_names)
df_iris['target'] = data_iris.target
df_iris['target_names'] = df_iris['target'].apply(lambda x: data_iris.target_names[x])
#df_iris

In [3]:
#count the number of each target
df_iris['target_names'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: target_names, dtype: int64

In [4]:
features = [col for col in df_iris.columns if col not in ["target", "target_names"]]
df_iris_features = df_iris[features]

In [5]:
training_features, testing_features, training_labels, testing_labels = train_test_split(df_iris_features, df_iris['target'], test_size=0.3)

In [6]:
# find mean and standard deviation for each feature
training_features.groupby(training_labels).agg(['mean', 'std'])

Unnamed: 0_level_0,sepal length (cm),sepal length (cm),sepal width (cm),sepal width (cm),petal length (cm),petal length (cm),petal width (cm),petal width (cm)
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,5.002703,0.357838,3.435135,0.40498,1.462162,0.191994,0.251351,0.109599
1,5.92,0.458771,2.754286,0.307115,4.251429,0.453335,1.317143,0.196268
2,6.569697,0.676149,2.981818,0.340454,5.542424,0.537953,1.987879,0.279237


In [7]:
mean_setosa, mean_versicolor, mean_virginica = training_features.groupby(training_labels).mean().values
std_setosa, std_versicolor, std_virginica = training_features.groupby(training_labels).std().values

In [8]:
def gaussian(x, mean, std):
    return 1/(np.sqrt(2*np.pi)*std)*np.exp(-((x-mean)**2)/(2*std**2))
    
def predict(sepal_length, sepal_width, petal_length, petal_width):
    return (1/3)*sepal_length*sepal_width*petal_length*petal_width

In [9]:
#setosa probability
p_setosa_sepal_length, p_setosa_sepal_width = gaussian(testing_features['sepal length (cm)'], mean_setosa[0], std_setosa[0]), gaussian(testing_features['sepal width (cm)'], mean_setosa[1], std_setosa[1])
p_setosa_petal_length, p_setosa_petal_width = gaussian(testing_features['petal length (cm)'], mean_setosa[2], std_setosa[2]), gaussian(testing_features['petal width (cm)'], mean_setosa[3], std_setosa[3])
p_setosa = predict(p_setosa_sepal_length, p_setosa_sepal_width, p_setosa_petal_length, p_setosa_petal_width)

#versicolor probability
p_versicolor_sepal_length, p_versicolor_sepal_width = gaussian(testing_features['sepal length (cm)'], mean_versicolor[0], std_versicolor[0]), gaussian(testing_features['sepal width (cm)'], mean_versicolor[1], std_versicolor[1])
p_versicolor_petal_length, p_versicolor_petal_width = gaussian(testing_features['petal length (cm)'], mean_versicolor[2], std_versicolor[2]), gaussian(testing_features['petal width (cm)'], mean_versicolor[3], std_versicolor[3])
p_versicolor = predict(p_versicolor_sepal_length, p_versicolor_sepal_width, p_versicolor_petal_length, p_versicolor_petal_width)

#virginica probability
p_virginica_sepal_length, p_virginica_sepal_width = gaussian(testing_features['sepal length (cm)'], mean_virginica[0], std_virginica[0]), gaussian(testing_features['sepal width (cm)'], mean_virginica[1], std_virginica[1])
p_virginica_petal_length, p_virginica_petal_width = gaussian(testing_features['petal length (cm)'], mean_virginica[2], std_virginica[2]), gaussian(testing_features['petal width (cm)'], mean_virginica[3], std_virginica[3])
p_virginica = predict(p_virginica_sepal_length, p_virginica_sepal_width, p_virginica_petal_length, p_virginica_petal_width)

In [14]:
df_ans = pd.DataFrame()
df_ans['p_setosa'] = p_setosa
df_ans['p_versicolor'] = p_versicolor
df_ans['p_virginica'] = p_virginica
df_ans['target_names'] = testing_labels.apply(lambda x: data_iris.target_names[x])
# what column has the highest probability
df_ans['predicted'] = df_ans[['p_setosa', 'p_versicolor', 'p_virginica']].idxmax(axis=1)
df_ans['predicted'] = df_ans['predicted'].apply(lambda x: x.split('_')[1])
df_ans['correct'] = df_ans['target_names'] == df_ans['predicted']
#df_ans

In [15]:
# tatal accuracy
from sklearn.metrics import accuracy_score
accuracy_percent = accuracy_score(df_ans['target_names'], df_ans['predicted']) * 100
print ("Accuracy ['NB from Scratch']: %5.2f%%" %accuracy_percent)

Accuracy: 95.56%


In [16]:
# use Guussian Naive Bayes from sklearn
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
model = classifier.fit(training_features, training_labels)
predictions_labels = model.predict(testing_features)
df_ans['GaussianNB'] = predictions_labels
df_ans['GaussianNB'] = df_ans['GaussianNB'].apply(lambda x: data_iris.target_names[x])
df_ans['correct_GaussianNB'] = df_ans['target_names'] == df_ans['GaussianNB']
accuracy_percent = accuracy_score(testing_labels, predictions_labels) * 100
print ("Accuracy ['sklearn.naive_bayes']: %5.2f%%" %accuracy_percent)
df_ans['check equal'] = df_ans['correct'] == df_ans['correct_GaussianNB']

Accuracy: 95.56%


In [17]:
if all(df_ans['check equal']):
    print('success')
else:
    print('fail')
df_ans

success


Unnamed: 0,p_setosa,p_versicolor,p_virginica,target_names,predicted,correct,GaussianNB,correct_GaussianNB,check equal
10,1.060811,2.858879e-18,4.055088e-24,setosa,setosa,True,setosa,True,True
120,3.970375e-188,5.217594e-10,0.09052417,virginica,virginica,True,virginica,True,True
6,1.259243,4.444048e-18,2.556077e-24,setosa,setosa,True,setosa,True,True
95,8.754129e-62,0.3624683,8.848839e-05,versicolor,versicolor,True,versicolor,True,True
25,1.076489,2.253605e-16,4.4911340000000005e-23,setosa,setosa,True,setosa,True,True
142,3.587562e-129,0.001351516,0.06153739,virginica,virginica,True,virginica,True,True
136,1.850088e-187,1.541309e-10,0.03548549,virginica,virginica,True,virginica,True,True
84,2.374202e-83,0.1434184,0.001815446,versicolor,versicolor,True,versicolor,True,True
145,1.355848e-163,4.628407e-08,0.104648,virginica,virginica,True,virginica,True,True
7,2.424107,9.142951e-18,5.323698e-24,setosa,setosa,True,setosa,True,True
