In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [113]:
data_iris = load_iris()
df_iris = pd.DataFrame(data_iris.data, columns=data_iris.feature_names)
df_iris['target'] = data_iris.target
df_iris['target_names'] = df_iris['target'].apply(lambda x: data_iris.target_names[x])
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


In [114]:
#count the number of each target
df_iris['target_names'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: target_names, dtype: int64

In [115]:
features = [col for col in df_iris.columns if col not in ["target", "target_names"]]
df_iris_features = df_iris[features]

In [116]:
training_features, testing_features, training_labels, testing_labels = train_test_split(df_iris_features, df_iris['target'], test_size=0.3)

In [117]:
# find mean and standard deviation for each feature
training_features.groupby(training_labels).agg(['mean', 'std'])

Unnamed: 0_level_0,sepal length (cm),sepal length (cm),sepal width (cm),sepal width (cm),petal length (cm),petal length (cm),petal width (cm),petal width (cm)
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,4.986111,0.336497,3.441667,0.360456,1.491667,0.169664,0.252778,0.118288
1,5.954054,0.503099,2.740541,0.321852,4.283784,0.472296,1.332432,0.209568
2,6.609375,0.574377,3.01875,0.306318,5.60625,0.558187,2.05625,0.258953


In [118]:
# find mean and standard deviation for setosa
mean_setosa = training_features[training_labels == 0].mean()
std_setosa = training_features[training_labels == 0].std()

# find mean and standard deviation for versicolor
mean_versicolor = training_features[training_labels == 1].mean()
std_versicolor = training_features[training_labels == 1].std()

# find mean and standard deviation for virginica
mean_virginica = training_features[training_labels == 2].mean()
std_virginica = training_features[training_labels == 2].std()


In [119]:
#setosa probability
p_setosa_sepal_length = (1/(std_setosa[0]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal length (cm)']-mean_setosa[0])**2)/(2*std_setosa[0]**2))
p_setosa_sepal_width = (1/(std_setosa[1]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal width (cm)']-mean_setosa[1])**2)/(2*std_setosa[1]**2))
p_setosa_petal_length = (1/(std_setosa[2]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal length (cm)']-mean_setosa[2])**2)/(2*std_setosa[2]**2))
p_setosa_petal_width = (1/(std_setosa[3]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal width (cm)']-mean_setosa[3])**2)/(2*std_setosa[3]**2))
p_setosa = (1/3)*p_setosa_sepal_length*p_setosa_sepal_width*p_setosa_petal_length*p_setosa_petal_width

In [120]:
#versicolor probability
p_versicolor_sepal_length = (1/(std_versicolor[0]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal length (cm)']-mean_versicolor[0])**2)/(2*std_versicolor[0]**2))
p_versicolor_sepal_width = (1/(std_versicolor[1]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal width (cm)']-mean_versicolor[1])**2)/(2*std_versicolor[1]**2))
p_versicolor_petal_length = (1/(std_versicolor[2]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal length (cm)']-mean_versicolor[2])**2)/(2*std_versicolor[2]**2))
p_versicolor_petal_width = (1/(std_versicolor[3]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal width (cm)']-mean_versicolor[3])**2)/(2*std_versicolor[3]**2))
p_versicolor = (1/3)*p_versicolor_sepal_length*p_versicolor_sepal_width*p_versicolor_petal_length*p_versicolor_petal_width

In [121]:
#virginica probability
p_virginica_sepal_length = (1/(std_virginica[0]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal length (cm)']-mean_virginica[0])**2)/(2*std_virginica[0]**2))
p_virginica_sepal_width = (1/(std_virginica[1]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal width (cm)']-mean_virginica[1])**2)/(2*std_virginica[1]**2))  
p_virginica_petal_length = (1/(std_virginica[2]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal length (cm)']-mean_virginica[2])**2)/(2*std_virginica[2]**2))
p_virginica_petal_width = (1/(std_virginica[3]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal width (cm)']-mean_virginica[3])**2)/(2*std_virginica[3]**2))
p_virginica = (1/3)*p_virginica_sepal_length*p_virginica_sepal_width*p_virginica_petal_length*p_virginica_petal_width

In [122]:
df_ans = pd.DataFrame()
df_ans['p_setosa'] = p_setosa
df_ans['p_versicolor'] = p_versicolor
df_ans['p_virginica'] = p_virginica
df_ans['target_names'] = testing_labels.apply(lambda x: data_iris.target_names[x])
# what column has the highest probability
df_ans['predicted'] = df_ans[['p_setosa', 'p_versicolor', 'p_virginica']].idxmax(axis=1)
df_ans['predicted'] = df_ans['predicted'].apply(lambda x: x.split('_')[1])
df_ans['correct'] = df_ans['target_names'] == df_ans['predicted']
df_ans

Unnamed: 0,p_setosa,p_versicolor,p_virginica,target_names,predicted,correct
50,3.893125e-106,0.01410465,0.002386116,versicolor,versicolor,True
16,0.1766279,4.727952e-17,8.999185999999999e-26,setosa,setosa,True
95,9.544e-71,0.2701664,1.673457e-05,versicolor,versicolor,True
106,1.860352e-102,0.008584075,5.143374e-05,virginica,versicolor,False
101,1.0553639999999999e-142,0.002861606,0.03955125,virginica,virginica,True
49,2.509739,7.054189e-17,1.395237e-26,setosa,setosa,True
79,1.9386739999999998e-41,0.03023218,7.351586e-09,versicolor,versicolor,True
84,2.9816260000000003e-93,0.135785,0.0005044074,versicolor,versicolor,True
0,2.529174,2.8223630000000005e-17,9.931213e-27,setosa,setosa,True
47,1.122099,1.861942e-17,1.9903660000000003e-27,setosa,setosa,True


In [123]:
# tatal accuracy
from sklearn.metrics import accuracy_score
accuracy_percent = accuracy_score(df_ans['target_names'], df_ans['predicted']) * 100
print ("Accuracy: %5.2f%%" %accuracy_percent)

Accuracy: 91.11%


In [124]:
# use Guussian Naive Bayes from sklearn
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
model = classifier.fit(training_features, training_labels)
predictions_labels = model.predict(testing_features)
df_ans['GaussianNB'] = predictions_labels
df_ans['GaussianNB'] = df_ans['GaussianNB'].apply(lambda x: data_iris.target_names[x])
df_ans['correct_GaussianNB'] = df_ans['target_names'] == df_ans['GaussianNB']
accuracy_percent = accuracy_score(testing_labels, predictions_labels) * 100
print ("Accuracy: %5.2f%%" %accuracy_percent)
df_ans['check equal'] = df_ans['correct'] == df_ans['correct_GaussianNB']

Accuracy: 91.11%


In [126]:
if all(df_ans['check equal']):
    print('success')
df_ans

success


Unnamed: 0,p_setosa,p_versicolor,p_virginica,target_names,predicted,correct,GaussianNB,correct_GaussianNB,check equal
50,3.893125e-106,0.01410465,0.002386116,versicolor,versicolor,True,versicolor,True,True
16,0.1766279,4.727952e-17,8.999185999999999e-26,setosa,setosa,True,setosa,True,True
95,9.544e-71,0.2701664,1.673457e-05,versicolor,versicolor,True,versicolor,True,True
106,1.860352e-102,0.008584075,5.143374e-05,virginica,versicolor,False,versicolor,False,True
101,1.0553639999999999e-142,0.002861606,0.03955125,virginica,virginica,True,virginica,True,True
49,2.509739,7.054189e-17,1.395237e-26,setosa,setosa,True,setosa,True,True
79,1.9386739999999998e-41,0.03023218,7.351586e-09,versicolor,versicolor,True,versicolor,True,True
84,2.9816260000000003e-93,0.135785,0.0005044074,versicolor,versicolor,True,versicolor,True,True
0,2.529174,2.8223630000000005e-17,9.931213e-27,setosa,setosa,True,setosa,True,True
47,1.122099,1.861942e-17,1.9903660000000003e-27,setosa,setosa,True,setosa,True,True
