In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
data_iris = load_iris()
df_iris = pd.DataFrame(data_iris.data, columns=data_iris.feature_names)
df_iris['target'] = data_iris.target
df_iris['target_names'] = df_iris['target'].apply(lambda x: data_iris.target_names[x])
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


In [3]:
#count the number of each target
df_iris['target_names'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: target_names, dtype: int64

In [4]:
features = [col for col in df_iris.columns if col not in ["target", "target_names"]]
df_iris_features = df_iris[features]

In [5]:
training_features, testing_features, training_labels, testing_labels = train_test_split(df_iris_features, df_iris['target'], test_size=0.3)

In [6]:
# find mean and standard deviation for each feature
training_features.groupby(training_labels).agg(['mean', 'std'])

Unnamed: 0_level_0,sepal length (cm),sepal length (cm),sepal width (cm),sepal width (cm),petal length (cm),petal length (cm),petal width (cm),petal width (cm)
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,4.993939,0.359635,3.409091,0.417854,1.463636,0.183402,0.248485,0.114895
1,5.9475,0.528659,2.795,0.313745,4.3075,0.484841,1.3425,0.204923
2,6.5875,0.535664,2.984375,0.246406,5.584375,0.454447,2.059375,0.240777


In [7]:
# find mean and standard deviation for setosa
mean_setosa = training_features[training_labels == 0].mean()
std_setosa = training_features[training_labels == 0].std()

# find mean and standard deviation for versicolor
mean_versicolor = training_features[training_labels == 1].mean()
std_versicolor = training_features[training_labels == 1].std()

# find mean and standard deviation for virginica
mean_virginica = training_features[training_labels == 2].mean()
std_virginica = training_features[training_labels == 2].std()


In [8]:
#setosa probability
p_setosa_sepal_length = (1/(std_setosa[0]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal length (cm)']-mean_setosa[0])**2)/(2*std_setosa[0]**2))
p_setosa_sepal_width = (1/(std_setosa[1]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal width (cm)']-mean_setosa[1])**2)/(2*std_setosa[1]**2))
p_setosa_petal_length = (1/(std_setosa[2]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal length (cm)']-mean_setosa[2])**2)/(2*std_setosa[2]**2))
p_setosa_petal_width = (1/(std_setosa[3]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal width (cm)']-mean_setosa[3])**2)/(2*std_setosa[3]**2))
p_setosa = (1/3)*p_setosa_sepal_length*p_setosa_sepal_width*p_setosa_petal_length*p_setosa_petal_width

In [9]:
#versicolor probability
p_versicolor_sepal_length = (1/(std_versicolor[0]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal length (cm)']-mean_versicolor[0])**2)/(2*std_versicolor[0]**2))
p_versicolor_sepal_width = (1/(std_versicolor[1]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal width (cm)']-mean_versicolor[1])**2)/(2*std_versicolor[1]**2))
p_versicolor_petal_length = (1/(std_versicolor[2]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal length (cm)']-mean_versicolor[2])**2)/(2*std_versicolor[2]**2))
p_versicolor_petal_width = (1/(std_versicolor[3]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal width (cm)']-mean_versicolor[3])**2)/(2*std_versicolor[3]**2))
p_versicolor = (1/3)*p_versicolor_sepal_length*p_versicolor_sepal_width*p_versicolor_petal_length*p_versicolor_petal_width

In [10]:
#virginica probability
p_virginica_sepal_length = (1/(std_virginica[0]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal length (cm)']-mean_virginica[0])**2)/(2*std_virginica[0]**2))
p_virginica_sepal_width = (1/(std_virginica[1]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['sepal width (cm)']-mean_virginica[1])**2)/(2*std_virginica[1]**2))  
p_virginica_petal_length = (1/(std_virginica[2]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal length (cm)']-mean_virginica[2])**2)/(2*std_virginica[2]**2))
p_virginica_petal_width = (1/(std_virginica[3]*np.sqrt(2*np.pi)))*np.exp(-((testing_features['petal width (cm)']-mean_virginica[3])**2)/(2*std_virginica[3]**2))
p_virginica = (1/3)*p_virginica_sepal_length*p_virginica_sepal_width*p_virginica_petal_length*p_virginica_petal_width

In [11]:
df_ans = pd.DataFrame()
df_ans['p_setosa'] = p_setosa
df_ans['p_versicolor'] = p_versicolor
df_ans['p_virginica'] = p_virginica
df_ans['target_names'] = testing_labels.apply(lambda x: data_iris.target_names[x])
# what column has the highest probability
df_ans['predicted'] = df_ans[['p_setosa', 'p_versicolor', 'p_virginica']].idxmax(axis=1)
df_ans['predicted'] = df_ans['predicted'].apply(lambda x: x.split('_')[1])
df_ans['correct'] = df_ans['target_names'] == df_ans['predicted']
df_ans

Unnamed: 0,p_setosa,p_versicolor,p_virginica,target_names,predicted,correct
28,1.948608,8.115335000000001e-17,2.15153e-34,setosa,setosa,True
2,1.035055,1.080163e-17,4.514421e-36,setosa,setosa,True
87,1.5183109999999999e-78,0.1135715,2.479717e-06,versicolor,versicolor,True
99,1.1735699999999998e-64,0.4100276,3.737494e-06,versicolor,versicolor,True
16,0.1990169,6.828447e-17,1.223033e-34,setosa,setosa,True
75,2.584482e-82,0.1824193,0.0004597,versicolor,versicolor,True
54,7.725975e-94,0.1840948,0.002809667,versicolor,versicolor,True
82,3.1349680000000003e-55,0.259636,1.815596e-07,versicolor,versicolor,True
10,0.9923013,4.3567120000000005e-17,2.3886069999999997e-34,setosa,setosa,True
146,8.577875e-130,0.002349085,0.02576415,virginica,virginica,True


In [12]:
# tatal accuracy
from sklearn.metrics import accuracy_score
accuracy_percent = accuracy_score(df_ans['target_names'], df_ans['predicted']) * 100
print ("Accuracy: %5.2f%%" %accuracy_percent)

Accuracy: 91.11%


In [13]:
# use Guussian Naive Bayes from sklearn
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
model = classifier.fit(training_features, training_labels)
predictions_labels = model.predict(testing_features)
df_ans['GaussianNB'] = predictions_labels
df_ans['GaussianNB'] = df_ans['GaussianNB'].apply(lambda x: data_iris.target_names[x])
df_ans['correct_GaussianNB'] = df_ans['target_names'] == df_ans['GaussianNB']
accuracy_percent = accuracy_score(testing_labels, predictions_labels) * 100
print ("Accuracy: %5.2f%%" %accuracy_percent)
df_ans['check equal'] = df_ans['correct'] == df_ans['correct_GaussianNB']

Accuracy: 91.11%


In [14]:
if all(df_ans['check equal']):
    print('success')
else:
    print('fail')
df_ans

success


Unnamed: 0,p_setosa,p_versicolor,p_virginica,target_names,predicted,correct,GaussianNB,correct_GaussianNB,check equal
28,1.948608,8.115335000000001e-17,2.15153e-34,setosa,setosa,True,setosa,True,True
2,1.035055,1.080163e-17,4.514421e-36,setosa,setosa,True,setosa,True,True
87,1.5183109999999999e-78,0.1135715,2.479717e-06,versicolor,versicolor,True,versicolor,True,True
99,1.1735699999999998e-64,0.4100276,3.737494e-06,versicolor,versicolor,True,versicolor,True,True
16,0.1990169,6.828447e-17,1.223033e-34,setosa,setosa,True,setosa,True,True
75,2.584482e-82,0.1824193,0.0004597,versicolor,versicolor,True,versicolor,True,True
54,7.725975e-94,0.1840948,0.002809667,versicolor,versicolor,True,versicolor,True,True
82,3.1349680000000003e-55,0.259636,1.815596e-07,versicolor,versicolor,True,versicolor,True,True
10,0.9923013,4.3567120000000005e-17,2.3886069999999997e-34,setosa,setosa,True,setosa,True,True
146,8.577875e-130,0.002349085,0.02576415,virginica,virginica,True,virginica,True,True
