***IRIS FLOWER CLASSIFICATION USING NAIVE BAYES ALGORITHM***

***Importing Libraries***

In [1]:
import numpy as np
import pandas as pd 
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

***Importing Dataset***

In [2]:
df=pd.read_csv('IrisNB.csv')
print(df)

      Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm    species
0      1            5.1           3.5            1.4           0.2     setosa
1      2            4.9           3.0            1.4           0.2     setosa
2      3            4.7           3.2            1.3           0.2     setosa
3      4            4.6           3.1            1.5           0.2     setosa
4      5            5.0           3.6            1.4           0.2     setosa
..   ...            ...           ...            ...           ...        ...
145  146            6.7           3.0            5.2           2.3  virginica
146  147            6.3           2.5            5.0           1.9  virginica
147  148            6.5           3.0            5.2           2.0  virginica
148  149            6.2           3.4            5.4           2.3  virginica
149  150            5.9           3.0            5.1           1.8  virginica

[150 rows x 6 columns]


In [3]:
df.species.replace(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], [1, 2, 3], inplace=True)

In [4]:
clf = GaussianNB()
clf

***Initializing Gaussian Naive Bayes***

In [5]:
array = df.values
X = array[:,1:5]
Y = array[:,5]

***Considering One-third of data as a part of test set***

In [6]:
validation_size = 0.33
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [7]:
scoring = 'accuracy'

***Fitting the training set***

In [8]:
clf.fit(X_train, Y_train)

In [9]:
pred_clf = clf.predict(X_validation)
pred_clf

array(['virginica', 'versicolor', 'setosa', 'versicolor', 'versicolor',
       'setosa', 'virginica', 'versicolor', 'setosa', 'versicolor',
       'virginica', 'versicolor', 'setosa', 'virginica', 'setosa',
       'virginica', 'virginica', 'virginica', 'setosa', 'setosa',
       'versicolor', 'virginica', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'versicolor', 'setosa', 'virginica', 'versicolor',
       'setosa', 'setosa', 'setosa', 'setosa', 'virginica', 'virginica',
       'versicolor', 'virginica', 'virginica', 'versicolor', 'setosa',
       'versicolor', 'versicolor', 'virginica', 'setosa'], dtype='<U10')

***Prediction Probability***

In [10]:
prob_pos_clf = clf.predict_proba(X_validation)[:, 1]
prob_pos_clf

array([2.55970017e-02, 9.94743400e-01, 1.07108344e-17, 9.92960029e-01,
       9.83133501e-01, 6.67984413e-15, 3.73700413e-01, 9.99941349e-01,
       1.18131969e-17, 9.28258179e-01, 3.65749690e-02, 7.81290905e-01,
       1.13924567e-21, 3.32428395e-15, 3.68747037e-18, 5.82568059e-02,
       2.85289468e-04, 6.28632947e-04, 9.39134400e-19, 2.11693656e-18,
       9.99990875e-01, 2.06019866e-10, 9.91183239e-01, 9.83703907e-01,
       3.96279970e-04, 9.06382283e-02, 9.99444506e-01, 9.99999304e-01,
       1.06479432e-02, 5.57376556e-11, 1.07677068e-01, 9.99999696e-01,
       4.49185976e-20, 3.32826288e-07, 9.99079180e-01, 5.24494405e-16,
       2.86588278e-18, 7.22763977e-15, 3.10824704e-17, 1.15296954e-08,
       5.32665967e-08, 9.33513112e-01, 9.63556655e-08, 6.70206490e-04,
       9.99934547e-01, 4.01857910e-18, 9.79169998e-01, 9.99998886e-01,
       9.29543990e-04, 4.29329574e-17])

In [11]:
pred_clf_df = pd.DataFrame(pred_clf.reshape(50,1))
pred_clf_df

Unnamed: 0,0
0,virginica
1,versicolor
2,setosa
3,versicolor
4,versicolor
5,setosa
6,virginica
7,versicolor
8,setosa
9,versicolor


In [12]:
pred_clf_df.rename(columns={0:'Prediction'}, inplace=True)

***Reshaping the test dataset***

In [13]:
X_validation_df = pd.DataFrame(X_validation.reshape(50,4))
X_validation_df

Unnamed: 0,0,1,2,3
0,5.9,3.0,5.1,1.8
1,5.4,3.0,4.5,1.5
2,5.0,3.5,1.3,0.3
3,5.6,3.0,4.5,1.5
4,4.9,2.5,4.5,1.7
5,4.5,2.3,1.3,0.3
6,6.9,3.1,4.9,1.5
7,5.6,2.7,4.2,1.3
8,4.8,3.4,1.6,0.2
9,6.4,3.2,4.5,1.5


In [14]:
pred_outcome = pd.concat([X_validation_df, pred_clf_df], axis=1, )
join_axes=[X_validation_df.index]

pred_outcome.rename(columns = {0:'SepalLengthCm', 1:'SepalWidthCm', 2:'PetalLengthCm', 3:'PetalWidthCm'}, inplace=True)

del df['Id']

In [15]:
pred_comp = pd.merge(df,pred_outcome, on=['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm'])

***Printing top 10 lines of the Final Predictions***

In [16]:
print((pred_comp).head(10))

  SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm species Prediction
0           5.4          3.9           1.7          0.4  setosa     setosa
1           4.9          3.1           1.5          0.1  setosa     setosa
2           4.9          3.1           1.5          0.1  setosa     setosa
3           4.9          3.1           1.5          0.1  setosa     setosa
4           4.8          3.4           1.6          0.2  setosa     setosa
5           5.1          3.5           1.4          0.3  setosa     setosa
6           4.6          3.6           1.0          0.2  setosa     setosa
7           5.2          3.4           1.4          0.2  setosa     setosa
8           4.7          3.2           1.6          0.2  setosa     setosa
9           5.2          4.1           1.5          0.1  setosa     setosa


***Model Performance***

In [17]:
kfold = model_selection.KFold(n_splits=10, random_state=seed ,shuffle=True)

In [18]:
cv_results = model_selection.cross_val_score(GaussianNB(), X_train, Y_train, cv=kfold, scoring=scoring)

In [19]:
msg = "%s: %f (%f)" % ('NB accuracy', cv_results.mean()*100, cv_results.std())
print(msg)

NB accuracy: 97.000000 (0.045826)
