## Simple Application of Gaussian Naive Bayes
### Demo #1
* Iris data set
* No test set

In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB

dataframe = pd.read_csv('iris.csv', 
                        header=None, 
                        names=['SL','SW','PL','PW','Class'], 
                        index_col=False)

# Create Gaussian Naive Bayes Object
clf = GaussianNB()

# Separate the features X from the labels Y
array = dataframe.values
X = array[:,:4]
Y = array[:,4]

# fitting the model to the data
clf.fit(X, Y) 

# Predicting class labels from the features
predictions = clf.predict(X)

# print the accuracy
print("Accuracy:", sum(predictions == Y)/len(predictions) )

Accuracy: 0.96


### Demo #2
* Iris data set
* Splitting the dataset into training and test sets

In [2]:
from sklearn.model_selection import train_test_split

# Create Gaussian Naive Bayes Object
clf = GaussianNB()

# Set aside  data as a part of test set
tpropn = 0.2
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=tpropn)

# fitting the model to the training data
clf.fit(X_train, Y_train) 

# Predicting class labels from the features
predictions = clf.predict(X_test)

# print the accuracy
print("Accuracy:", sum(predictions == Y_test)/len(predictions) )


Accuracy: 0.9666666666666667


### Demo #3
* Confusion matrix

In [3]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(predictions, Y_test)
print(cm)

[[13  0  0]
 [ 0 10  1]
 [ 0  0  6]]


### Demo #4
* Cross-Validation to calculate performance metrics
    * accuracy, f1, precision, recall

In [4]:
from sklearn.model_selection import cross_val_score

metric_str = 'f1_micro'
new_clf = GaussianNB()
cv_results = cross_val_score(new_clf, X, Y, cv=10, scoring=metric_str)

print("CV results:", cv_results)

#displaying the mean and standard deviation of the cross_validation results
print("%s: %f (%f)" % ('GNB '+metric_str, cv_results.mean(), cv_results.std()))

CV results: [0.93333333 0.93333333 1.         0.93333333 0.93333333 0.93333333
 0.86666667 1.         1.         1.        ]
GNB f1_micro: 0.953333 (0.042687)


### Demo #5
* Cross-Validation to calculate performance metrics
* Creating a validation set 

In [5]:
from sklearn.model_selection import cross_val_score

# Set aside  data as a part of test set
tpropn = 0.5
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=tpropn)

metric_str = 'accuracy'
new_clf = GaussianNB()
cv_results = cross_val_score(new_clf, X_train, Y_train, cv=5, scoring=metric_str)

#displaying the mean and standard deviation of the cross_validation results
print("%s: %f (%f)" % ('CV '+metric_str, cv_results.mean(), cv_results.std()))

# Predicting class labels from the features
predictions = clf.predict(X_test)

# print the accuracy
print("Validation Accuracy:", sum(predictions == Y_test)/len(predictions) )

CV accuracy: 0.946667 (0.049889)
Validation Accuracy: 0.96


### Demo #6
* Creating your own folds
     * StratifiedKFold, KFold

In [14]:
from sklearn import model_selection

kfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True)

#calling the cross validation function
cv_results = model_selection.cross_val_score(GaussianNB(), X, Y, cv=kfold, scoring=metric_str)

#displaying the mean and standard deviation of the prediction
msg = "%s: %f (%f)" % ('NB accuracy', cv_results.mean(), cv_results.std())
print(msg)


NB accuracy: 0.953333 (0.045216)
