In [1]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import pandas as pd
import numpy as np
import csv

### Load dataset

In [3]:
data = datasets.load_iris()
x = data.data
y = data.target

In [74]:
hi,bin_edges = np.histogram(x, bins='auto')
bin_edges

array([0.1       , 0.80909091, 1.51818182, 2.22727273, 2.93636364,
       3.64545455, 4.35454545, 5.06363636, 5.77272727, 6.48181818,
       7.19090909, 7.9       ])

### KFolds

In [156]:
folds = 10
kf = KFold(n_splits=folds, shuffle=True)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=True)


In [165]:
sum = 0
sum_f1 = 0
for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = GaussianNB().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    errors = (x_test.shape[0],(y_test != y_pred).sum())
    accuracy = 100-errors[1]/errors[0]*100
    sum += accuracy
    sum_f1 += f1_score(y_test, y_pred, average='macro')
average = sum/folds
average_f1 = sum_f1/folds
print('Accuracy:', average)
print('F1:', average_f1)

Accuracy: 95.33333333333334
F1: 0.9463247863247866


### StratifiedKFolds

In [143]:
folds = 10
skf = StratifiedKFold(n_splits=folds, random_state=11)
print(skf)

StratifiedKFold(n_splits=10, random_state=11, shuffle=False)


In [146]:
sum = 0
sum_f1 = 0
for train_index, test_index in skf.split(x,y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = GaussianNB().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    errors = (x_test.shape[0],(y_test != y_pred).sum())
    accuracy = 100-errors[1]/errors[0]*100
    sum += accuracy
    sum_f1 += f1_score(y_test, y_pred, average='macro')
average = sum/folds
average_f1 = sum_f1/folds
print('Accuracy:', average)
print('F1:', average_f1)

Accuracy: 95.33333333333333
F1: 0.9524410774410773


### Splitting 90/10 

In [7]:
test_train_size = 0.1
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_train_size, shuffle=True)
# create model
model = GaussianNB()
fit = model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test))

Accuracy: 1.0
