In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_validate
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
%matplotlib inline

In [26]:
# Load the dataset from a CSV file located next to the notebook.
# read_csv's defaults are used, so the first row of the file is taken as the header.
csv_path = 'pima-indians-diabetes.csv'
data = pd.read_csv(csv_path)

In [27]:
# Feature matrix: the first eight columns of `data`.
# An explicit copy makes X an independent frame, so the column assignments
# performed on X later do not operate on a slice of `data` (which triggers
# SettingWithCopyWarning in pandas without copy-on-write).
X = data.iloc[:, :8].copy()
# Target vector: the ninth column of `data`.
Y = data.iloc[:, 8]

In [28]:
# Discretization settings consumed by the discretization loop below.
discretisation_mode = 'QCUT'  # 'CUT', 'QCUT' or 'LOG'; any other value leaves columns unchanged
bins_count = 5  # number of bins requested per feature

# Stratified 5-fold splitter, passed as `cv` to cross_validate.
skf = StratifiedKFold(n_splits=5)

# Scorers handed to cross_validate; each key shows up in the results
# prefixed with 'test_' / 'train_'.
# NOTE(review): the 'rec_micro' key is bound to the *macro*-averaged recall
# scorer. The key is kept as-is because the recorded results use it.
scoring = {
    'acc': 'accuracy',
    'prec_macro': 'precision_macro',
    'rec_micro': 'recall_macro',
    'f1': 'f1_macro',
}

In [29]:
def discretizate(mode, column, bin_count):
    """Map a numeric column to integer bin labels.

    Parameters
    ----------
    mode : str
        'CUT'  -- equal-width bins via ``pd.cut``.
        'QCUT' -- quantile bins via ``pd.qcut``; duplicate edges are dropped,
                  so fewer than ``bin_count`` distinct bins may be produced.
        'LOG'  -- bins with geometrically spaced edges between 1 and the
                  column maximum.
        Any other value returns ``column`` unchanged.
    column : array-like of numbers
        Values to discretize.
    bin_count : int
        Number of bins requested.

    Returns
    -------
    Integer bin labels starting at 0: whatever ``pd.cut`` / ``pd.qcut``
    return for 'CUT' / 'QCUT', a NumPy array for 'LOG', or ``column``
    itself for an unrecognised mode.
    """
    if mode == 'CUT':
        return pd.cut(column, bin_count, labels=False)
    if mode == 'QCUT':
        return pd.qcut(column, bin_count, labels=False, duplicates='drop')
    if mode == 'LOG':
        # bin_count bins need bin_count + 1 edges. Digitizing against the
        # interior edges only yields labels 0 .. bin_count - 1: values below
        # the first interior edge (including anything < 1) land in bin 0 and
        # the column maximum lands in the last bin.
        # NOTE(review): assumes the column maximum is greater than 1.
        edges = np.geomspace(1, np.max(column), num=bin_count + 1)
        return np.digitize(column, edges[1:-1])
    return column

# Discretize every feature column and write the bin labels back into both
# X and data, replacing the raw feature values in place.
# NOTE(review): bin edges are derived from the full dataset before the
# cross-validation split, and re-running this cell bins the already-binned
# values again.
for column in X.columns:
    bins = discretizate(discretisation_mode, X[column], bins_count)
    X[column] = data[column] = bins

# Multinomial naive Bayes with additive smoothing parameter alpha=1.0,
# fitted on the bin labels.
model = MultinomialNB(alpha=1.0)

# Per-fold fit/score times and train/test scores for every configured scorer.
cross_validate(
    model,
    X,
    Y,
    cv=skf,
    scoring=scoring,
    return_train_score=True,
)

{'fit_time': array([0.00100255, 0.00050163, 0.00050163, 0.00100255, 0.00100279]),
 'score_time': array([0.00150394, 0.00150371, 0.00150394, 0.00100303, 0.00100255]),
 'test_acc': array([0.64935065, 0.5974026 , 0.67532468, 0.68627451, 0.67973856]),
 'test_f1': array([0.47933884, 0.44860245, 0.50691598, 0.54621849, 0.54155201]),
 'test_prec_macro': array([0.58098592, 0.47089552, 0.68680556, 0.67753623, 0.65556569]),
 'test_rec_micro': array([0.52555556, 0.48555556, 0.54981481, 0.56933962, 0.56433962]),
 'train_acc': array([0.67752443, 0.69543974, 0.67263844, 0.67317073, 0.66829268]),
 'train_f1': array([0.53491255, 0.58283649, 0.51101285, 0.54101921, 0.51597222]),
 'train_prec_macro': array([0.6578219 , 0.68323952, 0.65689144, 0.6412844 , 0.6375    ]),
 'train_rec_micro': array([0.56128505, 0.59241822, 0.54884346, 0.56267442, 0.54924419])}