In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier 
from sklearn import datasets
from sklearn import tree
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, RocCurveDisplay, auc
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('../data/balanced_sclaer_dataset_diabetes.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.isnull().sum().sum()

0

In [3]:
y = df["Diabetes_012"]
X = df.drop(["Diabetes_012"], axis=1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

In [5]:
#KNN
knn_optimal = GridSearchCV(KNeighborsClassifier(), 
                                             {'n_neighbors': np.arange(3, 5, 1),
                                              'p': np.arange(1, 2, 1),
                                              "metric" : ["euclidean", "minkowski"]
                                              },
                                              ).fit(X_train[0:10000], y_train[0:10000])
knn_optimal.best_params_

{'metric': 'minkowski', 'n_neighbors': 3, 'p': 1}

In [6]:
#NB
gnb = GaussianNB()
GNB = gnb.fit(X_train[0:10000], y_train[0:10000])

In [7]:
#LB
log_reg_grid = {"C" : np.logspace(-4, 4, 20),
               "solver":["liblinear"]}
random_logr = RandomizedSearchCV(LogisticRegression(),
                               param_distributions=log_reg_grid,
                               cv=5,
                               n_iter=10,
                               random_state=0).fit(X_train[0:10000], y_train[0:10000])

In [8]:
random_logr.best_params_

{'solver': 'liblinear', 'C': 1.623776739188721}

In [9]:
estimators3 = [('model1', knn_optimal.best_estimator_), ('model2', GaussianNB()), ('model3', random_logr.best_estimator_)]

In [10]:
stacking_classifier_optimal = GridSearchCV(StackingClassifier(estimators3, estimators3[2][1]),
                                                          {
                                                              'n_jobs' : np.arange(2, 6, 1),
                                                          }
                                                          ).fit(X_train[0:10000], y_train[0:10000])
stacking_classifier_optimal.best_params_

{'n_jobs': 3}

In [11]:
pred_train = stacking_classifier_optimal.predict(X_test[0:30000])
pred_train

array([0., 0., 2., ..., 1., 0., 1.])

In [12]:
print(classification_report(y_test[0:30000],  pred_train))

              precision    recall  f1-score   support

         0.0       0.72      0.66      0.69     10129
         1.0       0.86      0.90      0.88     10040
         2.0       0.68      0.70      0.69      9831

    accuracy                           0.76     30000
   macro avg       0.75      0.76      0.75     30000
weighted avg       0.75      0.76      0.76     30000

