In [33]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier 
from sklearn import datasets
from sklearn import tree
from matplotlib import pyplot as plt
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, RocCurveDisplay, auc
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [34]:
# Load the diabetes dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv call -- TODO confirm).
# NOTE(review): the file name suggests the data was already balanced and scaled
# upstream; verify against the notebook that produced it.
df = pd.read_csv('../data/balanced_sclaer_dataset_diabetes.csv').drop(columns=['Unnamed: 0'])

# Total number of missing cells across the whole frame (sanity check).
df.isna().sum().sum()

0

In [35]:
# Split the frame into the feature matrix (every column except the label)
# and the target vector (the "Diabetes_012" label column).
X = df.drop(columns=["Diabetes_012"])
y = df.loc[:, "Diabetes_012"]

In [36]:
# Hold out one third of the rows for evaluation.
# The split is seeded so that Restart & Run All reproduces the same partition;
# without a seed every model below is scored on a different random test set and
# the numbers are not comparable between runs.
# NOTE: outputs recorded in this notebook before the seed was added came from an
# unseeded split, so re-running may shift the reported scores slightly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=0
)

In [37]:
# Baseline bagging ensemble: 10 default (unpruned) decision trees, each trained
# on a sample of 80% of the training rows drawn with replacement.
# random_state pins the resampling so the ensemble is reproducible.
bag_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    max_samples=0.8,
    bootstrap=True,
    random_state=0,
    # oob_score=True,  # left disabled, as before
)

In [38]:
# Train the bagging ensemble on the training split.
bag_model.fit(X_train, y_train)

In [39]:
# bag_model.oob_score_

In [40]:
# Mean accuracy of the bagged trees on the held-out test split.
bag_model.score(X_test, y_test)

0.7985012964071003

In [41]:
# Class labels predicted by the bagged trees for the test split;
# the bare name on the last line displays the (truncated) array.
predicted_bagging_classifier = bag_model.predict(X_test)
predicted_bagging_classifier

array([1., 2., 1., ..., 1., 1., 2.])

In [42]:
# Per-class precision / recall / F1 of the bagged trees on the test split.
print(classification_report(y_test, predicted_bagging_classifier))

              precision    recall  f1-score   support

         0.0       0.70      0.73      0.71     11742
         1.0       0.98      0.98      0.98     11670
         2.0       0.72      0.69      0.71     11685

    accuracy                           0.80     35097
   macro avg       0.80      0.80      0.80     35097
weighted avg       0.80      0.80      0.80     35097



In [43]:
# Randomised hyper-parameter search for a random forest, scored by
# cross-validated accuracy on the training split (n_iter and cv are left at the
# RandomizedSearchCV defaults). The best candidate is refit on the whole
# training split, so the returned object can be used directly for prediction.
#
# Both sources of randomness are seeded: `random_state` on the search fixes
# WHICH candidates are sampled, `random_state` on the forest fixes HOW each
# candidate is trained. With only the former, CV scores (and hence
# best_params_) could still change from run to run.
optimal_by_randomForest = RandomizedSearchCV(
    RandomForestClassifier(random_state=71),
    {
        'n_estimators': np.arange(10, 30, 2),
        # "entropy" and "log_loss" both select Shannon information gain in
        # scikit-learn, so they are effectively the same option listed twice.
        'criterion': ["gini", "entropy", "log_loss"],
        # Depths from 1 up to twice the number of features.
        'max_depth': np.arange(1, 2 * X_train.shape[1] + 1),
    },
    scoring='accuracy',
    random_state=71,
).fit(X_train, y_train)

# Winning hyper-parameter combination.
optimal_by_randomForest.best_params_

{'n_estimators': 28, 'max_depth': 24, 'criterion': 'log_loss'}

In [44]:
# Predict the test split with the refit best random forest.
# NOTE(review): despite its name, `pred_train` holds predictions for X_test,
# not for the training data.
pred_train = optimal_by_randomForest.predict(X_test)

In [45]:
# Display the tuned random forest's test-set predictions (truncated array repr).
pred_train

array([1., 2., 1., ..., 1., 1., 2.])

In [46]:
# Test-set accuracy of the tuned random forest.
print(accuracy_score(y_test, pred_train))

0.8170213978402713


In [47]:
# Per-class precision / recall / F1 of the tuned random forest on the test split.
print(classification_report(y_test,  pred_train))

              precision    recall  f1-score   support

         0.0       0.75      0.70      0.72     11742
         1.0       0.98      0.98      0.98     11670
         2.0       0.72      0.77      0.75     11685

    accuracy                           0.82     35097
   macro avg       0.82      0.82      0.82     35097
weighted avg       0.82      0.82      0.82     35097



In [None]:
# Best params found for the random forest: {'n_estimators': 28, 'max_depth': 24, 'criterion': 'log_loss'}
# Best params found for the decision tree: {'criterion': 'log_loss', 'max_depth': 14}

In [69]:
# Randomised search over bagging ensembles: which base estimator to bag (a
# random forest or a single decision tree, each with fixed hyper-parameters)
# and how many copies of it (1-9). Only 4 candidates are sampled (n_iter=4),
# evaluated in parallel on 4 workers, and ranked by cross-validated accuracy.
# NOTE(review): the fixed hyper-parameters presumably come from earlier tuning
# runs; the decision-tree values are not produced in the visible cells -- confirm.
#
# Both the candidate sampling (search random_state) and the bootstrap
# resampling (BaggingClassifier random_state) are seeded; unseeded, the 4
# sampled candidates and the resulting best_params_ differ on every run.
optimal_bagging = RandomizedSearchCV(
    BaggingClassifier(random_state=0),
    {
        'estimator': [
            RandomForestClassifier(n_estimators=28, max_depth=24, criterion='log_loss'),
            DecisionTreeClassifier(max_depth=14, criterion='log_loss'),
        ],
        'n_estimators': np.arange(1, 10, 1),
    },
    n_iter=4,
    n_jobs=4,
    scoring='accuracy',
    random_state=71,
).fit(X_train, y_train)

# Winning base estimator and ensemble size.
optimal_bagging.best_params_

{'n_estimators': 7,
 'estimator': RandomForestClassifier(criterion='log_loss', max_depth=24, n_estimators=28)}

In [71]:
# Predict the test split with the best bagging configuration found by the search.
# NOTE(review): as with `pred_train`, the name is misleading -- these are
# test-set predictions.
pred_train2 = optimal_bagging.predict(X_test)

In [72]:
# Display the tuned bagging ensemble's test-set predictions (truncated array repr).
pred_train2

array([1., 2., 1., ..., 1., 1., 2.])

In [73]:
# Per-class precision / recall / F1 of the tuned bagging ensemble on the test split.
print(classification_report(y_test, pred_train2))

              precision    recall  f1-score   support

         0.0       0.76      0.70      0.73     11742
         1.0       0.98      0.98      0.98     11670
         2.0       0.72      0.79      0.75     11685

    accuracy                           0.82     35097
   macro avg       0.82      0.82      0.82     35097
weighted avg       0.82      0.82      0.82     35097



In [None]:
# from sklearn import model_selection
# from sklearn.model_selection import cross_val_score

In [None]:
# results = model_selection.cross_val_score(KNeighborsClassifier(), df_fit, pred, cv = 3)