## Importing libraries

In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, roc_auc_score
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import itertools

In [9]:
df_diabetes = pd.read_csv('diabetes.csv')
df_diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Unnamed: 9,Unnamed: 10
0,6,148,72,35,0,33.6,0.627,50,1,,
1,1,85,66,29,0,26.6,0.351,31,0,,
2,8,183,64,0,0,23.3,0.672,32,1,,
3,1,89,66,23,94,28.1,0.167,21,0,,
4,0,137,40,35,168,43.1,2.288,33,1,,
...,...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0,,
764,2,122,70,27,0,36.8,0.340,27,0,,
765,5,121,72,23,112,26.2,0.245,30,0,,
766,1,126,60,0,0,30.1,0.349,47,1,,


## Exploratory data analysis

In [10]:
df_diabetes.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
Unnamed: 9                  768
Unnamed: 10                 768
dtype: int64

In [11]:
df_diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
 9   Unnamed: 9                0 non-null      float64
 10  Unnamed: 10               0 non-null      float64
dtypes: float64(4), int64(7)
memory usage: 66.1 KB


In [12]:
df_diabetes.drop(["Unnamed: 9","Unnamed: 10"], inplace=True, axis=1)
df_diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [13]:
X = df_diabetes.drop(["Outcome"],axis=1)   #Feature Matrix
y = df_diabetes["Outcome"]

## Splitting data

In [9]:
# splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [10]:
# splitting the dataset into validation and test set
X_val, X_test, y_val, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [11]:
forest = RandomForestClassifier(random_state = 30)

## Hyperparameter Tuning

In [15]:
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(forest, hyperF, cv = 5, verbose = 1, 
                      n_jobs = -1)
best_fit = gridF.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


In [16]:
best_fit.best_params_

{'max_depth': 5,
 'min_samples_leaf': 10,
 'min_samples_split': 100,
 'n_estimators': 100}

## Bagging RandomForestClassifier

In [17]:
base_model = RandomForestClassifier(max_depth= 5, min_samples_leaf= 1, min_samples_split= 15, n_estimators= 300, random_state = 30)
base_model.fit(X_train, y_train)

In [18]:
predictions = base_model.predict(X_test)
# # Display F1 score for test data
f1_score(y_test,predictions,pos_label=0)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.80      0.94      0.86       253
           1       0.82      0.54      0.65       131

    accuracy                           0.80       384
   macro avg       0.81      0.74      0.76       384
weighted avg       0.80      0.80      0.79       384



In [19]:
predictions = base_model.predict(X_val)
# Display F1 score  validation data
f1_score(y_val,predictions,pos_label=0)
print(classification_report(y_val,predictions))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86       247
           1       0.78      0.69      0.73       137

    accuracy                           0.82       384
   macro avg       0.81      0.79      0.80       384
weighted avg       0.82      0.82      0.82       384



## Model 
## Boosting Gradient

In [20]:
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

In [21]:
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Init Grid Search
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=5, scoring="roc_auc")

# Fit
_ = grid_cv.fit(X_train, y_train)

In [22]:
grid_cv.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 0.05,
 'max_depth': 3,
 'reg_lambda': 1,
 'scale_pos_weight': 3,
 'subsample': 0.8}

In [23]:
xgb_cl = xgb.XGBClassifier(colsample_bytree= 0.5, gamma = 0, learning_rate = 0.05, max_depth = 3, reg_lambda = 1, scale_pos_weight = 3, subsample = 0.8, objective="binary:logistic")
xgb_cl.fit(X_train, y_train)

In [24]:
predictions = xgb_cl.predict(X_test)
# Display F1 score for test data
f1_score(y_test,predictions,pos_label=0)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.92      0.72      0.81       253
           1       0.62      0.89      0.73       131

    accuracy                           0.78       384
   macro avg       0.77      0.80      0.77       384
weighted avg       0.82      0.78      0.78       384



In [25]:
# Display F1 score for validation data
predictions = xgb_cl.predict(X_val)
f1_score(y_val,predictions,pos_label=0)
print(classification_report(y_val,predictions))

              precision    recall  f1-score   support

           0       0.95      0.72      0.82       247
           1       0.65      0.93      0.76       137

    accuracy                           0.79       384
   macro avg       0.80      0.82      0.79       384
weighted avg       0.84      0.79      0.80       384



## Model
## Stacking 

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [14]:
dtc =  DecisionTreeClassifier()
rfc = KNeighborsClassifier()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [16]:
dtc.fit(X_train, y_train)

In [17]:
prediction_test = dtc.predict(X_test)

In [18]:
rfc.fit(X_train, y_train)

In [19]:
prediction_test2 = rfc.predict(X_test)

In [20]:
clf = [dtc,rfc]
for algo in clf:
    score = cross_val_score( algo,X,y,cv = 5,scoring = 'accuracy')
    print("The accuracy score of {} is:".format(algo),score.mean())

The accuracy score of DecisionTreeClassifier() is: 0.720151090739326
The accuracy score of KNeighborsClassifier() is: 0.723979288685171


In [33]:
dtc =  DecisionTreeClassifier()
rfc = RandomForestClassifier()

In [36]:
clf = [('dtc',dtc),('rfc',rfc)] #list of (str, estimator)

In [40]:
lr = LogisticRegression()
stack_model = StackingClassifier( estimators = clf,final_estimator = lr)
score = cross_val_score(stack_model,X,y,cv = 5,scoring = 'accuracy')
print("The accuracy score of is:",score.mean())


The accuracy score of is: 0.7708938120702825
