Here we performed Model selection and Hyper parameter tuning on **Diabetes after feature Engineering** dataset.

In [1]:

import pandas as pd
data = pd.read_csv('datasets/Diabetes After Feature Engineering.csv')
data.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2.330737,6.205037,36.329902,27.149304,20.023718,5.605497,0.310087,0.870964,1
1,0.738299,5.390749,33.865339,22.85438,11.929581,5.087564,0.225562,0.863878,0
2,2.695814,6.527653,33.033688,22.85438,20.067285,4.808004,0.320037,0.86447,1
3,0.738299,5.456704,33.865339,18.475901,13.590624,5.206304,0.132705,0.854735,0
4,0.0,6.089163,22.561724,27.149304,17.924559,6.193787,0.394806,0.865025,1


In [2]:
X = data.drop('Outcome', axis= 1)
Y = data['Outcome']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape

((614, 8), (154, 8))

In [3]:
X_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
603,2.522655,6.225257,38.75235,22.85438,15.645104,5.712516,0.324205,0.871799
118,1.867137,5.58092,31.354045,18.475901,12.866988,5.214054,0.259198,0.856033
247,0.0,6.369532,43.485684,25.725884,24.556112,6.575816,0.253814,0.857217
157,0.738299,5.750825,29.651155,16.994122,16.166911,4.97224,0.349805,0.857217
468,2.695814,5.892249,37.546134,24.294427,17.046939,5.35022,0.142543,0.867347


## Model Selection with Cross Validation

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import xgboost


In [5]:
RFC_Model = RandomForestClassifier(random_state=0, n_jobs= -1)
GBC_Model = GradientBoostingClassifier(random_state=0)
ABC_Model = AdaBoostClassifier(random_state=0, n_estimators=100)
DTC_Model = DecisionTreeClassifier(random_state=0)
LoR_Model = LogisticRegression(random_state=0, n_jobs=-1)
KNN_Model = KNeighborsClassifier(n_jobs=-1, n_neighbors = 5)
GNB_Model = GaussianNB()
SVM_Model = svm.SVC(random_state=0)
XGB_Model = xgboost.XGBClassifier(random_state=0, n_jobs=-1)

In [6]:
Models = (RFC_Model, GBC_Model, ABC_Model, DTC_Model, LoR_Model, KNN_Model, GNB_Model, SVM_Model, XGB_Model)

In [7]:
Selected_Models = []
scores =[]
for model in Models:
    print("For Model {}".format(model))
    score = cross_val_score(model, X, Y, cv=6, scoring = 'recall').mean()
    print("Recall Score = {}".format(score))
    if(score > 0.6):
        Selected_Models.append(model)
        scores.append(score)
    print()

For Model RandomForestClassifier(n_jobs=-1, random_state=0)
Recall Score = 0.6007575757575757

For Model GradientBoostingClassifier(random_state=0)
Recall Score = 0.6307239057239057

For Model AdaBoostClassifier(n_estimators=100, random_state=0)
Recall Score = 0.5974747474747474

For Model DecisionTreeClassifier(random_state=0)
Recall Score = 0.5936026936026936

For Model LogisticRegression(n_jobs=-1, random_state=0)
Recall Score = 0.5672558922558922

For Model KNeighborsClassifier(n_jobs=-1)
Recall Score = 0.49284511784511786

For Model GaussianNB()
Recall Score = 0.7053872053872055

For Model SVC(random_state=0)
Recall Score = 0.41397306397306394

For Model XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, mi

In [8]:
#these are selected models to imporove score by hyper parameter tuning
for model, score in zip(Selected_Models, scores):
    print(f"{score} =========> {str(model)}\n")




              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              random_state=0, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)



``Gausian Naive Bayes`` performs ``best`` on our data

## Hyper parameter optimisation

**a) for Naive Bayes**

**Grid Search Cv**

In [9]:
import numpy as np
from sklearn.model_selection import GridSearchCV


In [10]:
smoothing = [1e-06,1e-07, 1e-08, 1e-09, 1e-10]
random_grid = {'var_smoothing' : smoothing}
grid_search = GridSearchCV(estimator= GNB_Model, param_grid= random_grid, cv =2 , n_jobs = -1, scoring='recall' ,verbose=2)
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.7s finished


GridSearchCV(cv=2, estimator=GaussianNB(), n_jobs=-1,
             param_grid={'var_smoothing': [1e-06, 1e-07, 1e-08, 1e-09, 1e-10]},
             scoring='recall', verbose=2)

In [11]:
grid_search.best_estimator_

GaussianNB(var_smoothing=1e-08)

In [12]:
best_grid=grid_search.best_estimator_

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score

In [15]:
y_pred=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Recall Score {}\n".format(recall_score(y_test,y_pred)))
print("Classification report:\n {}".format(classification_report(y_test,y_pred)))

[[86 21]
 [11 36]]
Recall Score 0.7659574468085106

Classification report:
               precision    recall  f1-score   support

           0       0.89      0.80      0.84       107
           1       0.63      0.77      0.69        47

    accuracy                           0.79       154
   macro avg       0.76      0.78      0.77       154
weighted avg       0.81      0.79      0.80       154



**b) for XGboost**

First find the bset n_estimators for an xg_boost

In [16]:
n_recalls = dict()
for i in range(2, 30):
    xgb = xgboost.XGBClassifier(n_estimators = i, n_jobs = -1, random_state = 0)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    score = recall_score(y_test, y_pred)
    n_recalls.update({i:score})
    

In [17]:
#n_recalls

uncommon out to see the results.

In [18]:
for i in range(2, 30):
    if(n_recalls[i] == max(n_recalls.values())):
        print("For n_estimators = {}, Recall Score = {}". format(i, n_recalls[i]))

For n_estimators = 17, Recall Score = 0.723404255319149


so we have to use ``n_estiamtors = 15-18`` for better performance of our model.

In [19]:
MaxDepth_recalls = dict()
for i in range(2, 30):
    xgb = xgboost.XGBClassifier(n_estimators = 17,max_depth = i, n_jobs = -1, random_state = 0)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    score = recall_score(y_test, y_pred)
    MaxDepth_recalls.update({i:score})
    

In [20]:
#MaxDepth_recalls

In [21]:
MaxDepth_recalls_1 = dict()
for i in range(2, 30):
    xgb = xgboost.XGBClassifier(n_estimators = 16,max_depth = i, n_jobs = -1, random_state = 0)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    score = recall_score(y_test, y_pred)
    MaxDepth_recalls_1.update({i:score})
    

In [22]:
#MaxDepth_recalls_1

From the above it is obvious that ``max_Depth`` is constant for ``[3, 5, 6, 7, 8]``, also performs better on our model

In [23]:
reg_lambda_1 = dict()
for i in np.linspace(0, 1):
    xgb = xgboost.XGBClassifier(n_estimators = 16, max_depth = 3, reg_lambda=i, n_jobs = -1, random_state = 0)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    score = recall_score(y_test, y_pred)
    reg_lambda_1.update({i:score})

In [24]:
#reg_lambda_1

In [25]:
for i in np.linspace(0,1):
    if(reg_lambda_1[i] == max(reg_lambda_1.values())):
        print("For reg_lambda = {}, Recall Score = {}". format(i, reg_lambda_1[i]))

For reg_lambda = 0.5714285714285714, Recall Score = 0.7659574468085106


so our parameters are ``reg_lambda=0.5789473684210527``, ``max_depth = 3``, ``n_estimators= 16``

In [26]:
xgb = xgboost.XGBClassifier(n_estimators = 16,max_depth = 3, reg_lambda=0.5789473684210527, n_jobs = -1, random_state = 0)
#recall score = 0.76
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=16, n_jobs=-1, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=0.5789473684210527, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [27]:
y_pred = xgb.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Recall Score {}".format(recall_score(y_test,y_pred)))
print("Classification report: \n{}".format(classification_report(y_test,y_pred)))

[[94 13]
 [11 36]]
Recall Score 0.7659574468085106
Classification report: 
              precision    recall  f1-score   support

           0       0.90      0.88      0.89       107
           1       0.73      0.77      0.75        47

    accuracy                           0.84       154
   macro avg       0.81      0.82      0.82       154
weighted avg       0.85      0.84      0.85       154



**c) for GradientBoostingClassifier**


In [28]:
n_estimators = dict()
for i in range(30, 100):
    GBC = GradientBoostingClassifier(n_estimators =i, random_state = 0)
    GBC.fit(X_train, y_train)
    y_pred = GBC.predict(X_test)
    score = recall_score(y_test, y_pred)
    n_estimators.update({i:score})

In [29]:
#n_estimators

selected **n_estimators = [ 52, 60]**

In [30]:
Max_depth_1 = dict()
for i in range(1, 20):
    GBC = GradientBoostingClassifier(n_estimators =52, max_depth=i, random_state = 0)
    GBC.fit(X_train, y_train)
    y_pred = GBC.predict(X_test)
    score = recall_score(y_test, y_pred)
    Max_depth_1.update({i:score})

In [31]:
#Max_depth_1       # 3 is selected

In [32]:
Max_depth_2 = dict()
for i in range(1, 20):
    GBC = GradientBoostingClassifier(n_estimators =60, max_depth=i, random_state = 0)
    GBC.fit(X_train, y_train)
    y_pred = GBC.predict(X_test)
    score = recall_score(y_test, y_pred)
    Max_depth_2.update({i:score})                                               

In [33]:
#Max_depth_2 

Hence ``n_estimators =[60, 52]`` and ``max_depth=3`` is selected

In [34]:
learning_rate_1 = dict()
for i in np.linspace(0.1, 1):
    GBC = GradientBoostingClassifier(n_estimators =52,learning_rate=i, max_depth=3, random_state = 0)
    GBC.fit(X_train, y_train)
    y_pred = GBC.predict(X_test)
    score = recall_score(y_test, y_pred)
    learning_rate_1.update({i:score})                                                

In [35]:
#learning_rate_1

In [36]:
learning_rate_2 = dict()
for i in np.linspace(0.1, 1):
    GBC = GradientBoostingClassifier(n_estimators =60, learning_rate=i, max_depth=3, random_state = 0)
    GBC.fit(X_train, y_train)
    y_pred = GBC.predict(X_test)
    score = recall_score(y_test, y_pred)
    learning_rate_2.update({i:score})                                               

In [37]:
#learning_rate_2

``learning_rate=[0.5224489795918368, 0.5040816326530613]``, ``n_estimators= 60``, ``max_depth=3`` is selected, Hence got an recall score of **0.78 and 0.74**

In [38]:
GBC = GradientBoostingClassifier(n_estimators =60, learning_rate= 0.5224489795918368, max_depth=3, random_state = 0)
GBC.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.5224489795918368, n_estimators=60,
                           random_state=0)

In [39]:
y_pred = GBC.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Recall Score: {}".format(recall_score(y_test,y_pred)))
print("Classification report: \n{}".format(classification_report(y_test,y_pred)))

[[89 18]
 [10 37]]
Recall Score: 0.7872340425531915
Classification report: 
              precision    recall  f1-score   support

           0       0.90      0.83      0.86       107
           1       0.67      0.79      0.73        47

    accuracy                           0.82       154
   macro avg       0.79      0.81      0.79       154
weighted avg       0.83      0.82      0.82       154



**d) for Random Forest Classifier**

In [40]:
n_estimators_1 = dict()
for i in range(1, 30):
    RFC = RandomForestClassifier(n_estimators =i,n_jobs=-1, random_state = 0)
    RFC.fit(X_train, y_train)
    y_pred = RFC.predict(X_test)
    score = recall_score(y_test, y_pred)
    n_estimators_1.update({i:score})
                                                

In [41]:
max(n_estimators_1.values())

0.723404255319149

In [42]:
#n_estimators_1

For Better Results ``n_estimators = [3, 5, 7]``

In [43]:
n_max_depth = dict()
for i in range(1, 30):
    RFC = RandomForestClassifier(n_estimators =3, max_depth= i,n_jobs=-1, random_state = 0)
    RFC.fit(X_train, y_train)
    y_pred = RFC.predict(X_test)
    score = recall_score(y_test, y_pred)
    n_max_depth.update({i:score})
    

In [44]:
max(n_max_depth.values())

0.723404255319149

In [45]:
#n_max_depth

In [46]:
n_max_depth_1 = dict()
for i in range(1, 30):
    RFC = RandomForestClassifier(n_estimators =7, max_depth= i,n_jobs=-1, random_state = 0)
    RFC.fit(X_train, y_train)
    y_pred = RFC.predict(X_test)
    score = recall_score(y_test, y_pred)
    n_max_depth_1.update({i:score})
    

In [47]:
max(n_max_depth_1.values())

0.7446808510638298

In [48]:
#n_max_depth_1

In [49]:
for i in range(1, 30):
    if(n_max_depth_1[i] == max(n_max_depth_1.values())):
        print("For n_estimators = 7, max_depth = {}, Recall Score = {}". format(i, n_max_depth_1[i]))

For n_estimators = 7, max_depth = 11, Recall Score = 0.7446808510638298


From this we conclude that ``n_estimators =[3, 7]`` and ``max_depth = [11, 5, 16]``

In [50]:
n_sample_split = dict()
for i in range(2, 20):
    RFC = RandomForestClassifier(n_estimators =7, min_samples_split= i, max_depth= 11,n_jobs=-1, random_state = 0)
    RFC.fit(X_train, y_train)
    y_pred = RFC.predict(X_test)
    score = recall_score(y_test, y_pred)
    n_sample_split.update({i:score})
    

In [51]:
#n_sample_split

In [52]:
n_impurity_dec = dict()
for i in [0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]:
    RFC = RandomForestClassifier(n_estimators =7, min_samples_split= 9, min_impurity_decrease= i,
                                 max_depth= 11,n_jobs=-1, random_state = 0)
    RFC.fit(X_train, y_train)
    y_pred = RFC.predict(X_test)
    score = recall_score(y_test, y_pred)
    n_impurity_dec.update({i:score})
    

In [53]:
#n_impurity_dec

so ``min_sample_split = [2, 7, 9]``, ``n_estimators= 7``, ``max_depth=11``

**Grid Search CV**

In [54]:
max_depth = [11, 5, 16]
n_estimators =[3, 7]
min_sample_split = [2, 7, 9]
min_sample_leaf = [2, 1, 7, 9]
max_features = ['auto', 'sqrt', 'log2']
min_impurity_decrease = [1e-5]
class_weight = ['balanced', 'balanced_subsample']

from sklearn.model_selection import GridSearchCV
param_gird = {'max_depth': max_depth,
              'n_estimators': n_estimators,
              'min_samples_split' : min_sample_split,
              'min_samples_leaf' : min_sample_leaf,
              'max_features' : max_features,
              'min_impurity_decrease' : min_impurity_decrease,
              'class_weight': class_weight}

In [55]:
grid_search = GridSearchCV(estimator= RFC_Model, param_grid= param_gird, scoring= 'recall', n_jobs= -1, cv= 2, verbose= 2)
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 432 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:    4.3s finished


GridSearchCV(cv=2, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
             n_jobs=-1,
             param_grid={'class_weight': ['balanced', 'balanced_subsample'],
                         'max_depth': [11, 5, 16],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_impurity_decrease': [1e-05],
                         'min_samples_leaf': [2, 1, 7, 9],
                         'min_samples_split': [2, 7, 9],
                         'n_estimators': [3, 7]},
             scoring='recall', verbose=2)

In [56]:
grid_search.best_params_

{'class_weight': 'balanced_subsample',
 'max_depth': 5,
 'max_features': 'log2',
 'min_impurity_decrease': 1e-05,
 'min_samples_leaf': 9,
 'min_samples_split': 2,
 'n_estimators': 7}

In [57]:
best_grid = grid_search.best_estimator_

y_pred=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Recall Score {}".format(recall_score(y_test,y_pred)))
print("Classification report: \n{}".format(classification_report(y_test,y_pred)))

[[89 18]
 [ 9 38]]
Recall Score 0.8085106382978723
Classification report: 
              precision    recall  f1-score   support

           0       0.91      0.83      0.87       107
           1       0.68      0.81      0.74        47

    accuracy                           0.82       154
   macro avg       0.79      0.82      0.80       154
weighted avg       0.84      0.82      0.83       154



In [58]:
best_grid

RandomForestClassifier(class_weight='balanced_subsample', max_depth=5,
                       max_features='log2', min_impurity_decrease=1e-05,
                       min_samples_leaf=9, n_estimators=7, n_jobs=-1,
                       random_state=0)

Hence As of now ``Random Forest`` Classifer gives the **Highest Recall score**.