In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
def check_zeros(df,feature):
    print((df[feature]==0).sum())
    

In [4]:
check_zeros(df,'SkinThickness')

227


In [5]:
def replace_zeros(df,feature):
    df[feature]=np.where(df[feature]==0,df[feature].median(),df[feature])

In [6]:
replace_zeros(df,'Glucose')
replace_zeros(df,'Insulin')
replace_zeros(df,"SkinThickness")

In [7]:
check_zeros(df,'SkinThickness')

0


In [8]:
x=df.drop('Outcome',axis=1)
y=df['Outcome']

In [9]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=0)

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf_classif=RandomForestClassifier(n_estimators=10,random_state=10)
rf_classif.fit(x_train,y_train)
prediction=rf_classif.predict(x_test)

In [12]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[91 16]
 [16 31]]
0.7922077922077922
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       107
           1       0.66      0.66      0.66        47

    accuracy                           0.79       154
   macro avg       0.76      0.76      0.76       154
weighted avg       0.79      0.79      0.79       154



The important parameters Randomforest classifier:
1. n_estimators = no.of trees in a ensemble.
2. min_samples_split = min no.of samples necessary in a node to cause node splitting.
3. min_samples_leaf = min no.of samples which can be stored in tree leaf
4. creterion= used to evaluate the quality of a split.
5. max_depth= max no.of levels allowed in each tree.
6. max_features = max no.of features considered when a spliting a node.

###### manual hyper parameter tuning

In [13]:
from sklearn.ensemble import RandomForestClassifier
rf_classif=RandomForestClassifier(n_estimators=350,criterion="entropy",max_features="sqrt",min_samples_leaf=10,random_state=100)
rf_classif.fit(x_train,y_train)
prediction=rf_classif.predict(x_test)

In [14]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[97 10]
 [18 29]]
0.8181818181818182
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       107
           1       0.74      0.62      0.67        47

    accuracy                           0.82       154
   macro avg       0.79      0.76      0.77       154
weighted avg       0.81      0.82      0.81       154



###### Randomized searchCV

In [16]:
max_depth=[int(x)for x in np.linspace(start=10,stop=1500,num=10)]
max_depth

[10, 175, 341, 506, 672, 837, 1003, 1168, 1334, 1500]

In [17]:
random_grid={"n_estimators":[100,200,200,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500],
             'min_samples_split':[1,2,3,4,5,6],
             'min_samples_leaf':[2,3,5,6,7],
             'max_features':['auto','sqrt','log2'],
             'max_depth':max_depth,
             'criterion':["entropy",'gini']
}
random_grid

{'n_estimators': [100,
  200,
  200,
  400,
  500,
  600,
  700,
  800,
  900,
  1000,
  1100,
  1200,
  1300,
  1400,
  1500],
 'min_samples_split': [1, 2, 3, 4, 5, 6],
 'min_samples_leaf': [2, 3, 5, 6, 7],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 175, 341, 506, 672, 837, 1003, 1168, 1334, 1500],
 'criterion': ['entropy', 'gini']}

In [12]:
from sklearn.model_selection import RandomizedSearchCV

In [19]:
rf=RandomForestClassifier()
rf_searchcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=200,
                              cv=3,verbose=2,random_state=34,n_jobs=-1)
rf_searchcv.fit(x_train,y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 10.3min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 175, 341, 506, 672,
                                                      837, 1003, 1168, 1334,
                                                      1500],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 5, 6, 7],
                                        'min_samples_split': [1, 2, 3, 4, 5, 6],
                                        'n_estimators': [100, 200, 200, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100, 1200,
                                                         1300, 1400, 1500]},

In [20]:
rf_searchcv.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 3,
 'max_features': 'sqrt',
 'max_depth': 1334,
 'criterion': 'gini'}

In [21]:
rf_searchcv.best_estimator_

RandomForestClassifier(max_depth=1334, max_features='sqrt', min_samples_leaf=3)

In [22]:
best_randomized=rf_searchcv.best_estimator_
y_prediction=best_randomized.predict(x_test)
print(confusion_matrix(y_test,y_prediction))
print(classification_report(y_test,y_prediction))
print(accuracy_score(y_test,y_prediction))

[[95 12]
 [19 28]]
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       107
           1       0.70      0.60      0.64        47

    accuracy                           0.80       154
   macro avg       0.77      0.74      0.75       154
weighted avg       0.79      0.80      0.79       154

0.7987012987012987


##### Grid Searchcv

In [76]:
rf_searchcv.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'criterion': 'gini'}

In [104]:
# creating grid
grid_={'n_estimators':[rf_searchcv.best_params_['n_estimators'],
                       rf_searchcv.best_params_['n_estimators']+200,
                       rf_searchcv.best_params_['n_estimators']+300,
                       rf_searchcv.best_params_['n_estimators']+400],
       
 'min_samples_split': [rf_searchcv.best_params_['min_samples_split'],
                       rf_searchcv.best_params_['min_samples_split']+2,
                       rf_searchcv.best_params_['min_samples_split']+4],
 'min_samples_leaf':[rf_searchcv.best_params_['min_samples_leaf'],
                       rf_searchcv.best_params_['min_samples_leaf']+2,
                       rf_searchcv.best_params_['min_samples_leaf']+4],
       
 'max_features':[rf_searchcv.best_params_['max_features']],
 'max_depth':[rf_searchcv.best_params_['max_depth']] ,
 'criterion': [rf_searchcv.best_params_['criterion']]}
grid_

{'n_estimators': [100, 300, 400, 500],
 'min_samples_split': [2, 4, 6],
 'min_samples_leaf': [2, 4, 6],
 'max_features': ['sqrt'],
 'max_depth': [10],
 'criterion': ['gini']}

In [105]:
from sklearn.model_selection import GridSearchCV

In [106]:
rf=RandomForestClassifier()
grid_cv=GridSearchCV(estimator=rf,param_grid=grid_,cv=5,n_jobs=-1,verbose=2)
grid_cv.fit(x_train,y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.3min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [10],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [2, 4, 6],
                         'min_samples_split': [2, 4, 6],
                         'n_estimators': [100, 300, 400, 500]},
             verbose=2)

In [97]:
grid_cv.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 4,
 'n_estimators': 400}

In [98]:
grid_cv.best_estimator_

RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=4, n_estimators=400)

In [100]:
best_gridsearchcv=grid_cv.best_estimator_
y_prediction=best_gridsearchcv.predict(x_test)
print(confusion_matrix(y_test,y_prediction))
print(classification_report(y_test,y_prediction))
print(accuracy_score(y_test,y_prediction))

[[96 11]
 [18 29]]
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       107
           1       0.72      0.62      0.67        47

    accuracy                           0.81       154
   macro avg       0.78      0.76      0.77       154
weighted avg       0.81      0.81      0.81       154

0.8116883116883117


## Automated Hyper Parameter tuning :
It can be done using:
1. Bayesian Optimization
2. Gradient Descent
3. Evolutionary Algorithms

1. Bayesian Optimization:
it is implemented using  3 parameters in fmin--objective function, domain space, optimization algorithm

In [48]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [49]:
domain_space={"criterion":hp.choice('criterion',['entropy','gini']),
              "max_depth":hp.quniform("max_depth",10,1200,10),
              "min_samples_leaf":hp.uniform("min_samples_leaf",0,0.5),
              "min_samples_split":hp.uniform("min_samples_split",0,1),
              "n_estimators":hp.choice("n_estimators",[50,100,150,200,250,300,350,400,450,500]),
              "max_features":hp.choice("max_features",['auto','sqrt','log2'])
    
}

In [54]:
domain_space['criterion']

<hyperopt.pyll.base.Apply at 0x203f3cc74c0>

In [51]:
from sklearn.model_selection import cross_val_score

In [60]:
def objective_function(space):
    model=RandomForestClassifier(n_estimators=space['n_estimators'],
                                 criterion=space['criterion'],
                                 max_depth=space['max_depth'],
                                max_features=space['max_features'],
                                min_samples_leaf=space['min_samples_leaf'],
                                min_samples_split=space['min_samples_split'],
                                )
    accuracy=cross_val_score(model,x_train,y_train,cv=4).mean()
    return {"loss":-accuracy,'status':STATUS_OK}

In [61]:
from sklearn.model_selection import cross_val_score
trials=Trials()
best=fmin(fn=objective_function,space=domain_space,
         algo=tpe.suggest,max_evals=80,
         trials=trials)
best

100%|███████████████████████████████████████████████| 80/80 [02:53<00:00,  2.17s/trial, best loss: -0.7638358373652492]


{'criterion': 1,
 'max_depth': 600.0,
 'max_features': 2,
 'min_samples_leaf': 0.00042630911961526186,
 'min_samples_split': 0.13477064060209237,
 'n_estimators': 3}

In [70]:
en_criterion={0:"entropy",1:'gini'}
en_esti={0:50,1:100,2:150,3:200,4:250,5:300,6:350,7:400,8:450,9:500}
en_mxfeat ={0: 'auto',1:'sqrt',2:'log2'}

print(en_criterion[best['criterion']])
print(en_esti[best['n_estimators']])
print(en_mxfeat[best['max_features']])

gini
200
log2


In [73]:
trained_model=RandomForestClassifier(criterion=en_criterion[best['criterion']],
                                     max_depth=best["max_depth"], max_features=en_mxfeat[best['max_features']],
                                     min_samples_leaf=best["min_samples_leaf"],
                                     min_samples_split=best["min_samples_split"],
                                     n_estimators=en_esti[best["n_estimators"]]).fit(x_train,y_train)
prediction_y=trained_model.predict(x_test)
print(confusion_matrix(y_test,y_prediction))
print(classification_report(y_test,y_prediction))
print(accuracy_score(y_test,y_prediction))

[[95 12]
 [14 33]]
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       107
           1       0.73      0.70      0.72        47

    accuracy                           0.83       154
   macro avg       0.80      0.79      0.80       154
weighted avg       0.83      0.83      0.83       154

0.8311688311688312


##### Generic Algorithms

In [14]:
parameters={"n_estimators":[100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500],
             'min_samples_split':[1,2,3,4,5,6],
             'min_samples_leaf':[2,3,5,6,7],
             'max_features':['auto','sqrt','log2'],
             'max_depth':[int(x) for x in np.linspace(start=10,stop=1000,num=10)],
             'criterion':["entropy",'gini']
}
parameters

{'n_estimators': [100,
  200,
  300,
  400,
  500,
  600,
  700,
  800,
  900,
  1000,
  1100,
  1200,
  1300,
  1400,
  1500],
 'min_samples_split': [1, 2, 3, 4, 5, 6],
 'min_samples_leaf': [2, 3, 5, 6, 7],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000],
 'criterion': ['entropy', 'gini']}

In [15]:
from tpot import TPOTClassifier
tpot_classif=TPOTClassifier(generations=4,population_size=20,offspring_size=10,
                           verbosity=2,early_stop=12,
                           config_dict={'sklearn.ensemble.RandomForestClassifier':parameters},
                           cv=4, scoring='accuracy')
tpot_classif.fit(x_train,y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=60.0, style=ProgressStyle(des…


Generation 1 - Current best internal CV score: 0.760578473813768
Generation 2 - Current best internal CV score: 0.760578473813768
Generation 3 - Current best internal CV score: 0.760578473813768
Generation 4 - Current best internal CV score: 0.760578473813768
Best pipeline: RandomForestClassifier(RandomForestClassifier(input_matrix, criterion=gini, max_depth=120, max_features=log2, min_samples_leaf=7, min_samples_split=3, n_estimators=1500), criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=6, min_samples_split=6, n_estimators=1400)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',
                                                                                      'gini'],
                                                                        'max_depth': [10,
                                                                                      120,
                                                                                      230,
                                                                                      340,
                                                                                      450,
                                                                                      560,
                                                                                      670,
                                                                                      780,
                                                                                 

In [16]:
accuracy=tpot_classif.score(x_test,y_test)
accuracy

0.8506493506493507

#### Optimize hyperparameters of the model using Optuna

In [19]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,x_train,y_train, n_jobs=-1, cv=3).mean()

In [22]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2020-09-17 19:46:27,500] A new study created in memory with name: no-name-bc974162-f357-4139-8430-357edae323d4
[I 2020-09-17 19:46:27,571] Trial 0 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 3.904746679337756}. Best is trial 0 with value: 0.640068547744301.
[I 2020-09-17 19:46:33,566] Trial 1 finished with value: 0.7508050374621393 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1350, 'max_depth': 30.103749827551546}. Best is trial 1 with value: 0.7508050374621393.
[I 2020-09-17 19:46:35,934] Trial 2 finished with value: 0.7459110473457676 and parameters: {'classifier': 'RandomForest', 'n_estimators': 480, 'max_depth': 33.8992748172616}. Best is trial 1 with value: 0.7508050374621393.
[I 2020-09-17 19:46:36,003] Trial 3 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 16774.039120930855}. Best is trial 1 with value: 0.7508050374621393.
[I 2020-09-17 19:46:36,075] Trial 4 finished with value

Accuracy: 0.7573170731707317
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 1370, 'max_depth': 20.024156817566237}


In [24]:
trial

FrozenTrial(number=64, value=0.7573170731707317, datetime_start=datetime.datetime(2020, 9, 17, 19, 51, 38, 906438), datetime_complete=datetime.datetime(2020, 9, 17, 19, 51, 45, 429557), params={'classifier': 'RandomForest', 'n_estimators': 1370, 'max_depth': 20.024156817566237}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'n_estimators': IntUniformDistribution(high=2000, low=200, step=10), 'max_depth': LogUniformDistribution(high=100, low=10)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=64, state=TrialState.COMPLETE)

In [25]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 1370,
 'max_depth': 20.024156817566237}

In [28]:
from sklearn.ensemble import RandomForestClassifier 
rf=RandomForestClassifier(n_estimators=1370,max_depth=20)
rf.fit(x_train,y_train)

RandomForestClassifier(max_depth=20, n_estimators=1370)

In [30]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

y_pred=rf.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[95 12]
 [15 32]]
0.8246753246753247
              precision    recall  f1-score   support

           0       0.86      0.89      0.88       107
           1       0.73      0.68      0.70        47

    accuracy                           0.82       154
   macro avg       0.80      0.78      0.79       154
weighted avg       0.82      0.82      0.82       154

