In [25]:
import pandas as pd
import numpy as np
import seaborn as sbn
import matplotlib.pyplot as plt

In [26]:
df = pd.read_csv(r'D:\Datasets\All-Hyperparamter-Optimization-master\diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [27]:
df['Glucose']=np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin']=np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness']=np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [28]:
x = df.drop('Outcome',axis=1)
y = df['Outcome']
print(x.shape,y.shape)

(768, 8) (768,)


In [29]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=33)

In [30]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10)
rfc.fit(x_train,y_train)

RandomForestClassifier(n_estimators=10)

In [31]:
y_pred = rfc.predict(x_test)
from sklearn import metrics
print(metrics.confusion_matrix(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))
print(metrics.classification_report(y_test,y_pred))

[[85 14]
 [30 25]]
0.7142857142857143
              precision    recall  f1-score   support

           0       0.74      0.86      0.79        99
           1       0.64      0.45      0.53        55

    accuracy                           0.71       154
   macro avg       0.69      0.66      0.66       154
weighted avg       0.70      0.71      0.70       154



#### Bayesian Optimization

In [32]:
from hyperopt import hp,fmin,Trials,STATUS_OK,tpe

In [38]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [39]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x246506667c0>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x24650666400>,
 'max_features': <hyperopt.pyll.base.Apply at 0x2465067f040>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x2465067fd00>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x2465067f670>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x2465067f0a0>}

In [40]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, x_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [41]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best


100%|███████████████████████████████████████████████| 80/80 [23:28<00:00, 17.61s/trial, best loss: -0.7720378515260563]


{'criterion': 1,
 'max_depth': 140.0,
 'max_features': 1,
 'min_samples_leaf': 0.010209162009441875,
 'min_samples_split': 0.1180605637918912,
 'n_estimators': 6}

In [42]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
sqrt
1500


In [45]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(x_train,y_train)
predictionforest = trainedforest.predict(x_test)
print(metrics.confusion_matrix(y_test,predictionforest))
print(metrics.accuracy_score(y_test,predictionforest))
print(metrics.classification_report(y_test,predictionforest))
acc5 = metrics.accuracy_score(y_test,predictionforest)

[[90  9]
 [31 24]]
0.7402597402597403
              precision    recall  f1-score   support

           0       0.74      0.91      0.82        99
           1       0.73      0.44      0.55        55

    accuracy                           0.74       154
   macro avg       0.74      0.67      0.68       154
weighted avg       0.74      0.74      0.72       154

