In [42]:
import pandas as pd
import numpy as np
import seaborn as sbn
import matplotlib.pyplot as plt

In [43]:
df = pd.read_csv(r'D:\Datasets\All-Hyperparamter-Optimization-master\diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [44]:
df.shape

(768, 9)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [46]:
df['Glucose'] = np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin'] = np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [47]:
x = df.drop('Outcome',axis=1)
y = df['Outcome']

In [48]:
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76,48.0,180.0,32.9,0.171,63
764,2,122.0,70,27.0,30.5,36.8,0.340,27
765,5,121.0,72,23.0,112.0,26.2,0.245,30
766,1,126.0,60,23.0,30.5,30.1,0.349,47


In [49]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=33)

In [50]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10)
rfc.fit(x_train,y_train)
y_pred = rfc.predict(x_test)

In [51]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [52]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))
print(metrics.classification_report(y_test,y_pred))

[[82 17]
 [29 26]]
0.7012987012987013
              precision    recall  f1-score   support

           0       0.74      0.83      0.78        99
           1       0.60      0.47      0.53        55

    accuracy                           0.70       154
   macro avg       0.67      0.65      0.66       154
weighted avg       0.69      0.70      0.69       154



The main parameters used by a Random Forest Classifier are:


criterion = the function used to evaluate the quality of a split.

max_depth = maximum number of levels allowed in each tree.

max_features = maximum number of features considered when splitting a node.

min_samples_leaf = minimum number of samples which can be stored in a tree leaf.

min_samples_split = minimum number of samples necessary in a node to cause node splitting.

n_estimators = number of trees in the ensemble.

#### Manual hyperparameter-tuning

In [53]:
model = RandomForestClassifier(n_estimators = 300 ,criterion='entropy',min_samples_leaf=10,max_features='sqrt',random_state=100)
model.fit(x_train,y_train)
prediction = model.predict(x_test)

In [54]:
print(metrics.confusion_matrix(y_test,prediction))
print(metrics.accuracy_score(y_test,prediction))
print(metrics.confusion_matrix(y_test,prediction))

[[87 12]
 [28 27]]
0.7402597402597403
[[87 12]
 [28 27]]


#### RandomizedSearchCV

In [55]:
n_estimators = [int(x) for x in np.linspace(start=100,stop=3000,num=30)]
print(n_estimators)

[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]


In [56]:
criterion = ['gini','entropy']
print(criterion)

['gini', 'entropy']


In [57]:
max_features = ['auto','sqrt','log2']
print(max_features)

['auto', 'sqrt', 'log2']


In [58]:
# max_depth = maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10,2000,20)]
print(max_depth)

[10, 114, 219, 324, 428, 533, 638, 743, 847, 952, 1057, 1162, 1266, 1371, 1476, 1581, 1685, 1790, 1895, 2000]


In [59]:
min_samples_split = [int(x) for x in range(-10,16)]
print(min_samples_split)

[-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [60]:
min_samples_leaf = [int(x) for x in range(-16,16)]
print(min_samples_leaf)

[-16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [61]:
random_grid = {'n_estimators' : n_estimators , 
              'max_features' : max_features ,
              'max_depth' : max_depth , 
              'min_samples_split' : min_samples_split , 
              'min_samples_leaf' : min_samples_leaf , 
              'criterion' : criterion}
print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 114, 219, 324, 428, 533, 638, 743, 847, 952, 1057, 1162, 1266, 1371, 1476, 1581, 1685, 1790, 1895, 2000], 'min_samples_split': [-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'min_samples_leaf': [-16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'criterion': ['gini', 'entropy']}


In [62]:
rf = RandomForestClassifier()
from sklearn.model_selection import RandomizedSearchCV
rf_randomcv = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=150,cv=4,verbose=True,random_state=100,n_jobs=-1)
rf_randomcv.fit(x_train,y_train)

Fitting 4 folds for each of 150 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   49.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.7min finished


RandomizedSearchCV(cv=4, estimator=RandomForestClassifier(), n_iter=150,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 114, 219, 324, 428,
                                                      533, 638, 743, 847, 952,
                                                      1057, 1162, 1266, 1371,
                                                      1476, 1581, 1685, 1790,
                                                      1895, 2000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [-16, -15, -14, -13,
                                                             -12, -11, -10, -9,
                                                             -8, -7, -6, -5, -4,
                                                             -

In [63]:
rf_randomcv.best_params_

{'n_estimators': 2500,
 'min_samples_split': 14,
 'min_samples_leaf': 13,
 'max_features': 'log2',
 'max_depth': 1371,
 'criterion': 'gini'}

In [64]:
rf_randomcv.best_estimator_

RandomForestClassifier(max_depth=1371, max_features='log2', min_samples_leaf=13,
                       min_samples_split=14, n_estimators=2500)

In [65]:
best_random_grid = rf_randomcv.best_estimator_
best_random_grid

RandomForestClassifier(max_depth=1371, max_features='log2', min_samples_leaf=13,
                       min_samples_split=14, n_estimators=2500)

In [66]:
y_pred = best_random_grid.predict(x_test)

In [67]:
print(metrics.confusion_matrix(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))
print(metrics.classification_report(y_test,y_pred))

[[88 11]
 [27 28]]
0.7532467532467533
              precision    recall  f1-score   support

           0       0.77      0.89      0.82        99
           1       0.72      0.51      0.60        55

    accuracy                           0.75       154
   macro avg       0.74      0.70      0.71       154
weighted avg       0.75      0.75      0.74       154

