# Don't use Grid search or randomized search instead use optuna or genetic 

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


In [22]:
import warnings
warnings.filterwarnings("ignore")

In [23]:
df = pd.read_csv("diabetes.csv")
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [25]:
df["Insulin"]=np.where(df["Insulin"]==0,df["Insulin"].median(), df["Insulin"])
df["Glucose"] = np.where(df["Glucose"]==0, df["Glucose"].median(),df["Glucose"])

In [26]:
X= df.drop(columns="Outcome")
y=df["Outcome"]

In [27]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [28]:
RForest = RandomForestClassifier()
RForest.fit(X_train,y_train)


RandomForestClassifier()

In [29]:
RFpred = RForest.predict(X_test)

In [30]:
print(classification_report(y_test,RFpred))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88       107
           1       0.73      0.68      0.70        47

    accuracy                           0.82       154
   macro avg       0.80      0.78      0.79       154
weighted avg       0.82      0.82      0.82       154



### RandomizersearchCV and GridsearchCV

In [31]:
n_estimators = list(range(200,2000,200)) #[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt','log2']
max_depth = list(range(10,1000,100)) #[int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 110, 210, 310, 410, 510, 610, 710, 810, 910], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [32]:
rf = RandomForestClassifier()
rfrandomcv = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, cv=3,
                                n_iter=10,n_jobs=-1,verbose=2, random_state=33)
rfrandomcv.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 110, 210, 310, 410,
                                                      510, 610, 710, 810, 910],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800]},
                   random_state=33, verbose=2)

In [33]:
rfrandomcv.best_estimator_

RandomForestClassifier(max_depth=310, min_samples_leaf=8, min_samples_split=10,
                       n_estimators=1400)

In [34]:
rfrandomcv.best_params_

{'n_estimators': 1400,
 'min_samples_split': 10,
 'min_samples_leaf': 8,
 'max_features': 'auto',
 'max_depth': 310,
 'criterion': 'gini'}

### Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rfrandomcv.best_params_['criterion']],
    'max_depth': [rfrandomcv.best_params_['max_depth']],
    'max_features': [rfrandomcv.best_params_['max_features']],
    'min_samples_leaf': [rfrandomcv.best_params_['min_samples_leaf'], 
                         rfrandomcv.best_params_['min_samples_leaf']+2, 
                         rfrandomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rfrandomcv.best_params_['min_samples_split'] - 2,
                          rfrandomcv.best_params_['min_samples_split'] - 1,
                          rfrandomcv.best_params_['min_samples_split'], 
                          rfrandomcv.best_params_['min_samples_split'] +1,
                          rfrandomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rfrandomcv.best_params_['n_estimators'] - 200, rfrandomcv.best_params_['n_estimators'] - 100, 
                     rfrandomcv.best_params_['n_estimators'], 
                     rfrandomcv.best_params_['n_estimators'] + 100, rfrandomcv.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['entropy'], 'max_depth': [10], 'max_features': ['auto'], 'min_samples_leaf': [2, 4, 6], 'min_samples_split': [8, 9, 10, 11, 12], 'n_estimators': [600, 700, 800, 900, 1000]}


In [16]:
#### Fit the grid_search to the data
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=5,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)


Fitting 5 folds for each of 75 candidates, totalling 375 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [10],
                         'max_features': ['auto'],
                         'min_samples_leaf': [2, 4, 6],
                         'min_samples_split': [8, 9, 10, 11, 12],
                         'n_estimators': [600, 700, 800, 900, 1000]},
             verbose=2)

In [17]:
grid_search.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=10, min_samples_leaf=6,
                       min_samples_split=9, n_estimators=900)

In [18]:
best_grid=grid_search.best_estimator_

In [20]:
y_pred = best_grid.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.88      0.82        99
           1       0.71      0.53      0.60        55

    accuracy                           0.75       154
   macro avg       0.74      0.70      0.71       154
weighted avg       0.75      0.75      0.74       154



### Optimize hyperparameters of the model using Optuna

The hyperparameters of the above algorithm are `n_estimators` and `max_depth` for which we can try different values to see if the model accuracy can be improved. The `objective` function is modified to accept a trial object. This trial has several methods for sampling hyperparameters. We create a study to run the hyperparameter optimization and finally read the best hyperparameters.

In [35]:
pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
     -------------------------------------- 380.1/380.1 kB 4.7 MB/s eta 0:00:00
Collecting colorlog
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.13.2-py3-none-any.whl (232 kB)
     -------------------------------------- 233.0/233.0 kB 7.2 MB/s eta 0:00:00
Collecting Mako
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.6/78.6 kB ? eta 0:00:00
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.2 colorlog-6.8.2 optuna-3.6.1
Note: you may need to restart the kernel to use updated packages.




In [37]:
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = SVC(C=c, gamma='auto')

    return cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()


In [38]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2024-07-30 22:52:24,537] A new study created in memory with name: no-name-d78e2d6b-1437-4eb2-b7ec-413acff5b100
[I 2024-07-30 22:52:28,510] Trial 0 finished with value: 0.7426510441575004 and parameters: {'classifier': 'RandomForest', 'n_estimators': 450, 'max_depth': 16.883285233816736}. Best is trial 0 with value: 0.7426510441575004.
[I 2024-07-30 22:52:29,629] Trial 1 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 76480.79522049085}. Best is trial 0 with value: 0.7426510441575004.
[I 2024-07-30 22:52:30,600] Trial 2 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 0.0003435850646260416}. Best is trial 0 with value: 0.7426510441575004.
[I 2024-07-30 22:52:30,658] Trial 3 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 198733292.4538164}. Best is trial 0 with value: 0.7426510441575004.
[I 2024-07-30 22:52:30,708] Trial 4 finished with value: 0.640068547744301 and paramet

[I 2024-07-30 22:54:43,334] Trial 37 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 1.3366280893522467e-08}. Best is trial 20 with value: 0.7540650406504065.
[I 2024-07-30 22:54:47,517] Trial 38 finished with value: 0.7524469950581859 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1090, 'max_depth': 14.89887173153547}. Best is trial 20 with value: 0.7540650406504065.
[I 2024-07-30 22:54:47,626] Trial 39 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 41.51696027596643}. Best is trial 20 with value: 0.7540650406504065.
[I 2024-07-30 22:54:50,289] Trial 40 finished with value: 0.7491790212019768 and parameters: {'classifier': 'RandomForest', 'n_estimators': 650, 'max_depth': 15.442067598999373}. Best is trial 20 with value: 0.7540650406504065.
[I 2024-07-30 22:54:54,228] Trial 41 finished with value: 0.7524390243902439 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1070, 'max_dept

[I 2024-07-30 22:57:32,369] Trial 74 finished with value: 0.7491710505340348 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1280, 'max_depth': 34.454196364528336}. Best is trial 57 with value: 0.7589430894308943.
[I 2024-07-30 22:57:37,253] Trial 75 finished with value: 0.7556830862426271 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1150, 'max_depth': 41.17261349614281}. Best is trial 57 with value: 0.7589430894308943.
[I 2024-07-30 22:57:43,237] Trial 76 finished with value: 0.7459030766778256 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1430, 'max_depth': 42.17100213037251}. Best is trial 57 with value: 0.7589430894308943.
[I 2024-07-30 22:57:47,811] Trial 77 finished with value: 0.7475530049418141 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1170, 'max_depth': 37.64200653389861}. Best is trial 57 with value: 0.7589430894308943.
[I 2024-07-30 22:57:52,988] Trial 78 finished with value: 0.7491790212019768 and parame

Accuracy: 0.7589430894308943
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 1340, 'max_depth': 32.044505677899814}


In [39]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 1340,
 'max_depth': 32.044505677899814}

In [40]:
rf=RandomForestClassifier(n_estimators=1340,max_depth=32.044505677899814)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=32.044505677899814, n_estimators=1340)

In [41]:
y_pred=rf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[94 13]
 [14 33]]
0.8246753246753247
              precision    recall  f1-score   support

           0       0.87      0.88      0.87       107
           1       0.72      0.70      0.71        47

    accuracy                           0.82       154
   macro avg       0.79      0.79      0.79       154
weighted avg       0.82      0.82      0.82       154

