In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import warnings

In [2]:
# Suppress future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Set seed for reproducibility
seed = 2017
np.random.seed(seed)

In [5]:
# Read the data in
df = pd.read_csv("diabetes.csv")
X = df.iloc[:, :8].values  # independent variables
y = df['Outcome'].values  # dependent variables

In [6]:
# Normalize
X = StandardScaler().fit_transform(X)

In [7]:
# Evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

In [8]:
# Initialize RandomForestClassifier
clf_rf = RandomForestClassifier(random_state=seed)

In [9]:
# Define the parameter grid
rf_params = {
    'n_estimators': [100, 250, 500, 750, 1000],
    'criterion': ['gini', 'entropy'],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'max_depth': [1, 3, 5, 7, 9]
}

In [10]:
# Perform grid search with cross-validation
grid = GridSearchCV(clf_rf, rf_params, scoring='roc_auc', cv=kfold, verbose=10, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV 3/5; 1/200] START criterion=gini, max_depth=1, max_features=None, n_estimators=100
[CV 2/5; 1/200] START criterion=gini, max_depth=1, max_features=None, n_estimators=100
[CV 1/5; 1/200] START criterion=gini, max_depth=1, max_features=None, n_estimators=100
[CV 4/5; 1/200] START criterion=gini, max_depth=1, max_features=None, n_estimators=100
[CV 3/5; 1/200] END criterion=gini, max_depth=1, max_features=None, n_estimators=100;, score=0.769 total time=   0.5s
[CV 2/5; 1/200] END criterion=gini, max_depth=1, max_features=None, n_estimators=100;, score=0.800 total time=   0.4s
[CV 1/5; 1/200] END criterion=gini, max_depth=1, max_features=None, n_estimators=100;, score=0.825 total time=   0.4s
[CV 4/5; 1/200] END criterion=gini, max_depth=1, max_features=None, n_estimators=100;, score=0.751 total time=   0.4s
[CV 5/5; 1/200] START criterion=gini, max_depth=1, max_features=None, n_estimators=100
[CV 1/5; 2/200] START criteri

250 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
175 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidP

In [11]:
# Output the best parameters
print('Best Parameters: ', grid.best_params_)

Best Parameters:  {'criterion': 'entropy', 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 500}


In [12]:
# Evaluate the best estimator from the grid search
results = cross_val_score(grid.best_estimator_, X_train, y_train, cv=kfold)
print("Accuracy - Train CV: ", results.mean())
print("Accuracy - Train: ", metrics.accuracy_score(grid.best_estimator_.predict(X_train), y_train))
print("Accuracy - Test: ", metrics.accuracy_score(grid.best_estimator_.predict(X_test), y_test))

Accuracy - Train CV:  0.7522499134648667
Accuracy - Train:  0.8621973929236499
Accuracy - Test:  0.7965367965367965
