In [5]:
# Install bayesian-optimization package if not already installed
# !pip install bayesian-optimization

from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

In [6]:
def rfc_cv(n_estimators, min_samples_split, max_features, data, targets, cv=4):
    """Score one Random Forest configuration by cross-validation.

    A random forest classifier is built from ``n_estimators``,
    ``min_samples_split`` and ``max_features`` and evaluated on
    ``data``/``targets`` with ``cross_val_score`` using accuracy. The mean
    fold score is returned; the goal of the surrounding search is to find the
    combination of the three hyper-parameters that maximizes it.

    Parameters
    ----------
    n_estimators : float or int
        Truncated to an integer and raised to at least 1.
    min_samples_split : float or int
        Truncated to an integer and raised to at least 2.
    max_features : float
        Clamped into ``[1e-3, 0.999]`` before being handed to the classifier.
    data, targets
        Forwarded unchanged to ``cross_val_score``.
    cv : optional, default 4
        Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    The mean of the scores returned by ``cross_val_score``.
    """
    # The optimizer proposes continuous values. Truncate to integers, then
    # clamp to the smallest values scikit-learn accepts for integer inputs so
    # that a proposal such as min_samples_split=1.9 does not abort the search.
    n_trees = max(int(n_estimators), 1)
    split_size = max(int(min_samples_split), 2)
    feature_fraction = max(min(max_features, 0.999), 1e-3)

    estimator = RFC(
        n_estimators=n_trees,
        min_samples_split=split_size,
        max_features=feature_fraction,
        random_state=2,  # fixed so that scores are comparable between calls
    )
    cval = cross_val_score(estimator, data, targets, scoring='accuracy', cv=cv)
    return cval.mean()

In [7]:
def optimize_rfc(data, targets, n_iter=10, init_points=5, pbounds=None,
                 random_state=1234, verbose=2):
    """Apply Bayesian Optimization to Random Forest parameters.

    Searches over ``n_estimators``, ``min_samples_split`` and ``max_features``
    for the combination that maximizes the value returned by ``rfc_cv`` on
    ``data``/``targets``, prints the best result and returns the optimizer.

    Parameters
    ----------
    data, targets
        Passed unchanged to ``rfc_cv`` on every evaluation.
    n_iter : int, default 10
        Forwarded to ``optimizer.maximize``.
    init_points : int, default 5
        Forwarded to ``optimizer.maximize``. The default is intended to match
        the value the library used when this argument was left implicit.
    pbounds : dict or None, default None
        Search bounds keyed by ``"n_estimators"``, ``"min_samples_split"`` and
        ``"max_features"``. ``None`` selects the built-in bounds below.
    random_state : default 1234
        Forwarded to ``BayesianOptimization``.
    verbose : default 2
        Forwarded to ``BayesianOptimization``.

    Returns
    -------
    The ``BayesianOptimization`` instance after ``maximize`` has run; the best
    result found is available as its ``max`` attribute.
    """
    if pbounds is None:
        # Built fresh on every call so callers never share a mutable default.
        pbounds = {
            "n_estimators": (10, 250),
            "min_samples_split": (2, 25),
            "max_features": (0.1, 0.999),
        }

    def rfc_crossval(n_estimators, min_samples_split, max_features):
        """Objective for the optimizer: bind ``data``/``targets`` to ``rfc_cv``.

        The three hyper-parameters are forwarded untouched; any casting or
        clamping is left to ``rfc_cv``.
        """
        return rfc_cv(
            n_estimators=n_estimators,
            min_samples_split=min_samples_split,
            max_features=max_features,
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds=pbounds,
        random_state=random_state,
        verbose=verbose,
    )

    optimizer.maximize(init_points=init_points, n_iter=n_iter)
    print("Final result:", optimizer.max)
    return optimizer

In [8]:
# Pin NumPy's global random state so the notebook repeats from run to run;
# the same value is reused below as an explicit random_state.
seed = 2017
np.random.seed(seed=seed)

In [9]:
# Load the dataset from disk and separate predictors from the label
df = pd.read_csv("diabetes.csv")
X = df.iloc[:, :8].to_numpy()  # first eight columns: independent variables
y = df["Outcome"].to_numpy()  # 'Outcome' column: dependent variable

In [10]:
# Normalize
X = StandardScaler().fit_transform(X)

In [11]:
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

In [12]:
# Perform Bayesian Optimization and keep the optimizer, so later cells can read
# the best parameters from `rfc_optimizer.max` without re-running the search
# (binding the result also keeps the object's repr out of the cell output).
print("--- Optimizing Random Forest ---")
rfc_optimizer = optimize_rfc(X_train, y_train)

--- Optimizing Random Forest ---
|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| 1         | 0.7486    | 0.2722    | 16.31     | 115.1     |
| 2         | 0.7467    | 0.806     | 19.94     | 75.42     |
| 3         | 0.7448    | 0.3485    | 20.44     | 240.0     |
| 4         | 0.7579    | 0.8875    | 10.23     | 130.2     |
| 5         | 0.7486    | 0.7144    | 18.39     | 98.86     |
| 6         | 0.7504    | 0.4713    | 23.55     | 96.06     |
| 7         | 0.7467    | 0.638     | 6.223     | 160.9     |
| 8         | 0.7486    | 0.5152  

<bayes_opt.bayesian_optimization.BayesianOptimization at 0x12c5f9970>