In [2]:
#Import relevant packages
import pandas as pd
import numpy as np
from bayes_opt import BayesianOptimization
import optuna
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
#Read the balanced diabetes health-indicator dataset from the CSV file
file_path = 'diabetes_binary_5050split_health_indicators_BRFSS2015.csv'
data = pd.read_csv(file_path)

#Name the label column and the predictor columns used for modelling
target_variable = 'Diabetes_binary'
selected_features = [
    'HighBP',
    'HighChol',
    'Stroke',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'DiffWalk',
    'BMI',
    'GenHlth',
    'Age',
    'Income',
]
y = data[target_variable]
X = data[selected_features]

#Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

#Best cross-validated result recorded so far, keyed by model name
best_results = dict()

#Optuna objective function
def objective(trial):
    """Score one Optuna trial with 3-fold cross-validation.

    The trial first picks a model family, then samples that family's
    hyperparameters. The resulting estimator is cross-validated on the
    module-level ``X_train`` / ``y_train`` and scored on accuracy and F1.

    Side effects:
        Updates the module-level ``best_results`` dict: for each model name
        it keeps the accuracy, F1 and ``trial.params`` of the trial with the
        highest mean accuracy seen so far.

    Args:
        trial: The Optuna trial object used to sample hyperparameters.

    Returns:
        The mean cross-validated accuracy, which the study maximizes.

    Raises:
        ValueError: If the sampled model name has no matching branch.
    """
    #Imported here so the function does not rely on the import cell being edited
    from sklearn.model_selection import cross_validate

    model_name = trial.suggest_categorical(
        "model", ["RandomForest", "DecisionTree", "XGBoost", "LogisticRegression"]
    )

    #Keyword arguments are evaluated left to right, so hyperparameters are
    #suggested in the order written. Names shared between branches
    #(e.g. "max_depth") use the same range in every branch.
    if model_name == "RandomForest":
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 300),
            max_depth=trial.suggest_int("max_depth", 2, 20),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
            bootstrap=trial.suggest_categorical("bootstrap", [True, False]),
            random_state=100,
        )

    elif model_name == "DecisionTree":
        model = DecisionTreeClassifier(
            max_depth=trial.suggest_int("max_depth", 2, 20),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
            criterion=trial.suggest_categorical("criterion", ["gini", "entropy"]),
            random_state=100,
        )

    elif model_name == "XGBoost":
        #NOTE(review): use_label_encoder is believed to be deprecated/ignored by
        #newer XGBoost releases - confirm against the installed version
        model = XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 300),
            max_depth=trial.suggest_int("max_depth", 2, 20),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            random_state=100,
            use_label_encoder=False,
            eval_metric="logloss",
        )

    elif model_name == "LogisticRegression":
        model = LogisticRegression(
            C=trial.suggest_float("C", 0.01, 10.0, log=True),
            penalty=trial.suggest_categorical("penalty", ["l1", "l2"]),
            solver=trial.suggest_categorical("solver", ["liblinear", "saga"]),
            random_state=100,
            max_iter=1000,
        )

    else:
        #Guards against the choice list and the branches drifting apart
        raise ValueError(f"Unsupported model name: {model_name!r}")

    #One 3-fold CV pass yields both metrics, so each fold is fitted only once
    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        cv=3,
        scoring={"accuracy": "accuracy", "f1": make_scorer(f1_score)},
    )
    mean_accuracy = cv_results["test_accuracy"].mean()
    mean_f1 = cv_results["test_f1"].mean()

    #Keep this trial if it is the most accurate one seen for this model family
    if model_name not in best_results or mean_accuracy > best_results[model_name]["accuracy"]:
        best_results[model_name] = {
            "accuracy": mean_accuracy,
            "f1": mean_f1,
            "hyperparameters": trial.params,
        }

    #The study maximizes this value when comparing trials
    return mean_accuracy

#Create a maximizing study and evaluate 100 trials of the objective
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

#Report the best accuracy and hyperparameters recorded for each model
print("Optimal Results for Each Model:")
for model_name, results in best_results.items():
    print(
        f"Model: {model_name}",
        f"Best Accuracy: {results['accuracy']}",
        f"Best Hyperparams: {results['hyperparameters']}",
        sep="\n",
    )