In [1]:

import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import*
import optuna
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
dataset = read_csv(url, header=None)
data = dataset.values
x_data, y_data = data[:, :-1],data[:, -1]
x_data = pd.DataFrame(x_data.astype('float64'))
#Label Encoding Target
le = preprocessing.LabelEncoder()
le.fit(y_data)
y_data = le.transform(y_data)
y_data = pd.DataFrame(y_data.astype('float64'))
##compare a validation set with the kfold output from optuna 
features, test_features, target, test_target = train_test_split(x_data, y_data, test_size=0.3, random_state=17)
#compare set
predictions_per_trial = dict()
def objective(trial):
    #Define Hyperparameter Search Space
    params_optuna = {
        "loss" :  trial.suggest_categorical("loss", ["log_loss", "exponential"]),
        "learning_rate" : trial.suggest_float("learning_rate", 0, 1),
        "n_estimators" : trial.suggest_int("n_estimators", 50, 1000),
        "subsample" : trial.suggest_float("subsample", 0, 1),
        "criterion" :  trial.suggest_categorical("criterion", ["friedman_mse", "squared_error"]),
        "min_samples_split" : trial.suggest_float("min_samples_split", 0, 1),
        "min_samples_leaf" : trial.suggest_float("min_samples_leaf", 0, 1),
        "max_depth" : trial.suggest_int("max_depth", 3, 50),
        "min_impurity_decrease" : trial.suggest_float("min_impurity_decrease", 0, 1)
    }
    
    #KFold
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    for train_index,test_index in skf.split(features,target):
        X_train, y_train = features.iloc[train_index],target.iloc[train_index]
        X_val, y_val = features.iloc[test_index],target.iloc[test_index]
    
        classifier = GradientBoostingClassifier(**params_optuna)
        classifier.fit(X_train, y_train.values.ravel())
    
        y_pred = classifier.predict_proba(X_val)[:,1]
        X_train_pred = classifier.predict_proba(X_train)[:,1]
        y_pred_test = classifier.predict_proba(test_features)[:,1]
        
        score = log_loss(y_val, y_pred.astype(np.float64))
        scores.append(score)
        #append to compare set
        try:
            predictions_per_trial[trial.number].append(y_pred_test)
        except KeyError:
            predictions_per_trial[trial.number] = [y_pred_test]
    
    return np.mean(scores)
study = optuna.create_study(study_name="GBC_SKFold", direction="minimize")
study.optimize(objective, n_trials=10)
study_df = study.trials_dataframe()
print(study.best_value)
print(study.best_params)


[I 2024-04-17 16:58:54,268] A new study created in memory with name: GBC_SKFold
[I 2024-04-17 16:58:55,931] Trial 0 finished with value: 0.7020325367423114 and parameters: {'loss': 'log_loss', 'learning_rate': 0.7058394955255263, 'n_estimators': 861, 'subsample': 0.3908800056440055, 'criterion': 'friedman_mse', 'min_samples_split': 0.2420829177681204, 'min_samples_leaf': 0.5584415181899345, 'max_depth': 35, 'min_impurity_decrease': 0.9344504575307848}. Best is trial 0 with value: 0.7020325367423114.
[I 2024-04-17 16:58:57,653] Trial 1 finished with value: 0.6953992449689483 and parameters: {'loss': 'exponential', 'learning_rate': 0.7036115370728723, 'n_estimators': 892, 'subsample': 0.4156282770518035, 'criterion': 'friedman_mse', 'min_samples_split': 0.6641083012779623, 'min_samples_leaf': 0.863649525682642, 'max_depth': 33, 'min_impurity_decrease': 0.907374954254645}. Best is trial 1 with value: 0.6953992449689483.
[I 2024-04-17 16:58:58,432] Trial 2 finished with value: 0.7039362600

0.4435909590351555
{'loss': 'exponential', 'learning_rate': 0.017993897984345253, 'n_estimators': 623, 'subsample': 0.19822337824608738, 'criterion': 'friedman_mse', 'min_samples_split': 0.06579724879252746, 'min_samples_leaf': 0.037237155972717484, 'max_depth': 10, 'min_impurity_decrease': 0.169449388843099}
