In [5]:

import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import*
import optuna
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
# Fetch the UCI sonar data set; the file has no header row.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
dataset = read_csv(url, header=None)
data = dataset.values

# Every column except the last is a feature; cast them to float64.
x_data = pd.DataFrame(data[:, :-1].astype(np.float64))

# The last column is the target: label-encode it, then store the codes as
# float64 in a single-column DataFrame.
le = preprocessing.LabelEncoder()
y_data = pd.DataFrame(le.fit_transform(data[:, -1]).astype(np.float64))

# Hold out 30% of the rows so the k-fold results from Optuna can be compared
# against a separate validation set.
features, test_features, target, test_target = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=17,
)

# Hold-out predictions collected by the objective, keyed by trial number.
predictions_per_trial = {}
def objective(trial):
    """Optuna objective: mean stratified 5-fold log loss of an XGBoost classifier.

    Samples one hyperparameter set from ``trial``, fits a fresh
    ``XGBClassifier`` on each training fold of the module-level
    ``features``/``target`` data and scores it on the matching validation fold.

    Side effect: for every fold, the class-1 probabilities predicted for the
    module-level ``test_features`` are appended to
    ``predictions_per_trial[trial.number]`` so the hold-out set can later be
    compared with the cross-validated score.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean log loss over the five validation folds (lower is better).
    """
    # Define hyperparameter search space
    params_optuna = {
        # NOTE(review): "gpu_hist" needs a CUDA-enabled XGBoost build; newer
        # XGBoost releases deprecate it in favour of tree_method="hist" with
        # device="cuda" -- confirm against the installed version.
        "tree_method": "gpu_hist",
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 19),
        "max_leaves": trial.suggest_int("max_leaves", 15, 149),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "learning_rate": trial.suggest_float("learning_rate", 0, 1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 20),
        "subsample": trial.suggest_float("subsample", 0.5, 0.9),
        # XGBoost's logging switch is "verbosity" (0 = silent .. 3 = debug);
        # "verbose" is not an XGBoost parameter. Remove this entry to see
        # XGBoost's default log output.
        "verbosity": 0,
    }

    # Fixed seed so every trial is evaluated on the same fold assignment.
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    for train_index, test_index in skf.split(features, target):
        X_train, y_train = features.iloc[train_index], target.iloc[train_index]
        X_val, y_val = features.iloc[test_index], target.iloc[test_index]

        classifier = xgb.XGBClassifier(**params_optuna)
        classifier.fit(X_train, y_train)

        # Column 1 of predict_proba holds the probability of class 1.
        y_pred = classifier.predict_proba(X_val)[:, 1]
        y_pred_test = classifier.predict_proba(test_features)[:, 1]

        score = log_loss(y_val, y_pred.astype(np.float64))
        scores.append(score)

        # Record this fold's hold-out predictions under the current trial.
        predictions_per_trial.setdefault(trial.number, []).append(y_pred_test)

    return np.mean(scores)
# Only show Optuna messages at ERROR level. This is set before the study is
# created so that the study-creation INFO message is suppressed as well.
optuna.logging.set_verbosity(optuna.logging.ERROR)
study = optuna.create_study(study_name="XGB_SKFold", direction="minimize")
study.optimize(objective, n_trials=10)

# Keep the per-trial results table and report the best trial.
study_df_xgb = study.trials_dataframe()
print(study.best_value)
print(study.best_params)
xgb_params = study.best_params

0.44018990798739377
{'n_estimators': 446, 'max_depth': 7, 'max_leaves': 83, 'grow_policy': 'lossguide', 'learning_rate': 0.22803681838088785, 'booster': 'gbtree', 'reg_alpha': 0.8743861992997275, 'reg_lambda': 0.003278029660373938, 'gamma': 2.6086509199523267, 'min_child_weight': 4.119215481130636, 'subsample': 0.8614638337011203}
