In [None]:
import neptune

# Point the Neptune client at the project, then open the experiment that
# this notebook's run is recorded under.
neptune.init(project_qualified_name='arsde/J21-BDT')
neptune.create_experiment(name='skopt-xgb')

In [None]:
# %pip install scikit-learn==0.23.2

In [17]:
from skopt import BayesSearchCV
import pandas as pd
from skopt.space import Real, Integer

In [None]:
# Root directory of the dataset (an absolute path on the machine this
# notebook was written for).
path = '/mnt/cephfs/ml_data/mc_2021/'

# Read the gzipped training CSV and keep only the rows whose `edepR` value is
# strictly below 17.2.
data_real = pd.read_csv(
    '{}processed_data/ProcessedTrainReal/ProcessedTrain_1M.csv.gz'.format(path)
).loc[lambda frame: frame['edepR'] < 17.2]

In [None]:
# Positional split of `data_real`: the first `size` rows are used for the
# search, everything after them for validation. The last five columns are
# excluded from the features; the first of those five is used as the target.
size = int(1e6)
n_feats = data_real.shape[1] - 5

# Training part: rows [0, size).
X = data_real.iloc[:size, :-5]
y = data_real.iloc[:size, -5]

# Validation part: rows [size, end).
# NOTE(review): if `data_real` has at most `size` rows after the `edepR`
# filter, these two slices are empty — confirm the row count.
X_val = data_real.iloc[size:, :-5]
y_val = data_real.iloc[size:, -5]

In [None]:
# Cell-local imports: the Neptune helpers for scikit-optimize and the XGBoost
# regressor that is tuned below.
import neptunecontrib.monitoring.skopt as skopt_utils
from xgboost import XGBRegressor
# Callback object created with default arguments; it is handed to the search
# when `tuner.fit` is called further down in this cell.
neptune_callback = skopt_utils.NeptuneCallback()

def print_status(optimal_result):
    """Print search progress and checkpoint the cross-validation results.

    Written to be used as a callback of the hyper-parameter search. It reads
    the module-level ``tuner`` (not its argument), prints the number of rows
    in ``tuner.cv_results_`` together with the best score and best parameters
    found so far, and writes the whole ``cv_results_`` table to
    ``xgboost_optimize/<EstimatorClassName>_cv_results_summary.csv``.

    Parameters
    ----------
    optimal_result
        Value supplied by whoever invokes the callback; it is not used here.

    Returns
    -------
    None
    """
    # Imported locally so the callback does not depend on another cell having
    # imported these names (``np`` is not imported anywhere in the notebook).
    import os
    import numpy as np

    models_tested = pd.DataFrame(tuner.cv_results_)

    # The label says "accuracy", but the number shown is ``tuner.best_score_``,
    # i.e. whatever metric the search was configured to score with.
    print(
        "Model #{}\nBest accuracy so far: {}\nBest parameters so far: {}\n".format(
            len(models_tested),
            np.round(tuner.best_score_, 3),
            tuner.best_params_,
        )
    )

    # Make sure the output directory exists; ``to_csv`` does not create
    # missing parent directories.
    os.makedirs("xgboost_optimize", exist_ok=True)

    clf_type = tuner.estimator.__class__.__name__
    models_tested.to_csv("xgboost_optimize/" + clf_type + "_cv_results_summary.csv")

# Base regressor handed to the search: 3000 boosting rounds, fixed seed.
model = XGBRegressor(random_state=22, n_estimators=3000)
    
# Search space for the hyper-parameter search: one skopt dimension per
# XGBRegressor keyword argument.
#
# The original literal listed "min_child_weight" twice (Integer(0, 10) and
# Integer(0, 5)). In a dict literal the later value silently replaces the
# earlier one, so the range actually searched was 0..5; only that entry is
# kept here so the literal says what is really used.
params_space = {
    "learning_rate": Real(0.01, 1.0, "log-uniform"),
    "min_child_weight": Integer(0, 5),
    "max_depth": Integer(6, 14),
    "max_delta_step": Integer(0, 10),
    "subsample": Real(0.01, 1.0, "uniform"),
    "colsample_bytree": Real(0.01, 1.0, "log-uniform"),
    "colsample_bylevel": Real(0.01, 1.0, "log-uniform"),
    "reg_lambda": Real(1e-9, 1000, "log-uniform"),
    "reg_alpha": Real(1e-9, 1.0, "log-uniform"),
    "gamma": Real(1e-9, 0.5, "log-uniform"),
}
    
# Bayesian search over `params_space` for `model`: 50 iterations, 5-fold
# cross-validation, fixed seed, all available cores.
# NOTE(review): the "neg_mean_absolute_percentage_error" scorer name looks
# like it needs scikit-learn >= 0.24 — confirm against the installed version.
tuner = BayesSearchCV(
    model,
    params_space,
    n_iter=50,
    cv=5,
    random_state=22,
    n_jobs=-1,
    verbose=10,
    scoring="neg_mean_absolute_percentage_error",
    # Extra keyword arguments for the estimator's `fit`: evaluate on the
    # held-out validation slice and use early stopping with patience 3.
    # (The original was missing the comma between the two entries, which made
    # the whole cell a SyntaxError.)
    fit_params={
        'eval_set': [(X_val, y_val)],
        'early_stopping_rounds': 3,
    },
)

# Run the search. `BayesSearchCV.fit` takes its per-iteration callables
# through the `callback` keyword (a callable or a list of callables); the
# original spelling `callbacks` is not that parameter, so the Neptune logger
# and `print_status` were never registered with the optimiser.
results = tuner.fit(
    X, y,
    callback=[
        neptune_callback,
        print_status,
    ],
)

In [None]:
# Log the first stored optimisation result of the fitted search to Neptune.
# NOTE(review): `_optim_results` is a private attribute of the search object;
# newer scikit-optimize releases appear to expose the same data publicly as
# `optimizer_results_` — confirm for the installed version.
skopt_utils.log_results(results._optim_results[0])

In [None]:
# %pip install scikit-learn==0.24