In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle
from xgboost import XGBRegressor

import plotly.graph_objs as go
from plotly.subplots import make_subplots

import plotly.io as pio
pio.templates.default = 'plotly_white'

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Columns fed to the model. The regression target 'edep' is appended last,
# so later cells can treat the final column as the label.
opt_features = [
    'AccumCharge', 'rho_cc', 'pe_90p', 'R_cht', 'ht_55p', 'pe_mean',
    'ht_5p', 'pe_80p', 'pe_std', 'pe_70p', 'nPMTs',
]
opt_features.append('edep')

# Root directory of the dataset; the processed training table is a gzipped CSV.
path = "/mnt/cephfs/ml_data/TAO_detsim_J22/"
train_csv = f'{path}processed_data/ProcessedTrain/ProcessedTrain.csv.gz'
data = pd.read_csv(train_csv)

In [3]:
# Upper bound applied to the 'edepR' column, and the maximum number of rows kept.
FC_cut = 0.65
N = 1_150_000

data = data.reset_index(drop=True)
# Keep rows passing the cut, restrict to the model columns, then take the
# first N rows in their existing order.
passes_cut = data['edepR'] < FC_cut
data = data.loc[passes_cut, opt_features].iloc[:N]
data.shape

(1150000, 12)

In [4]:
# Every column except the last one is an input feature.
n_feats = data.shape[1] - 1

In [5]:
# Display the feature count as a quick sanity check.
n_feats

11

In [6]:
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true : array-like
        Reference values. Each element is used as a divisor, so a zero entry
        yields inf/nan in the result.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100

In [7]:
# Hold out exactly 10% of the rows as a validation set.
# The indices are drawn WITHOUT replacement from a seeded generator: sampling
# with replacement produces duplicate indices (so the hold-out ends up smaller
# than requested), and an unseeded draw makes the split differ on every run.
rng = np.random.default_rng(22)
n_val = int(0.1*data.shape[0])
rand_indexes = rng.choice(data.shape[0], size=n_val, replace=False)

# Boolean row mask: True marks a validation row.
val_indexes = np.zeros(data.shape[0], bool)
val_indexes[rand_indexes] = True

data_val = data[val_indexes]
data_train = data[~val_indexes]

In [8]:
from sklearn.model_selection import KFold

# Split the training rows into shuffled cross-validation folds.
# `shuffle` is passed by keyword: recent scikit-learn releases accept only
# `n_splits` positionally and raise TypeError for `KFold(n_folds, True, ...)`.
n_folds = 4
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=22)

# Convert to a NumPy array once instead of twice per fold.
data_train_array = np.array(data_train)

# trains[i] / tests[i] hold the rows of the i-th fold's train / test part.
trains = []
tests = []
for train_idx, test_idx in kfold.split(data_train_array):
    trains.append(data_train_array[train_idx])
    tests.append(data_train_array[test_idx])

In [9]:
# Per-depth result containers, keyed by max_depth in the grid-search cell.
scores_dict = dict()
val_scores_dict = dict()
n_estimators_dict = dict()
# Not written to by any of the visible cells.
test_scores_dict = dict()

In [10]:
# Tree depths to scan: 5 through 14 inclusive (the upper bound 15 is excluded).
max_depths = range(5, 15)

In [11]:
# Convert the validation split to a NumPy array so it can be sliced by column
# position (first n_feats columns as inputs, column n_feats as the target).
data_val = np.array(data_val)

In [12]:
# Grid search over max_depth with K-fold cross-validation.
#
# For each depth and each fold a model is trained with early stopping on the
# held-out validation array, then scored on the fold's test part. Results are
# stored twice: in flat lists (depth-major, fold-minor order, reshaped by the
# aggregation cells) and in the per-depth dictionaries.
scores = []        # test RMSE per (depth, fold)
mape_scores = []   # test MAPE per (depth, fold)
val_scores = []    # validation RMSE per (depth, fold)
n_estimators = []  # best_ntree_limit per (depth, fold)

# The validation features/target are the same for every fit.
X_val, y_val = data_val[:, :n_feats], data_val[:, n_feats]

for max_depth in tqdm(max_depths, "Max depths: "):

    scores_dict[max_depth] = []
    val_scores_dict[max_depth] = []
    n_estimators_dict[max_depth] = []

    for i in tqdm(range(len(trains)), "Folds... ", leave=False):
        X_train, y_train = trains[i][:, :n_feats], trains[i][:, n_feats]
        X_test, y_test = tests[i][:, :n_feats], tests[i][:, n_feats]

        # NOTE: 'gpu_hist' needs a visible CUDA device; the recorded run of
        # this cell failed with "No visible GPU is found for XGBoost".
        model = XGBRegressor(
                max_depth=max_depth,
                learning_rate=0.08,
                n_estimators=3000,
                tree_method='gpu_hist',
                random_state=22
        )

        model.fit(X_train, y_train, verbose=False,
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=5)

        # Predict once and reuse the result for every metric.
        y_pred = model.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred)**0.5
        # Both arrays go to the metric; the original call closed the
        # parenthesis too early and raised TypeError.
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # NOTE(review): [-1] is the last evaluated round, which with early
        # stopping may differ from the best round - confirm this is intended.
        val_rmse = model.evals_result_['validation_0']['rmse'][-1]
        best_ntree = model.best_ntree_limit

        scores.append(rmse)
        mape_scores.append(mape)
        val_scores.append(val_rmse)
        n_estimators.append(best_ntree)

        n_estimators_dict[max_depth].append(best_ntree)
        val_scores_dict[max_depth].append(val_rmse)
        scores_dict[max_depth].append(rmse)

Max depths:   0%|          | 0/10 [00:00<?, ?it/s]

Folds... :   0%|          | 0/4 [00:00<?, ?it/s]



XGBoostError: [17:04:56] ../src/gbm/gbtree.cc:611: Check failed: common::AllVisibleGPUs() >= 1 (0 vs. 1) : No visible GPU is found for XGBoost.
Stack trace:
  [bt] (0) /home/arsde/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x29da59) [0x7fd7c39a2a59]
  [bt] (1) /home/arsde/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x29ed02) [0x7fd7c39a3d02]
  [bt] (2) /home/arsde/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x29f0ba) [0x7fd7c39a40ba]
  [bt] (3) /home/arsde/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x2ddccd) [0x7fd7c39e2ccd]
  [bt] (4) /home/arsde/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x70) [0x7fd7c3833000]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.7(+0x6ff5) [0x7fd8dcde4ff5]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.7(+0x640a) [0x7fd8dcde440a]
  [bt] (7) /usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(_ctypes_callproc+0x5b6) [0x7fd8dbe8b316]
  [bt] (8) /usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(+0x139ec) [0x7fd8dbe8b9ec]



In [None]:
# The flat per-fold lists were filled depth by depth, so reshaping them to
# (number of depths, n_folds) puts one max_depth per row.
grid_shape = (len(max_depths), n_folds)
rmse_grid = np.reshape(scores, grid_shape)
mape_grid = np.reshape(mape_scores, grid_shape)

# Mean over folds for each depth.
scores_ = rmse_grid.mean(axis=1)
mape_scores_ = mape_grid.mean(axis=1)

# Standard deviation over folds for each depth.
scores_std = rmse_grid.std(axis=1)
mape_scores_std = mape_grid.std(axis=1)

In [None]:
# Validation RMSE arranged as one row per max_depth, one column per fold.
val_grid = np.reshape(val_scores, (len(max_depths), n_folds))
val_scores_ = val_grid.mean(axis=1)
val_scores_std = val_grid.std(axis=1)

In [None]:
# Tree counts arranged as one row per max_depth, one column per fold.
ntree_grid = np.reshape(n_estimators, (len(max_depths), n_folds))
n_estimators_ = ntree_grid.mean(axis=1)
n_estimators_std = ntree_grid.std(axis=1)

In [None]:
# Collect the per-depth aggregates into one table: one row per max_depth,
# one column per statistic, with the depth itself in the last column.
df = pd.DataFrame({
    'mape_scores_': mape_scores_,
    'mape_scores_std': mape_scores_std,
    'scores_': scores_,
    'scores_std': scores_std,
    'val_scores_': val_scores_,
    'val_scores_std': val_scores_std,
    'n_estimators_': n_estimators_,
    'n_estimators_std': n_estimators_std,
})

df['max_depth'] = np.array(max_depths)

In [None]:
# Write the aggregated scan results to a CSV in the working directory,
# without the index column.
df.to_csv('grid_search_results_opt.csv', index=False)

In [None]:
# Read the saved scan results back; the following cells take their inputs
# from this frame.
df = pd.read_csv('grid_search_results_opt.csv')

In [None]:
# Pull each statistic out of the results table as its own Series so the
# plotting cells can refer to them by name.
scores_, scores_std = df['scores_'], df['scores_std']
mape_scores_, mape_scores_std = df['mape_scores_'], df['mape_scores_std']
val_scores_, val_scores_std = df['val_scores_'], df['val_scores_std']
n_estimators_, n_estimators_std = df['n_estimators_'], df['n_estimators_std']

# From here on max_depths is the column read from the table.
max_depths = df['max_depth']

In [None]:
# String form of every scanned depth, in scan order.
list_max_depths = [str(depth) for depth in max_depths]

In [None]:
# Spread of the mean RMSE (scaled by 100) divided by the number of scanned depths.
scaled_rmse_range = scores_.max()*100 - scores_.min()*100
drmse = scaled_rmse_range / len(scores_)

In [None]:
# Figure: number of trees (left axis) and test RMSE (right axis) versus the
# maximal tree depth, each with its spread over the folds as error bars.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Left axis: number of trees per depth.
fig.add_trace(
            go.Scatter(
                x = np.array(max_depths),
                y = n_estimators_,
                name = 'Number of trees',
                mode='markers',
                marker=dict(
                    size=scores_*150,
                    color='darkblue',
                    opacity=0.7
                ),
                error_y=dict(
                    type='data',
                    width=10,
                    array=n_estimators_std,
                    visible=True
                ),
            ),
            secondary_y=False
)

# Right axis: RMSE scaled by 1000 to match the "RMSE, KeV" axis title.
# The error bars must use the same factor; the unscaled standard deviation
# would be drawn 1000 times too small relative to the plotted values.
fig.add_trace(
        go.Scatter(
            x = np.array(max_depths),
            y = np.array(scores_)*1000,
            mode='markers',
            name='RMSE',
            marker=dict(
                size=scores_*150,
                symbol='cross',
                color='darkred',
                opacity=0.7
            ),
            error_y=dict(
                type='data',
                width=12,
                array=np.array(scores_std)*1000,
                visible=True
            ),
        ),
        secondary_y=True
)

fig.update_xaxes(
     showline=True,
     title_text="Maximal depth of tree",
     ticks='outside',
     mirror=True,
     linecolor='black',
     gridcolor='grey',
)

# Primary y axis: fixed tick spacing of 500 trees starting at 0.
fig.update_yaxes(
     showline=True,
     title_text="Number of trees",
     secondary_y=False,
     color = 'darkblue',
     tickmode = 'linear',
     tick0 = 0,
     dtick = 500,
     ticks='outside',
     mirror=True,
     linecolor='black',
     gridcolor='grey',
)

# Secondary y axis: grid lines are hidden so only the primary grid is drawn.
fig.update_yaxes(
    showline=True,
    title_text="RMSE, KeV",
    secondary_y=True,
    color = 'darkred',
    tickmode = 'array',
#tickvals = [8.600, 8.625, 8.650, 8.675, 8.700, 8.725, 8.750, 8.775, 8.800],
    ticks='outside',
    mirror=True,
    linecolor='black',
    showgrid=False
)

fig.update_layout(
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 3,
        dtick = 1
    ),
    legend=dict(
        title_font_family="Times New Roman",
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.925,
        bordercolor="Black",
        borderwidth=1,
        font=dict(
            family="Times New Roman",
            color="black"
        ),
    ),
    showlegend=True,
    font=dict(
            family="Times New Roman",
            color='black',
            size=22,
    )
)

# Export a static PDF, then render the figure inline.
pio.write_image(fig, 'plots/BDT_grid_search_opt.pdf', width=1000, height=600, scale=1)
fig.show()

In [None]:
# Figure: number of trees (left axis) and test MAPE (right axis) versus the
# maximal tree depth, each with its spread over the folds as error bars.

# Settings shared by several parts of the figure.
times_font = dict(family="Times New Roman", color="black", size=24)
axis_frame = dict(showline=True, ticks='outside', mirror=True, linecolor='black')
depth_values = np.array(max_depths)

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Left axis: number of trees per depth.
tree_count_trace = go.Scatter(
    x=depth_values,
    y=n_estimators_,
    name='Number of trees',
    mode='markers',
    marker=dict(size=scores_*150, color='darkblue', opacity=0.7),
    error_y=dict(type='data', width=10, array=n_estimators_std, visible=True),
)
fig.add_trace(tree_count_trace, secondary_y=False)

# Right axis: MAPE per depth, in the same units as its error bars.
mape_trace = go.Scatter(
    x=depth_values,
    y=np.array(mape_scores_),
    mode='markers',
    name='MAPE',
    marker=dict(size=mape_scores_*11, symbol='x', color='darkred', opacity=0.7),
    error_y=dict(type='data', width=12, array=mape_scores_std, visible=True),
)
fig.add_trace(mape_trace, secondary_y=True)

fig.update_xaxes(
    title_text="Maximal depth of tree",
    gridcolor='grey',
    **axis_frame
)

# Primary y axis: fixed tick spacing of 500 trees starting at 0.
fig.update_yaxes(
    title_text="Number of trees",
    secondary_y=False,
    color='darkblue',
    tickmode='linear',
    tick0=0,
    dtick=500,
    gridcolor='grey',
    **axis_frame
)

# Secondary y axis: grid lines are hidden so only the primary grid is drawn.
fig.update_yaxes(
    title_text="MAPE, %",
    secondary_y=True,
    color='darkred',
    tickmode='array',
    showgrid=False,
    **axis_frame
)

legend_box = dict(
    title_font_family="Times New Roman",
    yanchor="top",
    y=0.99,
    xanchor="right",
    x=0.925,
    bordercolor="Black",
    borderwidth=1,
    font=dict(times_font),
)
fig.update_layout(
    xaxis=dict(tickmode='linear', tick0=3, dtick=1),
    legend=legend_box,
    showlegend=True,
    font=dict(times_font),
)

# Export a static PDF, then render the figure inline.
pio.write_image(fig, 'plots/BDT_grid_search_mape_opt.pdf', width=1000, height=600, scale=1)
fig.show()