In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import plotly.graph_objs as go
import plotly.io as pio
from plotly.subplots import make_subplots

pio.templates.default = 'plotly_white'

warnings.filterwarnings("ignore")

In [2]:
# Ask where the notebook is running. `vm` is True only for an answer of "yes"
# (case-insensitive, surrounding whitespace ignored); it selects which dataset
# path the loading cell reads from.
vm = input("Read the data from the VM path? [yes/no]: ").strip().lower() == 'yes'

yes


In [3]:
# Resolve the CSV location: absolute cephfs path when `vm` is set, otherwise a
# path relative to the working directory.
if vm:
    path = '/mnt/cephfs/ml_data/mc_2021/processed_data/ProcessedTrainReal/'
    csv_file = '{}ProcessedTrain_1M.csv.gz'.format(path)
else:
    csv_file = 'processed_data/ProcessedTrainReal/ProcessedTrain_1M.csv.gz'

data_real = pd.read_csv(csv_file)

# Keep only the rows whose 'edepR' value is below 17.2.
below_cut = data_real['edepR'] < 17.2
data_real = data_real[below_cut]

In [4]:
# Number of leading feature columns: total column count minus 5.
# NOTE(review): presumably the last 5 columns are targets/metadata; the training
# cell uses column index `n_feats` as the regression target — confirm against
# the CSV schema.
n_feats = data_real.shape[1] - 5

In [5]:
# The 'edep' column, kept separately; it is later sliced into the hold-out
# target `val_en`.
target_energy = data_real.loc[:, 'edep']

In [6]:
# Display the feature count as the cell output (sanity check).
n_feats

91

In [7]:
# Number of rows kept for cross-validation; rows past this index form the hold-out.
size = 10 ** 6

In [8]:
# Positional split: the first `size` rows are used for cross-validation, the
# remaining rows become the hold-out set used as eval_set during fitting.
# NOTE: this cell overwrites `data_real`/`target_energy`, so running it a
# second time leaves `val` and `val_en` empty.
val = data_real.iloc[size:]
val_en = target_energy.iloc[size:]

data_real = data_real.iloc[:size]
target_energy = target_energy.iloc[:size]

In [9]:
import pickle
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both inputs are converted to numpy arrays; the absolute error is divided
    element-wise by `y_true`, averaged, and multiplied by 100.
    """
    truth = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_error = np.abs((truth - predicted) / truth)
    return np.mean(relative_error) * 100

In [10]:
from sklearn.model_selection import KFold
n_folds = 5

# `shuffle` must be passed by keyword: recent scikit-learn releases accept only
# `n_splits` positionally and raise a TypeError otherwise.
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=22)

# Convert the frame to a numpy array once instead of twice per fold.
data_real_arr = np.array(data_real)

# trains[i] / tests[i] hold the full rows (features and remaining columns) of fold i.
trains = []
tests = []
for train_idx, test_idx in kfold.split(data_real_arr):
    trains.append(data_real_arr[train_idx])
    tests.append(data_real_arr[test_idx])

In [11]:
from sklearn.metrics import mean_squared_error

In [12]:
# Per-depth result containers, keyed by max_depth in the grid-search loop.
scores_dict, val_scores_dict = {}, {}
n_estimators_dict, test_scores_dict = {}, {}

In [13]:
# Candidate tree depths for the grid search: 5 through 12 inclusive.
max_depths = range(5, 12 + 1)

In [14]:
# Convert the hold-out rows and their target to numpy arrays so they can be
# column-sliced the same way as the fold arrays.
val, val_en = np.array(val), np.array(val_en)

In [17]:
# Grid search over `max_depths`: for every depth, fit one LGBMRegressor per CV
# fold and record test RMSE, test MAPE, the hold-out metric and the number of
# boosting rounds. The flat lists are filled depth-major (all folds of one
# depth, then the next depth), which the later reshape cells rely on.
scores = []
mape_scores = []
val_scores = []
n_estimators = []

for max_depth in tqdm(max_depths, "Max depths: "):

    scores_dict[max_depth] = []
    val_scores_dict[max_depth] = []
    n_estimators_dict[max_depth] = []

    for i in tqdm(range(len(trains)), "Folds... ", leave=False):
        # The first n_feats columns are features; column n_feats is the target.
        X_train, y_train = trains[i][:, :n_feats], trains[i][:, n_feats]
        X_test, y_test = tests[i][:, :n_feats], tests[i][:, n_feats]

        model = LGBMRegressor(
            max_depth=max_depth,
            learning_rate=0.08,
            n_estimators=3000,
        )

        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # appear to have been removed in LightGBM 4.x in favour of callbacks —
        # confirm the installed version before upgrading.
        model.fit(X_train, y_train, verbose=False,
                  eval_set=[(val[:, :n_feats], val_en)],
                  early_stopping_rounds=5)

        # Predict once and reuse the result for every metric.
        y_pred = model.predict(X_test)

        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        # Ground truth goes first: the error is normalised by the true value,
        # not by the prediction.
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Hold-out 'l2' history recorded during fitting. The last entry and the
        # history length are stored as-is.
        # NOTE(review): this is the 'l2' metric at the final recorded round,
        # whereas `scores` holds RMSE — confirm this is the intended quantity.
        val_history = model.evals_result_['valid_0']['l2']
        last_val_score = val_history[-1]
        n_rounds = len(val_history)

        scores.append(rmse)
        mape_scores.append(mape)
        val_scores.append(last_val_score)
        n_estimators.append(n_rounds)

        n_estimators_dict[max_depth].append(n_rounds)
        val_scores_dict[max_depth].append(last_val_score)
        scores_dict[max_depth].append(rmse)

Max depths:   0%|          | 0/8 [00:00<?, ?it/s]

Folds... :   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# View the flat per-fit lists as (depth, fold) grids, then take the mean and
# standard deviation across folds for each depth.
grid_shape = (len(max_depths), n_folds)
rmse_grid = np.array(scores).reshape(grid_shape)
mape_grid = np.array(mape_scores).reshape(grid_shape)

scores_, mape_scores_ = rmse_grid.mean(axis=1), mape_grid.mean(axis=1)
scores_std, mape_scores_std = rmse_grid.std(axis=1), mape_grid.std(axis=1)

In [None]:
# Per-depth mean and standard deviation of the hold-out metric across folds.
val_grid = np.array(val_scores).reshape((len(max_depths), n_folds))
val_scores_, val_scores_std = val_grid.mean(axis=1), val_grid.std(axis=1)

In [None]:
# Per-depth mean and standard deviation of the recorded number of boosting rounds.
rounds_grid = np.array(n_estimators).reshape((len(max_depths), n_folds))
n_estimators_, n_estimators_std = rounds_grid.mean(axis=1), rounds_grid.std(axis=1)

In [None]:
# One row per tree depth, one column per aggregated statistic.
df = pd.DataFrame({
    'mape_scores_': mape_scores_,
    'mape_scores_std': mape_scores_std,
    'scores_': scores_,
    'scores_std': scores_std,
    'val_scores_': val_scores_,
    'val_scores_std': val_scores_std,
    'n_estimators_': n_estimators_,
    'n_estimators_std': n_estimators_std,
})

df['max_depth'] = np.array(max_depths)

In [None]:
# Persist the grid-search summary so the plots can be rebuilt without retraining.
results_csv = 'grid_search_results_lgbm.csv'
df.to_csv(results_csv, index=False)

In [None]:
# Reload the saved grid-search summary from disk.
df = pd.read_csv(filepath_or_buffer='grid_search_results_lgbm.csv')

In [None]:
# Rebind the plotting variables to the columns of the results frame.
# Note that `max_depths` becomes a pandas Series here.
scores_, scores_std = df['scores_'], df['scores_std']
mape_scores_, mape_scores_std = df['mape_scores_'], df['mape_scores_std']
val_scores_, val_scores_std = df['val_scores_'], df['val_scores_std']
n_estimators_, n_estimators_std = df['n_estimators_'], df['n_estimators_std']

max_depths = df['max_depth']

In [None]:
# Tree depths as strings, in order.
list_max_depths = [str(depth) for depth in max_depths]

In [None]:
# Spread of the mean RMSE (scaled by 100) divided by the number of depths.
rmse_hi = scores_.max() * 100
rmse_lo = scores_.min() * 100
drmse = (rmse_hi - rmse_lo) / len(scores_)

In [None]:
# Grid-search summary figure: number of trees (left axis) and RMSE (right axis)
# versus maximal tree depth. Both traces use a marker size proportional to the
# mean RMSE.
depth_axis = np.array(max_depths)
rmse_marker_size = scores_ * 200

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(
        x=depth_axis,
        y=n_estimators_,
        name='Number of trees',
        mode='markers',
        marker=dict(size=rmse_marker_size, color='darkblue', opacity=0.75),
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=depth_axis,
        # Scaled by 1000 to match the axis title below.
        # NOTE(review): presumably converts MeV to keV — confirm the unit of `scores_`.
        y=np.array(scores_) * 1000,
        name='RMSE',
        mode='markers',
        marker=dict(size=rmse_marker_size, symbol=3, color='darkred', opacity=0.75),
    ),
    secondary_y=True,
)

fig.update_xaxes(showline=True, title_text="Maximal depth of tree",
                 ticks='outside', mirror=True, linecolor='black')
fig.update_yaxes(showline=True, title_text="Number of trees", secondary_y=False,
                 color='darkblue', tickmode='linear', tick0=0, dtick=250,
                 ticks='outside', mirror=True, linecolor='black')
fig.update_yaxes(showline=True, title_text="RMSE, KeV", secondary_y=True,
                 color='darkred', tickmode='array',
                 ticks='outside', mirror=True, linecolor='black', showgrid=False)

fig.update_layout(
    xaxis=dict(tickmode='linear', tick0=3, dtick=1),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.925,
        bordercolor="Black",
        borderwidth=1,
    ),
    showlegend=True,
    font=dict(size=15),
)

# Make sure the output directory exists; otherwise the export raises and the
# figure is never shown.
out_path = 'plots/BDT_grid_search_lgbm.pdf'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
pio.write_image(fig, out_path, width=950, height=600, scale=1)

fig.show()

In [None]:
# Grid-search summary figure: number of trees (left axis) and MAPE (right axis)
# versus maximal tree depth. The tree-count markers are sized by the mean RMSE,
# the MAPE markers by the mean MAPE.
depth_axis = np.array(max_depths)

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(
        x=depth_axis,
        y=n_estimators_,
        name='Number of trees',
        mode='markers',
        marker=dict(size=scores_ * 200, color='darkblue', opacity=0.75),
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=depth_axis,
        y=np.array(mape_scores_),
        name='MAPE',
        mode='markers',
        marker=dict(size=mape_scores_ * 15, symbol=3, color='darkred', opacity=0.75),
    ),
    secondary_y=True,
)

fig.update_xaxes(showline=True, title_text="Maximal depth of tree",
                 ticks='outside', mirror=True, linecolor='black')
fig.update_yaxes(showline=True, title_text="Number of trees", secondary_y=False,
                 color='darkblue', tickmode='linear', tick0=0, dtick=250,
                 ticks='outside', mirror=True, linecolor='black')
fig.update_yaxes(showline=True, title_text="MAPE", secondary_y=True,
                 color='darkred', tickmode='array',
                 ticks='outside', mirror=True, linecolor='black', showgrid=False)

fig.update_layout(
    xaxis=dict(tickmode='linear', tick0=3, dtick=1),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.925,
        bordercolor="Black",
        borderwidth=1,
    ),
    showlegend=True,
    font=dict(size=15),
)

# Make sure the output directory exists; otherwise the export raises and the
# figure is never shown.
out_path = 'plots/BDT_grid_search_mape_lgbm.pdf'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
pio.write_image(fig, out_path, width=950, height=600, scale=1)

fig.show()