In [1]:
import neptune.new as neptune

# Open a Neptune run in the 'arsde/J22TAO' project; later cells log the
# parameters, metrics and plots through this `run` handle.
# NOTE(review): no visible cell calls run.stop() — stop the run once logging is
# finished, as the Neptune notice printed below this cell recommends.
run = neptune.init(project='arsde/J22TAO')

https://app.neptune.ai/arsde/J22TAO/e/JTAO-63


Info (NVML): RM has detected an NVML/RM version mismatch.. GPU usage metrics may not be reported. For more information, see https://docs.neptune.ai/you-should-know/what-can-you-log-and-display#hardware-consumption


Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below —
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings("ignore")

In [3]:
# Switch for the forward feature-selection cell further down: the search is
# only executed when this flag is True.
features_selection = True

In [4]:
# Run configuration; the whole dict is logged to Neptune under 'PARAMS'.
PARAMS = dict(
    FC_cut=0.65,                      # events are kept only if edepR is below this value
    number_of_events=1_150_000,       # rows kept after the cut (training + validation)
    random_state=22,                  # seed for the shuffle, the K-fold split and XGBoost
    max_depth=9,                      # max_depth passed to XGBRegressor
    early_stopping_rounds=10,         # patience for boosting and for the feature search
    n_folds=10,                       # number of folds in the cross-validation
    training_dataset_size=1_000_000,  # leading rows used for training; the rest validate
)

run['PARAMS'] = PARAMS

In [5]:
# Root directory of the input data; the processed training table is read from
# it into `data`.
path = "/mnt/cephfs/ml_data/TAO_detsim_J22/"
data = pd.read_csv(path + 'processed_data/ProcessedTrain/ProcessedTrain.csv.gz')

In [6]:
from sklearn.utils import shuffle

# Unpack the run settings into the plain names used by the cells below.
FC_cut, N = PARAMS['FC_cut'], PARAMS['number_of_events']
random_state, max_depth = PARAMS['random_state'], PARAMS['max_depth']
early_stopping_rounds, n_folds = PARAMS['early_stopping_rounds'], PARAMS['n_folds']
size = PARAMS['training_dataset_size']

# Shuffle reproducibly and renumber the rows, then keep only the events whose
# edepR is below the cut and truncate to the first N survivors.
data = shuffle(data, random_state=random_state).reset_index(drop=True)
data = data.loc[data['edepR'] < FC_cut].iloc[:N]
data.head()

Unnamed: 0,AccumCharge,nPMTs,R_cc,rho_cc,x_cc,y_cc,z_cc,gamma_z_cc,gamma_y_cc,gamma_x_cc,...,pe_75p,pe_80p,pe_85p,pe_90p,pe_95p,edep,edepX,edepY,edepZ,edepR
0,35482.0,3999.0,0.365263,0.36429,0.338427,0.134812,-0.026646,-0.073146,0.397121,2.462711,...,10.0,12.0,15.0,18.0,27.0,8.520335,0.468565,0.200681,-0.033771,0.510849
1,26449.0,3881.0,0.436751,0.369734,0.056378,-0.36541,-0.232483,-0.628785,-1.527498,0.130175,...,7.0,9.0,11.0,15.0,23.0,6.394049,0.079346,-0.524534,-0.34303,0.631744
3,6304.0,2817.0,0.32889,0.312968,-0.125881,-0.286536,-0.101091,-0.323008,-1.774786,-0.414293,...,3.0,3.0,4.0,4.0,6.0,1.668662,-0.164486,-0.40859,-0.155663,0.467154
9,13865.0,3606.0,0.347917,0.191859,0.088948,-0.169994,-0.290236,-1.512758,-0.560001,0.264448,...,5.0,5.0,7.0,8.0,11.0,3.526859,0.123386,-0.231287,-0.4261,0.500279
10,40302.0,4007.0,0.368905,0.364372,0.19219,0.309564,-0.057652,-0.158224,1.542799,0.610346,...,12.0,14.0,17.0,22.0,30.0,9.680871,0.268719,0.425487,-0.085745,0.510491


In [7]:
from sklearn.metrics import mean_squared_error
from neptune.new.types import File
import plotly.express as px
import plotly.io as pio
# Make plotly's 'plotly_white' template the default for figures created below.
pio.templates.default = 'plotly_white'

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both arguments are converted to numpy arrays. The error of each element
    is |y_true - y_pred| / |y_true|, so entries of ``y_true`` equal to zero
    produce inf/nan in the result.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

def plot_results(df, break_flag=False):
    """Upload an animated MAPE-vs-added-feature scatter plot to Neptune.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Added feature' (x axis), 'MAPE, %' (y axis),
        'current_metric_stds' (y error bars) and 'Number of features'
        (one animation frame per distinct value).
    break_flag : bool, default False
        When True, a dashed dark-red vertical line is drawn at the feature
        with the lowest MAPE among the rows that share the
        'Number of features' value of the last row of ``df``.

    Returns
    -------
    None
        The figure is uploaded as HTML to ``run["plot_results_anim"]``.
    """
    fig = px.scatter(
        df,
        x="Added feature",
        y="MAPE, %",
        error_y='current_metric_stds',
        animation_frame="Number of features",
    )

    if break_flag:
        # Use the frame that was passed in (not a notebook global) and keep
        # only the rows belonging to the most recent animation frame.
        last_count = df['Number of features'].iloc[-1]
        df_es = df[df['Number of features'] == last_count]
        # argmin on the raw values is positional regardless of the row labels
        # of df, which is what the .iloc lookup below needs.
        ind = int(np.argmin(df_es['MAPE, %'].to_numpy()))

        fig.add_vline(
            x=df_es['Added feature'].iloc[ind],
            line=dict(
                dash='dash',
                width=4,
                color='darkred',
            ),
            opacity=0.75
        )

    fig.update_traces(
        marker=dict(
            color='black'
        )
    )

    # Styling shared by both axes: boxed frame, outside ticks, thin grey grid.
    axis_style = dict(
        showline=True,
        ticks='outside',
        mirror=True,
        linecolor='black',
        showgrid=True,
        gridcolor='grey',
        gridwidth=0.25,
    )

    fig.update_layout(
        xaxis=dict(axis_style),
        # The y axis additionally starts its ticks at 0 and shows the zero line.
        yaxis=dict(
            axis_style,
            tick0=0,
            zeroline=True,
            zerolinecolor='black',
            zerolinewidth=0.25
        ),
        font=dict(
            family="Times New Roman",
            size=16,
            color="Black"
        )
    )

    run["plot_results_anim"].upload(File.as_html(fig))

In [8]:
# The last five columns are excluded from the features; the first of them
# (position -5) is the regression target. Per the head() output above these
# columns appear to be edep, edepX, edepY, edepZ, edepR.
n_feats = data.shape[1] - 5

# Rows from position `size` onwards become the validation set; the leading
# `size` rows stay in `data` for training.
# NOTE(review): not idempotent — re-running this cell slices the already
# truncated `data` again and leaves the validation set empty.
holdout = data.iloc[size:]
X_val, y_val = holdout.iloc[:, :-5], holdout.iloc[:, -5]
data = data.iloc[:size]

In [None]:
import os

from xgboost import XGBRegressor
from sklearn.model_selection import KFold

if features_selection:
    # Greedy forward feature selection.
    #
    # Each pass of the outer loop tries every not-yet-selected feature together
    # with the features chosen so far, scores the combination by the mean MAPE
    # over a K-fold split of `data` (XGBoost is early-stopped on X_val/y_val),
    # and permanently adds the feature with the lowest mean MAPE. The search
    # stops once the best MAPE has not improved for `early_stopping_rounds`
    # consecutive additions; those trailing additions are then discarded.
    #
    # NOTE(review): `tree_method='gpu_hist'` and passing
    # `early_stopping_rounds` to fit() are deprecated in newer XGBoost
    # releases — confirm against the installed version.

    # The checkpoints below are written into this directory; create it up front
    # so the first (expensive) pass cannot fail on a missing folder.
    os.makedirs('feature_selection', exist_ok=True)

    def _save_selection(selected, metric_means, metric_spreads, fold_scores):
        """Checkpoint the selection state as feature_selection/*.npz (key 'a')."""
        np.savez_compressed('feature_selection/opt_features.npz', a=np.array(selected))
        np.savez_compressed('feature_selection/current_metrics.npz', a=np.array(metric_means))
        np.savez_compressed('feature_selection/current_metric_stds.npz', a=np.array(metric_spreads))
        np.savez_compressed('feature_selection/raw_mape_scores.npz', a=np.array(fold_scores))

    opt_features = []
    current_metrics = []
    current_metric_stds = []
    raw_mape_scores = []

    # Candidate columns and the target are fixed for the whole search, so they
    # are sliced once instead of once per fold.
    feature_frame = data.iloc[:, :-5]
    y = data.iloc[:, -5]

    features = feature_frame.columns
    features = features.drop(opt_features)

    # With an integer random_state every call to split() yields the same folds,
    # so a single splitter can be reused for all candidates.
    kfold = KFold(n_folds, shuffle=True, random_state=random_state)

    plot_frames = []  # one frame per pass; concatenated for the animation
    df_plot = pd.DataFrame()

    # Stop when no candidates remain: np.argmin on an empty metrics list would
    # raise. In that case the state saved by the last pass is kept untruncated.
    while len(features) > 0:

        mape_scores_list = []
        metrics = []
        metric_stds = []

        for feature in tqdm(features, "Features loop"):
            candidate_columns = opt_features + [feature]
            X = feature_frame[candidate_columns]
            X_val_candidate = X_val[candidate_columns]
            mape_scores = []

            # split() only needs the number of rows, so the frame is passed
            # directly instead of materialising a numpy copy of `data`.
            for train, test in tqdm(kfold.split(X), "Folds... ",
                                    total=n_folds, leave=False):
                xgbreg = XGBRegressor(
                    max_depth=max_depth,
                    learning_rate=0.08,
                    n_estimators=10000,
                    random_state=random_state,
                    tree_method='gpu_hist',
                )

                X_train = X.iloc[train, :]
                y_train = y.iloc[train]

                X_test = X.iloc[test, :]
                y_test = y.iloc[test]

                # The separate validation set only drives early stopping; the
                # score is computed on the held-out fold.
                xgbreg.fit(X_train, y_train,
                           verbose=False,
                           eval_set=[(X_val_candidate, y_val)],
                           early_stopping_rounds=early_stopping_rounds)

                y_predict = xgbreg.predict(X_test)
                mape = mean_absolute_percentage_error(y_test, y_predict)
                mape_scores.append(mape)

            metric = np.mean(mape_scores)
            metric_std = np.std(mape_scores)
            metrics.append(metric)
            metric_stds.append(metric_std)
            mape_scores_list.append(mape_scores)

        # Promote the candidate with the lowest mean MAPE of this pass.
        best_metric_ind = np.argmin(metrics)
        current_metric = metrics[best_metric_ind]
        current_metrics.append(current_metric)

        current_metric_std = metric_stds[best_metric_ind]
        current_metric_stds.append(current_metric_std)

        raw_mape_scores.append(mape_scores_list[best_metric_ind])

        opt_features.append(features[best_metric_ind])
        features = features.drop(features[best_metric_ind])

        print(current_metrics)
        print(current_metric_stds)
        print(opt_features)

        # Checkpoint after every pass so an interrupted search keeps its progress.
        _save_selection(opt_features, current_metrics, current_metric_stds, raw_mape_scores)

        run['opt_features'] = opt_features
        run['current_metrics'] = current_metrics
        run['current_metric_stds'] = current_metric_stds
        run['raw_mape_scores'] = raw_mape_scores

        # One animation frame per pass: the full selection history so far,
        # tagged with the current number of selected features.
        df = pd.DataFrame({
            'Added feature': opt_features,
            'MAPE, %': current_metrics,
            'current_metric_stds': current_metric_stds,
            'Number of features': len(opt_features),
        })
        plot_frames.append(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
        # rows, row labels and column order.
        df_plot = pd.concat(plot_frames)

        plot_results(df_plot)

        # Early stopping of the search: the latest MAPE is worse than the best
        # one and the best one is at least `early_stopping_rounds` passes old.
        cond1 = current_metrics[-1] > np.min(current_metrics)
        cond2 = len(current_metrics)-1 - np.argmin(current_metrics) >= early_stopping_rounds

        if cond1 and cond2:
            # Drop the additions made since the best score.
            opt_features = opt_features[:-early_stopping_rounds]
            current_metrics = current_metrics[:-early_stopping_rounds]
            current_metric_stds = current_metric_stds[:-early_stopping_rounds]
            raw_mape_scores = raw_mape_scores[:-early_stopping_rounds]

            _save_selection(opt_features, current_metrics, current_metric_stds, raw_mape_scores)

            plot_results(df_plot, break_flag=True)
            break

Features loop:   0%|          | 0/91 [00:00<?, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

Folds... : 0it [00:00, ?it/s]

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots


def _load_saved_array(name):
    """Return array 'a' from feature_selection/<name>.npz and close the archive.

    NOTE(review): allow_pickle=True lets a tampered file execute arbitrary
    code on load. It is kept so existing checkpoints stay readable; drop it if
    the saved arrays are confirmed not to be of object dtype.
    """
    with np.load(f'feature_selection/{name}.npz', allow_pickle=True) as archive:
        return archive['a']


# Reload the selection history checkpointed by the feature-selection cell.
current_metrics = _load_saved_array('current_metrics')
opt_features = _load_saved_array('opt_features')
current_metric_stds = _load_saved_array('current_metric_stds')
ind = np.argmin(current_metrics)  # position of the lowest MAPE

fig = go.Figure()

# MAPE after each added feature, with the stored spread as error bars.
fig.add_trace(
    go.Scatter(
        x=opt_features,
        y=current_metrics,
        error_y=dict(
            type='data',
            width=10,
            array=current_metric_stds
        ),
        marker=dict(
            color='black'
        )
    )
)

# Mark the feature at which the lowest MAPE is reached.
fig.add_vline(
    x=opt_features[ind],
    line=dict(
        dash='dash',
        width=4,
        color='darkred',
    ),
    opacity=0.75
)

fig.update_yaxes(title='MAPE, %')
fig.update_xaxes(title='Added feature')

# Styling shared by both axes: boxed frame, outside ticks, thin grey grid.
axis_style = dict(
    showline=True,
    ticks='outside',
    mirror=True,
    linecolor='black',
    showgrid=True,
    gridcolor='grey',
    gridwidth=0.25,
)

fig.update_layout(
    xaxis=dict(axis_style),
    # The y axis additionally starts its ticks at 0 and shows the zero line.
    yaxis=dict(
        axis_style,
        tick0=0,
        zeroline=True,
        zerolinecolor='black',
        zerolinewidth=0.25
    ),
    font=dict(
        family="Times New Roman",
        size=16,
        color="Black"
    )
)

# Show inline, export a PDF copy and upload the interactive HTML to Neptune.
fig.show()
pio.write_image(fig, 'feature_selection/feature_selection_es_BDT.pdf',
                width=950, height=600)
run["plot_results"].upload(File.as_html(fig))