In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Ask whether the notebook runs on the VM; this decides which data root is used
# when the CSV is loaded further down. Only the answer "yes" (any letter case,
# surrounding whitespace ignored) enables VM mode; anything else disables it.
vm = input("Running on the VM? (yes/no): ").strip().lower() == 'yes'

yes


In [3]:
# Ask whether the all-features baseline model should be (re)trained. Only the
# answer "yes" (any letter case, surrounding whitespace ignored) enables
# training; anything else skips the training block and relies on the metrics
# file already saved on disk.
train_mode = input("Retrain the all-features baseline? (yes/no): ").strip().lower() == 'yes'

no


In [4]:
# Data root: the shared filesystem mount when running on the VM, otherwise the
# current working directory (empty prefix).
path = '/mnt/cephfs/ml_data/mc_2021/' if vm else ''

# Load the processed training table and keep only rows with edepR below 17.2.
data_real = pd.read_csv(
    '{}processed_data/ProcessedTrainReal/ProcessedTrain_1M.csv.gz'.format(path)
)
data_real = data_real.loc[data_real['edepR'] < 17.2]

In [5]:
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Parameters
    ----------
    y_true : array-like
        Reference values; each error is normalised by the matching entry.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    There is no guard against zeros in ``y_true``; such entries divide by
    zero and propagate inf/nan into the result.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return relative_error.mean() * 100

In [6]:
# Split the table positionally: the first `size` rows are used for training /
# cross-validation, everything after them is the held-out validation set used
# for early stopping.
size = 1_000_000

# The last five columns are not used as features; the column at position -5 is
# the regression target (the remaining four are not used in this notebook).
n_feats = data_real.shape[1] - 5

X_val = data_real.iloc[size:, :-5]
y_val = data_real.iloc[size:, -5]

# NOTE: `data_real` is truncated in place, so re-running this cell without
# reloading the data leaves the validation set empty.
data_real = data_real.iloc[:size]

In [7]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

if train_mode:
    # Baseline: 5-fold cross-validation of an XGBoost regressor trained on ALL
    # feature columns. The resulting [mean, std] of MAPE (row 0) and RMSE
    # (row 1) are saved and later used as the stopping reference for the
    # forward feature selection.
    import os

    n_folds = 5
    rmse_scores = []
    mape_scores = []

    # Convert the frames to arrays once instead of four times per fold.
    data_arr = np.array(data_real)
    X_val_arr = np.array(X_val)
    y_val_arr = np.array(y_val)

    # `shuffle` must be passed by keyword: recent scikit-learn releases make
    # every KFold argument after n_splits keyword-only.
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=22)
    for train, test in tqdm(kfold.split(data_arr), "Folds... ", total=n_folds, leave=False):
        # Same hyper-parameters and seed as the model in the selection loop.
        xgbreg = XGBRegressor(
                max_depth=9,
                learning_rate=0.08,
                n_estimators=3000,
                random_state=22,
        )

        # Columns [0, n_feats) are features, column n_feats is the target.
        X_train = data_arr[train, :n_feats]
        y_train = data_arr[train, n_feats]

        X_test = data_arr[test, :n_feats]
        y_test = data_arr[test, n_feats]

        # Early stopping is monitored on the held-out validation rows.
        # NOTE(review): newer xgboost releases expect `early_stopping_rounds`
        # in the constructor rather than in fit() - confirm against the
        # installed version.
        xgbreg.fit(X_train, y_train,
                   verbose=True,
                   eval_set=[(X_val_arr, y_val_arr)],
                   early_stopping_rounds=5)

        y_predict = xgbreg.predict(X_test)
        rmse = mean_squared_error(y_test, y_predict)**0.5
        # Argument order is (y_true, y_pred): the error must be normalised by
        # the true value, matching the 'neg_mean_absolute_percentage_error'
        # scorer used during feature selection.
        mape = mean_absolute_percentage_error(y_test, y_predict)
        rmse_scores.append(rmse)
        mape_scores.append(mape)

    result = np.array([[np.mean(mape_scores), np.std(mape_scores)], [np.mean(rmse_scores), np.std(rmse_scores)]])
    # Make sure the output directory exists so a long training run is not lost
    # to a FileNotFoundError at save time.
    os.makedirs('feature_selection', exist_ok=True)
    np.savez_compressed('feature_selection/all_features_metrics.npz', a=result)

In [8]:
# Load the saved baseline metrics: row 0 is [mean, std] of MAPE, row 1 is
# [mean, std] of RMSE. The context manager closes the .npz archive once the
# array has been read; `allow_pickle` is not needed for a plain float array.
with np.load('feature_selection/all_features_metrics.npz') as metrics_archive:
    all_features_metrics = metrics_archive['a']
all_features_metrics

array([[1.17148284e+00, 1.04254392e-03],
       [8.24929407e-02, 2.95541135e-04]])

In [None]:
# Greedy forward feature selection.
#
# Each round tries every remaining feature in turn: it is appended to the
# already-selected set, an XGBoost regressor is 5-fold cross-validated, and
# the feature with the lowest mean MAPE is kept. Rounds repeat until the best
# MAPE is within `eps` of the all-features baseline, or no candidates remain.

# Row 0 of the saved baseline array is [mean MAPE, std MAPE]; the std is used
# as the stopping tolerance. The archive is opened once and closed afterwards.
with np.load('feature_selection/all_features_metrics.npz') as baseline_archive:
    all_features_metric, eps = baseline_archive['a'][0]

# Resume state: features already selected in earlier runs with their scores.
# Set all three lists to [] to start the selection from scratch.
opt_features = ['AccumCharge', 'R_cht', 'jacob_cc', 'pe_std', 'nPMTs', 'R_cc', 'ht_std']#[]
current_metrics = [3.8076375114186476, 1.6799966127524213, 1.381430894229591, 1.2331565442044647, 1.2229199895455005, 1.2102288938739276, 1.2002026796670915]#[]
current_metric_stds = [0.004450264146659087, 0.0022321956920953352, 0.0025158649260755114, 0.0019016478876169257, 0.0022581303993379166, 0.002170871162578412, 0.0016240114340703806]#[]
# Start from the last recorded score when resuming; otherwise use a sentinel
# large enough to guarantee that the first round runs.
current_metric = current_metrics[-1] if current_metrics else 100

# Loop-invariant inputs: the feature columns (everything but the last five)
# and the target (column at position -5).
feature_frame = data_real.iloc[:, :-5]
y = data_real.iloc[:, -5]

features = feature_frame.columns
features = features.drop(opt_features)
# The emptiness check prevents np.argmin([]) from raising once every feature
# has been selected without reaching the baseline tolerance.
while len(features) > 0 and abs(all_features_metric - current_metric) > eps:
    metrics = []
    metric_stds = []
    for feature in tqdm(features, "Features loop"):
        candidate_features = opt_features + [feature]

        X = feature_frame[candidate_features]

        xgbreg = XGBRegressor(
            max_depth=9,
            learning_rate=0.08,
            n_estimators=3000,
            random_state=22,
        )

        # NOTE(review): newer scikit-learn releases rename `fit_params` to
        # `params` - confirm against the installed version.
        scores = cross_val_score(
            xgbreg,
            X,
            y,
            cv=5,
            n_jobs=-1,
            verbose=False,
            fit_params={
                'eval_set': [(X_val[candidate_features], y_val)],
                'early_stopping_rounds':5
            },
            scoring='neg_mean_absolute_percentage_error'
        )

        # The scorer returns the negated MAPE as a fraction; convert it to a
        # positive percentage.
        metric = -100*scores.mean()
        metric_std = (100*scores).std()
        metrics.append(metric)
        metric_stds.append(metric_std)

        print(metrics)
        print(metric_stds)
        print(feature)

    # Keep the candidate with the lowest MAPE; `metrics` is aligned with
    # `features`, so the same index addresses both.
    best_metric_ind = np.argmin(metrics)
    current_metric = metrics[best_metric_ind]
    current_metrics.append(current_metric)

    current_metric_std = metric_stds[best_metric_ind]
    current_metric_stds.append(current_metric_std)

    opt_features.append(features[best_metric_ind])
    features = features.drop(features[best_metric_ind])

    print(current_metrics)
    print(current_metric_stds)
    print(opt_features)

    # Checkpoint after every round so an interrupted run can be resumed.
    np.savez_compressed('feature_selection/opt_features.npz', a=np.array(opt_features))
    np.savez_compressed('feature_selection/current_metrics.npz', a=np.array(current_metrics))
    np.savez_compressed('feature_selection/current_metric_stds.npz', a=np.array(current_metric_stds))

Features loop:   1%|          | 1/99 [12:46<20:51:24, 766.17s/it]

[1.200005862078454]
[0.0027734935534794378]
pho_cc


Features loop:   2%|▏         | 2/99 [29:27<24:22:13, 904.47s/it]

[1.200005862078454, 1.2000146802493408]
[0.0027734935534794378, 0.0022500933337003975]
x_cc


Features loop:   3%|▎         | 3/99 [46:59<25:54:49, 971.77s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655]
y_cc


Features loop:   4%|▍         | 4/99 [1:02:57<25:30:21, 966.54s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353]
z_cc


Features loop:   5%|▌         | 5/99 [1:19:55<25:43:10, 985.01s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716]
gamma_z_cc


Features loop:   6%|▌         | 6/99 [1:38:02<26:20:42, 1019.81s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645]
gamma_y_cc


Features loop:   7%|▋         | 7/99 [1:56:16<26:40:33, 1043.84s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803]
gamma_x_cc


Features loop:   8%|▊         | 8/99 [2:15:46<27:24:11, 1084.08s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068]
theta_cc


Features loop:   9%|▉         | 9/99 [2:33:37<26:59:45, 1079.84s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111, 1.1993593175990853]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068, 0.002398208085844706]
phi_cc


Features loop:  10%|█         | 10/99 [2:50:57<26:23:47, 1067.72s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111, 1.1993593175990853, 1.1998967035560315]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068, 0.002398208085844706, 0.0021074437207745]
sin_theta_cc


Features loop:  11%|█         | 11/99 [3:10:45<26:59:57, 1104.52s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111, 1.1993593175990853, 1.1998967035560315, 1.197910835591998]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068, 0.002398208085844706, 0.0021074437207745, 0.0017468101087806036]
cos_theta_cc


Features loop:  12%|█▏        | 12/99 [3:29:43<26:56:24, 1114.76s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111, 1.1993593175990853, 1.1998967035560315, 1.197910835591998, 1.199484011520259]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068, 0.002398208085844706, 0.0021074437207745, 0.0017468101087806036, 0.002306161297060108]
sin_phi_cc


Features loop:  13%|█▎        | 13/99 [3:45:10<25:16:18, 1057.89s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111, 1.1993593175990853, 1.1998967035560315, 1.197910835591998, 1.199484011520259, 1.1995963847725926]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068, 0.002398208085844706, 0.0021074437207745, 0.0017468101087806036, 0.002306161297060108, 0.0016862551236887762]
cos_phi_cc


Features loop:  14%|█▍        | 14/99 [3:59:00<23:21:09, 989.05s/it] 

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111, 1.1993593175990853, 1.1998967035560315, 1.197910835591998, 1.199484011520259, 1.1995963847725926, 1.1996965398245139]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068, 0.002398208085844706, 0.0021074437207745, 0.0017468101087806036, 0.002306161297060108, 0.0016862551236887762, 0.0018897535214832818]
pho_cht


Features loop:  15%|█▌        | 15/99 [4:12:15<21:42:45, 930.54s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111, 1.1993593175990853, 1.1998967035560315, 1.197910835591998, 1.199484011520259, 1.1995963847725926, 1.1996965398245139, 1.2004480137317408]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068, 0.002398208085844706, 0.0021074437207745, 0.0017468101087806036, 0.002306161297060108, 0.0016862551236887762, 0.0018897535214832818, 0.0027675848714792434]
x_cht


Features loop:  16%|█▌        | 16/99 [4:25:01<20:18:29, 880.84s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111, 1.1993593175990853, 1.1998967035560315, 1.197910835591998, 1.199484011520259, 1.1995963847725926, 1.1996965398245139, 1.2004480137317408, 1.2009901653846446]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068, 0.002398208085844706, 0.0021074437207745, 0.0017468101087806036, 0.002306161297060108, 0.0016862551236887762, 0.0018897535214832818, 0.0027675848714792434, 0.0020457874720896953]
y_cht


Features loop:  17%|█▋        | 17/99 [4:38:18<19:29:19, 855.60s/it]

[1.200005862078454, 1.2000146802493408, 1.19984617175303, 1.199521059966139, 1.1988511143085019, 1.199482633287603, 1.1997023044682298, 1.1983052761007111, 1.1993593175990853, 1.1998967035560315, 1.197910835591998, 1.199484011520259, 1.1995963847725926, 1.1996965398245139, 1.2004480137317408, 1.2009901653846446, 1.1997561980629832]
[0.0027734935534794378, 0.0022500933337003975, 0.0019586406846201655, 0.002537239461298353, 0.0020221414255399716, 0.0020281378455875645, 0.002126423004061803, 0.0022986819650009068, 0.002398208085844706, 0.0021074437207745, 0.0017468101087806036, 0.002306161297060108, 0.0016862551236887762, 0.0018897535214832818, 0.0027675848714792434, 0.0020457874720896953, 0.0014079715919168123]
z_cht


In [None]:
# Write the current selection state to disk (same files the selection loop
# writes after each round).
np.savez_compressed(
    'feature_selection/opt_features.npz',
    a=np.asarray(opt_features),
)
np.savez_compressed(
    'feature_selection/current_metrics.npz',
    a=np.asarray(current_metrics),
)
np.savez_compressed(
    'feature_selection/current_metric_stds.npz',
    a=np.asarray(current_metric_stds),
)

# Reset the in-memory state to the hard-coded seven-feature checkpoint. This
# happens AFTER the save above, so the files keep whatever state was in memory
# before the reset.
opt_features = [
    'AccumCharge', 'R_cht', 'jacob_cc', 'pe_std', 'nPMTs', 'R_cc', 'ht_std',
]

current_metrics = [
    3.8076375114186476,
    1.6799966127524213,
    1.381430894229591,
    1.2331565442044647,
    1.2229199895455005,
    1.2102288938739276,
    1.2002026796670915,
]

current_metric_stds = [
    0.004450264146659087,
    0.0022321956920953352,
    0.0025158649260755114,
    0.0019016478876169257,
    0.0022581303993379166,
    0.002170871162578412,
    0.0016240114340703806,
]