In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
vm = input()
if vm.lower()=='yes':
    vm=True
else:
    vm=False

yes


In [3]:
train_mode = input()
if train_mode.lower()=='yes':
    train_mode=True
else:
    train_mode=False

no


In [4]:
if vm:
    path='/mnt/cephfs/ml_data/mc_2021/'
else:
    path=''
    
data_real = pd.read_csv('{}processed_data/ProcessedTrainReal/ProcessedTrain_1M.csv.gz'.format(path))
data_real = data_real[data_real['edepR'] < 17.2]

In [5]:
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [6]:
size = int(1e6)
n_feats = len(data_real.columns) - 5

X_val = data_real.iloc[:, :-5][size:]
y_val = data_real.iloc[:, -5][size:]
data_real = data_real[:size]

In [7]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

if train_mode:    
    n_folds = 5
    rmse_scores = []
    mape_scores = []
    
    kfold = KFold(n_folds, True, random_state=22)    
    for train, test in tqdm(kfold.split(data_real), "Folds... ", leave=False):        
        xgbreg = XGBRegressor(
                max_depth=9,
                learning_rate=0.08,
                n_estimators=3000,
        )
                            
        X_train = np.array(data_real)[train][:, :n_feats]
        y_train = np.array(data_real)[train][:, n_feats]
                            
        X_test = np.array(data_real)[test][:, :n_feats]
        y_test = np.array(data_real)[test][:, n_feats]

        xgbreg.fit(X_train, y_train,
                   verbose=True,
                   eval_set=[(np.array(X_val), np.array(y_val))],
                   early_stopping_rounds=5)
        
        y_predict = xgbreg.predict(X_test)
        rmse = mean_squared_error(y_predict, y_test)**0.5
        mape = mean_absolute_percentage_error(y_predict, y_test)
        rmse_scores.append(rmse)
        mape_scores.append(mape)
    
    result = np.array([[np.mean(mape_scores), np.std(mape_scores)], [np.mean(rmse_scores), np.std(rmse_scores)]])
    np.savez_compressed('feature_selection/all_features_metrics.npz', a=result)

In [8]:
all_features_metrics = np.load('feature_selection/all_features_metrics.npz', allow_pickle=True)['a']
all_features_metrics

array([[1.17148284e+00, 1.04254392e-03],
       [8.24929407e-02, 2.95541135e-04]])

In [9]:
all_features_metric = np.load('feature_selection/all_features_metrics.npz', allow_pickle=True)['a'][0][0]
eps = np.load('feature_selection/all_features_metrics.npz', allow_pickle=True)['a'][0][1]

opt_features = ['AccumCharge', 'R_cht', 'jacob_cc', 'pe_std', 'nPMTs']#[]
current_metrics = [3.8076375114186476, 1.6799966127524213, 1.381430894229591, 1.2331565442044647, 1.2229199895455005]#[]
current_metric_stds = [0.004450264146659087, 0.0022321956920953352, 0.0025158649260755114, 0.0019016478876169257, 0.0022581303993379166]#[]
current_metric = 100

features = data_real.iloc[:, :-5].columns
features = features.drop(opt_features)
while abs(all_features_metric - current_metric) > eps:
    metrics = []
    metric_stds = []
    for feature in tqdm(features, "Features loop"):
        
        X = data_real.iloc[:, :-5][opt_features+[feature]]
        y = data_real.iloc[:, -5]
        
        xgbreg = XGBRegressor(
            max_depth=9,
            learning_rate=0.08,
            n_estimators=3000,
            random_state=22,
            nthreads=4
        )
        
        scores = cross_val_score(
            xgbreg,
            X,
            y,
            cv=5,
            n_jobs=5,
            verbose=False,
            fit_params={
                'eval_set': [(X_val[opt_features+[feature]], y_val)],
                'early_stopping_rounds':5
            },
            scoring='neg_mean_absolute_percentage_error'
        )
        
        metric = -100*scores.mean()
        metric_std = (100*scores).std()
        metrics.append(metric)
        metric_stds.append(metric_std)
        
        print(metrics)
        print(metric_stds)
        print(feature)
    
    best_metric_ind = np.argmin(metrics)
    current_metric = metrics[best_metric_ind]
    current_metrics.append(current_metric)

    current_metric_std = metric_stds[best_metric_ind]
    current_metric_stds.append(current_metric_std)
    
    opt_features.append(features[best_metric_ind])
    features = features.drop(features[best_metric_ind])

    print(current_metrics)
    print(current_metric_stds)
    print(opt_features)
    
    np.savez_compressed('feature_selection/opt_features.npz', a=np.array(opt_features))
    np.savez_compressed('feature_selection/current_metrics.npz', a=np.array(current_metrics))
    np.savez_compressed('feature_selection/current_metric_stds.npz', a=np.array(current_metric_stds))

Features loop:   1%|          | 1/101 [11:18<18:50:52, 678.53s/it]

[1.2102288938739276]
[0.002170871162578412]
R_cc


Features loop:   2%|▏         | 2/101 [21:03<17:08:45, 623.49s/it]

[1.2102288938739276, 1.2170602315636012]
[0.002170871162578412, 0.0022425007421278498]
pho_cc


Features loop:   3%|▎         | 3/101 [30:45<16:27:12, 604.42s/it]

[1.2102288938739276, 1.2170602315636012, 1.2225441410893763]
[0.002170871162578412, 0.0022425007421278498, 0.0020146989987892133]
x_cc


Features loop:   4%|▍         | 4/101 [39:55<15:42:52, 583.22s/it]

[1.2102288938739276, 1.2170602315636012, 1.2225441410893763, 1.2228646739414382]
[0.002170871162578412, 0.0022425007421278498, 0.0020146989987892133, 0.0013890068110080445]
y_cc


Features loop:   5%|▍         | 5/101 [53:34<17:49:05, 668.18s/it]

[1.2102288938739276, 1.2170602315636012, 1.2225441410893763, 1.2228646739414382, 1.2134191433575428]
[0.002170871162578412, 0.0022425007421278498, 0.0020146989987892133, 0.0013890068110080445, 0.0018577130238127313]
z_cc


Features loop:   6%|▌         | 6/101 [1:05:55<18:16:49, 692.73s/it]

[1.2102288938739276, 1.2170602315636012, 1.2225441410893763, 1.2228646739414382, 1.2134191433575428, 1.2148163895748734]
[0.002170871162578412, 0.0022425007421278498, 0.0020146989987892133, 0.0013890068110080445, 0.0018577130238127313, 0.0021692727736462108]
gamma_z_cc


Features loop:   7%|▋         | 7/101 [1:14:52<16:45:53, 642.06s/it]

[1.2102288938739276, 1.2170602315636012, 1.2225441410893763, 1.2228646739414382, 1.2134191433575428, 1.2148163895748734, 1.222648471941091]
[0.002170871162578412, 0.0022425007421278498, 0.0020146989987892133, 0.0013890068110080445, 0.0018577130238127313, 0.0021692727736462108, 0.001687082588018973]
gamma_y_cc


Features loop:   8%|▊         | 8/101 [1:24:35<16:05:59, 623.22s/it]

[1.2102288938739276, 1.2170602315636012, 1.2225441410893763, 1.2228646739414382, 1.2134191433575428, 1.2148163895748734, 1.222648471941091, 1.2225433405238049]
[0.002170871162578412, 0.0022425007421278498, 0.0020146989987892133, 0.0013890068110080445, 0.0018577130238127313, 0.0021692727736462108, 0.001687082588018973, 0.0021552076941865565]
gamma_x_cc


Features loop:   9%|▉         | 9/101 [1:35:57<16:23:42, 641.55s/it]

[1.2102288938739276, 1.2170602315636012, 1.2225441410893763, 1.2228646739414382, 1.2134191433575428, 1.2148163895748734, 1.222648471941091, 1.2225433405238049, 1.2149775915709684]
[0.002170871162578412, 0.0022425007421278498, 0.0020146989987892133, 0.0013890068110080445, 0.0018577130238127313, 0.0021692727736462108, 0.001687082588018973, 0.0021552076941865565, 0.002266620683119449]
theta_cc


Features loop:  10%|▉         | 10/101 [1:44:29<15:12:23, 601.58s/it]

[1.2102288938739276, 1.2170602315636012, 1.2225441410893763, 1.2228646739414382, 1.2134191433575428, 1.2148163895748734, 1.222648471941091, 1.2225433405238049, 1.2149775915709684, 1.2228589857961687]
[0.002170871162578412, 0.0022425007421278498, 0.0020146989987892133, 0.0013890068110080445, 0.0018577130238127313, 0.0021692727736462108, 0.001687082588018973, 0.0021552076941865565, 0.002266620683119449, 0.002003712715634164]
phi_cc


Features loop:  10%|▉         | 10/101 [1:45:09<15:56:52, 630.90s/it]


KeyboardInterrupt: 

In [None]:
np.savez_compressed('feature_selection/opt_features.npz', a=np.array(opt_features))
np.savez_compressed('feature_selection/current_metrics.npz', a=np.array(current_metrics))
np.savez_compressed('feature_selection/current_metric_stds.npz', a=np.array(current_metric_stds))

[3.8076375114186476, 1.6799966127524213, 1.381430894229591, 1.2331565442044647, 1.2229199895455005]

[0.004450264146659087, 0.0022321956920953352, 0.0025158649260755114, 0.0019016478876169257, 0.0022581303993379166]

['AccumCharge', 'R_cht', 'jacob_cc', 'pe_std', 'nPMTs']

[1.2102288938739276, 1.2170602315636012, 1.2225441410893763, 1.2228646739414382, 1.2134191433575428, 1.2148163895748734, 1.222648471941091, 1.2225433405238049, 1.2149775915709684, 1.2228589857961687]

[0.002170871162578412, 0.0022425007421278498, 0.0020146989987892133, 0.0013890068110080445, 0.0018577130238127313, 0.0021692727736462108, 0.001687082588018973, 0.0021552076941865565, 0.002266620683119449, 0.002003712715634164]

phi_cc
