In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the processed training table and keep only the rows whose 'edepR'
# value is strictly below 17.2.
data_real = (
    pd.read_csv('processed_data/ProcessedTrainReal//ProcessedTrain.csv.gz')
    .loc[lambda frame: frame['edepR'] < 17.2]
)

In [3]:
# Number of leading rows that form the training split; every row after
# position N goes to the validation split.
N = 500_000

# Features are all columns except the last five; the target is the fifth
# column counted from the end.
X = data_real.iloc[:, :-5]
y = data_real.iloc[:, -5]

# Positional (not shuffled) split into train and validation parts.
X_train, y_train = X.iloc[:N], y.iloc[:N]
X_val, y_val = X.iloc[N:], y.iloc[N:]

In [5]:
import pickle

# Load a previously saved model object from disk (presumably the XGBoost
# regressor trained on all features, judging by the file name — confirm).
# The context manager closes the file handle as soon as unpickling is done
# instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
with open("models/xgb_energy_real.dat", "rb") as model_file:
    xgbreg = pickle.load(model_file)

# Predictions of the loaded model on the validation features.
y_predict = xgbreg.predict(X_val)

In [6]:
from sklearn.metrics import mean_squared_error

# Validation RMSE: square root of the mean squared error between y_val and
# the current y_predict.
all_features_rmse = pow(mean_squared_error(y_val, y_predict), 0.5)
all_features_rmse  # last expression, shown as the cell output

0.08370266641937904

In [7]:
from xgboost import XGBRegressor

# Greedy forward feature selection.
# Each pass of the outer loop trains one model per remaining candidate column
# (on the already selected columns plus that candidate), scores it by RMSE on
# the validation split, and permanently adds the best-scoring candidate.
# Passes repeat until the best RMSE is within 0.0002 of all_features_rmse.
# NOTE(review): the validation split is used for early stopping, for ranking
# candidates and for the stopping criterion, so the reported RMSEs may be
# optimistic — consider a separate hold-out set.
opt_features = []    # selected column names, in the order they were picked
current_rmses = []   # best validation RMSE after each pass
current_rmse = 100   # placeholder so the loop condition holds initially; overwritten by the first pass

features = X_train.columns
# The len() guard ends the search once every column has been selected.
# Without it, np.argmin would raise ValueError on an empty list whenever the
# tolerance is never reached.
while len(features) > 0 and abs(all_features_rmse - current_rmse) > 0.0002:
    rmses = []
    for feature in tqdm(features, "Features loop"):
        # Columns used for this trial: everything selected so far + candidate.
        trial_columns = opt_features + [feature]

        xgbreg = XGBRegressor(
            max_depth=9,
            learning_rate=0.08,
            n_estimators=3000,
            random_state=22,
        )

        # NOTE(review): newer XGBoost releases expect early_stopping_rounds
        # as a constructor argument rather than a fit() argument — confirm
        # against the installed version before upgrading.
        xgbreg.fit(X_train[trial_columns], y_train, verbose=False,
                   eval_set=[(X_val[trial_columns], y_val)],
                   early_stopping_rounds=7)

        y_predict = xgbreg.predict(X_val[trial_columns])
        rmse = mean_squared_error(y_val, y_predict)**0.5
        rmses.append(rmse)

    # Promote the candidate with the lowest validation RMSE and remove it
    # from the pool of remaining candidates.
    best_rmse_ind = np.argmin(rmses)
    current_rmse = rmses[best_rmse_ind]
    current_rmses.append(current_rmse)
    opt_features.append(features[best_rmse_ind])
    features = features.drop(features[best_rmse_ind])

    print(current_rmses)
    print(opt_features)

Features loop:   0%|          | 0/106 [00:00<?, ?it/s]

[0.29581919183216415]
['AccumCharge']


Features loop:   0%|          | 0/105 [00:00<?, ?it/s]

[0.29581919183216415, 0.12584598058842072]
['AccumCharge', 'R_cht']


Features loop:   0%|          | 0/104 [00:00<?, ?it/s]

[0.29581919183216415, 0.12584598058842072, 0.09795718313210483]
['AccumCharge', 'R_cht', 'z_cc']


Features loop:   0%|          | 0/103 [00:00<?, ?it/s]

[0.29581919183216415, 0.12584598058842072, 0.09795718313210483, 0.08594291184221113]
['AccumCharge', 'R_cht', 'z_cc', 'pe_std']


Features loop:   0%|          | 0/102 [00:00<?, ?it/s]

[0.29581919183216415, 0.12584598058842072, 0.09795718313210483, 0.08594291184221113, 0.08480974405042735]
['AccumCharge', 'R_cht', 'z_cc', 'pe_std', 'nPMTs']


Features loop:   0%|          | 0/101 [00:00<?, ?it/s]

[0.29581919183216415, 0.12584598058842072, 0.09795718313210483, 0.08594291184221113, 0.08480974405042735, 0.08452724565102354]
['AccumCharge', 'R_cht', 'z_cc', 'pe_std', 'nPMTs', 'pe_mean']


Features loop:   0%|          | 0/100 [00:00<?, ?it/s]

[0.29581919183216415, 0.12584598058842072, 0.09795718313210483, 0.08594291184221113, 0.08480974405042735, 0.08452724565102354, 0.08428262336588634]
['AccumCharge', 'R_cht', 'z_cc', 'pe_std', 'nPMTs', 'pe_mean', 'ht_std']


Features loop:   0%|          | 0/99 [00:00<?, ?it/s]

[0.29581919183216415, 0.12584598058842072, 0.09795718313210483, 0.08594291184221113, 0.08480974405042735, 0.08452724565102354, 0.08428262336588634, 0.08396141732193628]
['AccumCharge', 'R_cht', 'z_cc', 'pe_std', 'nPMTs', 'pe_mean', 'ht_std', 'sin_theta_cc']


Features loop:   0%|          | 0/98 [00:00<?, ?it/s]

[0.29581919183216415, 0.12584598058842072, 0.09795718313210483, 0.08594291184221113, 0.08480974405042735, 0.08452724565102354, 0.08428262336588634, 0.08396141732193628, 0.08375710047382079]
['AccumCharge', 'R_cht', 'z_cc', 'pe_std', 'nPMTs', 'pe_mean', 'ht_std', 'sin_theta_cc', 'phi_cc']


In [8]:
import os

# Persist the selected feature names and the per-pass RMSE history.
# The output directory is created first (no-op if it already exists) so that
# a missing 'feature_selection' folder cannot make the save fail.
os.makedirs('feature_selection', exist_ok=True)
np.savez_compressed('feature_selection/opt_features.npz', a=np.array(opt_features))
np.savez_compressed('feature_selection/current_rmses.npz', a=np.array(current_rmses))