In [1]:
import os
import warnings

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")

In [2]:
# Root directory of the dataset on the shared filesystem.
path = '/mnt/cephfs/ml_data/mc_2021/'

# Read the gzip-compressed processed training CSV located under `path`.
data_real = pd.read_csv(
    '{}processed_data/ProcessedTrainReal/ProcessedTrain_1M.csv.gz'.format(path)
)
# Keep only the rows whose 'edepR' value is strictly below 17.2.
data_real = data_real.loc[data_real['edepR'].lt(17.2)]

In [3]:
# Column layout used below: everything except the last five columns is a
# feature; the fifth column from the end is the regression target.
# (Optional row subsampling was left disabled: size = int(8e5), then [:size].)
n_feats = data_real.shape[1] - 5

X, y = data_real.iloc[:, :-5], data_real.iloc[:, -5]

In [None]:
# Base regressor that is re-fitted for every candidate feature subset.
model = XGBRegressor(
    n_estimators=300,
    max_depth=9,
    learning_rate=0.08,
    tree_method='gpu_hist',
    random_state=22,
)

# Forward (non-floating) sequential selection up to 15 features, scored with
# 5-fold cross-validated negative MAPE.
# NOTE(review): n_jobs=-1 spreads the work over parallel workers while the
# estimator itself trains with tree_method='gpu_hist' -- confirm the GPU can
# serve that many concurrent fits.
sfs = SFS(
    model,
    k_features=15,
    forward=True,
    floating=False,
    scoring='neg_mean_absolute_percentage_error',
    cv=5,
    n_jobs=-1,
    verbose=10,
).fit(X, y)

# Plot the per-step metrics with standard-error bands; `fig` is kept so the
# figure can be referenced after this cell.
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


In [None]:
# Save the SFS plot through its own figure handle. A bare plt.savefig() in this
# cell would act on pyplot's *current* figure, which is no longer the SFS plot
# once the previous cell has called plt.show(), so an empty image would be
# written instead.
os.makedirs('feature_selection', exist_ok=True)  # output folder may not exist yet
fig.savefig('feature_selection/SFS_XGB.png')

In [None]:
# Tabulate the selector's metric dict: after the transpose there is one row
# per entry of the dict and one column per recorded metric.
results = pd.DataFrame.from_dict(sfs.get_metric_dict()).T

# Create the output folder first so the (expensive) selection results are not
# lost to a missing-directory error at write time.
os.makedirs('feature_selection', exist_ok=True)
# The index is deliberately not written, matching the existing file layout.
results.to_csv('feature_selection/SFS_XGB_mlxtend_output.csv', index=False)

In [None]:
# Display the 'feature_names' entry of the last row of the metric table
# (presumably the final, largest selected subset -- verify against the run).
results.iloc[-1]['feature_names']