# Data Features Selection
This notebook is the baseline for feature selection.
## Used libraries

In [295]:
import seaborn as sns
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE, RFECV, SelectKBest, SelectFromModel, SelectPercentile, f_classif, chi2, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## Loading data

The train and test inputs are composed of 46 features.

The target of this challenge is `RET` and corresponds to the fact that the **return is in the top 50% of highest stock returns**.

Since the median return is very close to 0, this target is almost equivalent to predicting the sign of the return.

In [None]:
# Read the extended train and test tables, using the `ID` column as the row index.
train, test = (
    pd.read_csv('../train_extended.csv', index_col='ID'),
    pd.read_csv('../test_extended.csv', index_col='ID'),
)

# Preview the first rows of the training table.
train.head()

## Feature selection

To reduce the number of features (and the noise), we only consider the last 5 days of `RET` and `VOLUME` in addition to the newly created features.

In [None]:
# Label column to predict.
target = 'RET'

# Number of lagged days kept for each series; lower it to drop older (noisier) lags.
n_shifts = 5

# Day offsets 1..n_shifts, shared by the return and volume columns.
days = range(1, n_shifts + 1)
features = [
    *('RET_%d' % day for day in days),
    *('VOLUME_%d' % day for day in days),
]
# Categorical features are deliberately not added to this list.

# Preview the selected columns.
train[features].head()

In [None]:
# Fixed seed for the forest: without it the importances (and therefore the
# features selected from them) change on every Restart & Run All.
RANDOM_STATE = 42

X, y = train[features], train[target]

# Fit the forest once up front; the fitted model is later handed to
# SelectFromModel as a prefit estimator.
# NOTE(review): assumes X contains no NaN (or a scikit-learn version whose
# random forest accepts missing values) — confirm against the extended CSV.
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=1,
)
model.fit(X, y)

# Per-column importance scores exposed by the fitted forest.
importances = model.feature_importances_

# Rank the features from most to least important and draw them as horizontal bars.
indices = np.argsort(importances)[::-1]
ax = sns.barplot(x=importances[indices], y=X.columns[indices], orient='h')
ax.set(
    title='Random forest feature importances',
    xlabel='Importance',
    ylabel='Feature',
);

In [None]:
# Upper bound on how many features SelectFromModel may keep.
MAX_SELECTED_FEATURES = 40

# Select features from the already-fitted forest (prefit=True, so no refit here).
# Two filters apply together:
#   * threshold='mean' keeps only features whose importance is at least the
#     mean importance (this is what SelectFromModel uses by default for a
#     random forest; it is spelled out here so the rule is visible);
#   * max_features keeps at most the top-N of those. It is capped at the number
#     of columns in X because X has fewer than 40 columns in this notebook and
#     SelectFromModel rejects a larger value if the selector is ever fitted.
# To select purely by count, set threshold=-np.inf instead.
selector = SelectFromModel(
    model,
    threshold='mean',
    max_features=min(MAX_SELECTED_FEATURES, X.shape[1]),
    prefit=True,
)

# Names of the columns that pass both filters.
selectedFeatures = X.columns[selector.get_support()]
selectedFeatures