# FEATURES SELECTION

This notebook provides functions to apply various methods of feature selection, such as:
- Pearson correlation
- Chi-squared
- Recursive Feature Elimination
- Lasso
- Random Forest

## Filter Based

<div class="alert alert-block alert-info">
    
The "correlation" measure used should depend on the type of variables being investigated:

- continuous variable v continuous variable: use "traditional" correlation - e.g. Spearman's rank correlation or Pearson's linear correlation.
- continuous variable v categorical variable: use an ANOVA F-test / difference of means
- categorical variable v categorical variable: use Chi-square / Cramér's V
    
</div>

### Pearson correlation

In [None]:
# ONLY FOR CONTINUOUS-CONTINUOUS VARIABLES
# We check the correlation between the target column and each feature column to decide which features to keep

def correlation_selector(X, y, threshold=0.75, alpha=0.05):
    """Select the features of ``X`` that are strongly correlated with column ``y``.

    A feature is kept when its Pearson correlation with the target column is
    statistically significant (p-value <= ``alpha``) and strong
    (|r| >= ``threshold``).

    Parameters
    ----------
    X : pandas.DataFrame
        Continuous features, including the target column.
    y : column label
        Label of the target column inside ``X``.
    threshold : float, default 0.75
        Minimum absolute Pearson correlation required to keep a feature.
    alpha : float, default 0.05
        Maximum p-value required to keep a feature.

    Returns
    -------
    cor_support : list of bool
        One flag per column of ``X`` (in column order); the target column
        itself is always ``False``.
    cor_feature : list
        Names of the selected features, sorted by increasing |r|.
    """
    target = X.loc[:, y]
    feature_name = X.columns.tolist()

    # Correlation of every selected feature, keyed by column name so the
    # result can never get misaligned with the column positions of X.
    selected = {}
    for name in feature_name:
        if name == y:
            continue
        # pearsonr returns (r, p-value); compute it once per feature.
        cor, pvalue = st.pearsonr(X[name], target)
        # NaN (e.g. constant column) fails both comparisons and is dropped.
        if pvalue <= alpha and abs(cor) >= threshold:
            selected[name] = cor

    cor_feature = sorted(selected, key=lambda name: abs(selected[name]))
    cor_support = [name in selected for name in feature_name]

    return cor_support, cor_feature

In [None]:
# We check the absolute value of the Pearson’s correlation between the target and numerical features in our dataset. 
# We keep the top n features based on this criterion.

def cor_selector(X, y, num_feats):
    """Keep the ``num_feats`` features most correlated (in absolute value) with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numerical feature columns.
    y : array-like
        Target values, one per row of ``X``.
    num_feats : int
        Number of features to keep. ``0`` keeps nothing; a value larger than
        the number of columns keeps every column.

    Returns
    -------
    cor_support : list of bool
        One flag per column of ``X`` telling whether it was selected.
    cor_feature : list
        Names of the selected features, sorted by increasing |correlation|.

    Raises
    ------
    ValueError
        If ``num_feats`` is negative.
    """
    if num_feats < 0:
        raise ValueError("num_feats must be non-negative, got %r" % (num_feats,))

    feature_name = X.columns.tolist()
    # Pearson correlation of each feature with the target.
    cor = np.asarray(
        [np.corrcoef(X[name], y)[0, 1] for name in feature_name], dtype=float
    )
    # A NaN correlation (e.g. constant column) counts as "no correlation".
    cor = np.where(np.isnan(cor), 0, cor)

    order = np.argsort(np.abs(cor))
    # Explicit start index: a plain ``[-num_feats:]`` would return *every*
    # feature when num_feats == 0.
    top = order[len(order) - min(num_feats, len(order)):]

    # Read the names from the column index; no need to copy the data.
    cor_feature = X.columns[top].tolist()
    chosen = set(cor_feature)
    cor_support = [name in chosen for name in feature_name]
    return cor_support, cor_feature
# Run the Pearson selector on the notebook-level X, y and num_feats,
# then report how many features it kept.
(cor_support, cor_feature) = cor_selector(X, y, num_feats)
print(len(cor_feature), 'selected features')

### Chi-Squared

In [None]:
# We calculate the chi-squared statistic between the target and each numerical feature.
# We only keep the num_feats features with the highest chi-squared values.

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with MinMaxScaler before scoring with chi2.
# X_norm is reused by later cells.
scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X)

# Keep the num_feats best-scoring features; fit() returns the selector itself.
chi_selector = SelectKBest(score_func=chi2, k=num_feats).fit(X_norm, y)

# Boolean mask over the columns of X, and the matching column names.
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print(len(chi_feature), 'selected features')

## Wrapper based

### Recursive Feature Elimination

<div class="alert alert-block alert-info">

- The goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. 
- First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. 
- Then, the least important features are pruned from current set of features. 
- That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.
    
</div>

In [None]:
# we could use any estimator with the method. In this case, we use LogisticRegression

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination driven by a logistic regression:
# 10 features are dropped per iteration until num_feats remain.
rfe_selector = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=num_feats,
    step=10,
    verbose=5,
).fit(X_norm, y)

# Boolean mask over the columns of X, and the matching column names.
rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')

## Embedded based

### Lasso

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# L1-penalised (lasso-style) logistic regression used as an embedded selector.
# The solver must be given explicitly: the default solver ("lbfgs" since
# scikit-learn 0.22) does not support the L1 penalty and raises a ValueError,
# whereas "liblinear" does support it.
embeded_lr_selector = SelectFromModel(
    LogisticRegression(penalty="l1", solver="liblinear"),
    max_features=num_feats,
)
embeded_lr_selector.fit(X_norm, y)

# Boolean mask over the columns of X, and the matching column names.
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:, embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

### Random Forest

In [None]:
# We calculate feature importance using node impurities in each decision tree. 
# The final feature importance is the average of all decision tree feature importance.

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Embedded selection from a 100-tree random forest fitted on the raw
# (unscaled) features; at most num_feats features are kept.
embeded_rf_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100),
    max_features=num_feats,
).fit(X, y)

# Boolean mask over the columns of X, and the matching column names.
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.columns[embeded_rf_support].tolist()
print(len(embeded_rf_feature), 'selected features')

### All

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler


# put all selection together
# One boolean mask per selection method, each aligned with the columns of X.
selector_votes = {
    'Pearson': cor_support,
    'Chi-2': chi_support,
    'RFE': rfe_support,
    'Logistics': embeded_lr_support,
    'Random Forest': embeded_rf_support,
}
# No cell of this notebook computes a LightGBM mask, so the column is only
# added when `embeded_lgb_support` has been defined elsewhere in the session.
if 'embeded_lgb_support' in globals():
    selector_votes['LightGBM'] = embeded_lgb_support

# Feature names are read from X directly: `feature_name` only exists as a
# local variable inside the selector functions.
feature_selection_df = pd.DataFrame({'Feature': X.columns.tolist(), **selector_votes})
# count the selected times for each feature; only the boolean vote columns
# are summed so the string 'Feature' column never enters the sum
feature_selection_df['Total'] = feature_selection_df[list(selector_votes)].sum(axis=1)
# display the top num_feats features, most-selected first
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df) + 1)
feature_selection_df.head(num_feats)