In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as st
import feature_engine.selection as fes
from sklearn.feature_selection import chi2, RFE, mutual_info_classif, SelectKBest, SelectPercentile
from sklearn.feature_selection import SelectFromModel, mutual_info_regression, f_regression, f_classif, SequentialFeatureSelector
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import Lasso, LogisticRegression
pd.set_option('display.max_rows',400)

In [2]:
data = pd.read_csv('santander_train.csv')
data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [3]:
data.isnull().sum()/len(data)

ID                               0.0
var3                             0.0
var15                            0.0
imp_ent_var16_ult1               0.0
imp_op_var39_comer_ult1          0.0
imp_op_var39_comer_ult3          0.0
imp_op_var40_comer_ult1          0.0
imp_op_var40_comer_ult3          0.0
imp_op_var40_efect_ult1          0.0
imp_op_var40_efect_ult3          0.0
imp_op_var40_ult1                0.0
imp_op_var41_comer_ult1          0.0
imp_op_var41_comer_ult3          0.0
imp_op_var41_efect_ult1          0.0
imp_op_var41_efect_ult3          0.0
imp_op_var41_ult1                0.0
imp_op_var39_efect_ult1          0.0
imp_op_var39_efect_ult3          0.0
imp_op_var39_ult1                0.0
imp_sal_var16_ult1               0.0
ind_var1_0                       0.0
ind_var1                         0.0
ind_var2_0                       0.0
ind_var2                         0.0
ind_var5_0                       0.0
ind_var5                         0.0
ind_var6_0                       0.0
i

In [4]:
data.nunique()

ID                               76020
var3                               208
var15                              100
imp_ent_var16_ult1                 596
imp_op_var39_comer_ult1           7551
imp_op_var39_comer_ult3           9099
imp_op_var40_comer_ult1            293
imp_op_var40_comer_ult3            346
imp_op_var40_efect_ult1             23
imp_op_var40_efect_ult3             29
imp_op_var40_ult1                  224
imp_op_var41_comer_ult1           7421
imp_op_var41_comer_ult3           8961
imp_op_var41_efect_ult1            331
imp_op_var41_efect_ult3            454
imp_op_var41_ult1                 8032
imp_op_var39_efect_ult1            336
imp_op_var39_efect_ult3            462
imp_op_var39_ult1                 8149
imp_sal_var16_ult1                  66
ind_var1_0                           2
ind_var1                             2
ind_var2_0                           1
ind_var2                             1
ind_var5_0                           2
ind_var5                 

In [5]:
numerical_columns = []
categorical_columns = []
for column in data.columns:
    if data[column].dtype == float:
        numerical_columns.append(column)
    else:
        categorical_columns.append(column)

In [6]:
X = data.drop('TARGET',axis=1)
y = data['TARGET']

In [7]:
from sklearn.datasets import fetch_california_housing
reg_data = fetch_california_housing()
reg_X = pd.DataFrame(reg_data['data'], columns=reg_data['feature_names'])
reg_y = pd.Series(reg_data['target'])

# Feature Selection

After feature engineering, feature selection involves selecting a set of features that are most significant for predicting target variable. Feature selection reduces the risk of curse of dimensionality, as having too many features may result in a worse performance on machine learning models.

These are the following most common feature selection techniques available:

1. <b>Filter methods</b> : Identify relationship between features and the target variable to compute the importance of features.
- Variance Threshold
- Pearson/Spearman Correlation
- Mutual Information
- Chi-Square (Classification only)
- F-test

2. <b>Wrapper methods</b> : Generate models with a subsets of feature and commpare their model performances.
- Recursive Feature Elimination (Backwards Selection)
- Forward Selection

3. <b>Embedded methods</b> : Using machine learning models to identify feature importance/coefficient importance
- Extra Trees
- Lasso Regression
- Logistic Regression with Lasso penalty (l1)

<i>Note that more details about machine learning algorithms for embedded methods will be covered in Machine Learning section.</i>

Mutual information, chi-square and F-test methods usually require the use of either <b>SelectKBest or SelectPercentile</b> methods during feature selection. Meanwhile, extra trees classifier usually require the use of <b>SelectFromModel</b> methods during feature selection.

Determining the most suitable number of features to select requires trial and error by comparing model performance for different number of features selected.



## Filter Methods

Filter methods involve performing various statistical test to determine most important features to use for model prediction.

Advantages: Fast to compute on larger datasets

Disadvantage: Less accurate compared to other types of methods.

### Variance Threshold

Variance threshold method by default removes features with zero variance.

Features with constant variance can most certainly be eliminated prior to training on machine learning algorithms.

In [8]:
selector = fes.DropConstantFeatures(tol=1)
data_selection = selector.fit_transform(X)
len(data_selection.columns)

336

### Pearson/Spearman Correlation

Independent features that have very strong positive correlation with each other can be removed (one of it) from the dataset to reduce the risk of multicollinearity.

By default, Pearson correlation method is used with a threshold of 0.8.

In [9]:
selector = fes.DropCorrelatedFeatures(method='pearson',threshold=0.8)
data_selection = selector.fit_transform(X)
len(data_selection.columns)

174

### Mutual Information (Classification)

Mutual information between two random variables is a non-negative value, which measures the dependency between the variables. 

It is equal to zero if and only if two random variables are independent, and higher values indicate higher dependency.

The function relies on nonparametric methods based on entropy estimation from k-nearest neighbors distances

Formula for mutual information:

<b>I(X;Y) = H(X) - H(X|Y)</b>

I(X;Y) : Mutual information for X and Y

H(X) : Entropy for X

H(X|Y) : Conditional entropy for X given Y

Note that mutual information captures all types of dependency (both linear and non-linear)

<i>More details about entropy will be covered in decision trees algorithm (Machine Learning) section.</i>

In [10]:
selector = SelectKBest(mutual_info_classif,k=10)
selector.fit(X,y)
columns_selected = X.columns[selector.get_support()]
columns_selected

Index(['ind_var5_0', 'ind_var5', 'ind_var30_0', 'ind_var30', 'num_var4',
       'num_var5', 'num_var30', 'num_var35', 'num_var42',
       'num_meses_var5_ult3'],
      dtype='object')

In [11]:
result = pd.Series(selector.scores_, index= X.columns)
result.sort_values(ascending=False).head(50)

num_var30                       0.018963
ind_var30                       0.018941
num_var5                        0.018711
num_meses_var5_ult3             0.018445
num_var42                       0.018352
ind_var5                        0.016198
ind_var5_0                      0.015918
num_var4                        0.015863
ind_var30_0                     0.015637
num_var35                       0.015484
num_var5_0                      0.014392
ind_var39_0                     0.014050
num_var42_0                     0.013941
saldo_var30                     0.013887
num_var39_0                     0.013855
ind_var41_0                     0.013580
num_var41_0                     0.012263
num_meses_var39_vig_ult3        0.012057
saldo_var42                     0.012000
saldo_medio_var5_hace2          0.011983
var15                           0.011975
num_var30_0                     0.010854
var36                           0.010603
saldo_medio_var5_ult3           0.010365
saldo_var5      

### Chi Square

Chi square method for feature selection is used to identify dependence of features with class target variable. The higher the chi-square test statistic, the greater the importance of the feature for predicting target variable.

In [12]:
X_abs = X.abs()
selector = SelectKBest(score_func=chi2,k=10)
selector.fit(X_abs,y)
columns_selected = X_abs.columns[selector.get_support()]
columns_selected

Index(['delta_imp_aport_var13_1y3', 'delta_imp_aport_var17_1y3',
       'delta_imp_compra_var44_1y3', 'delta_imp_trasp_var33_in_1y3',
       'delta_imp_venta_var44_1y3', 'delta_num_aport_var13_1y3',
       'delta_num_aport_var17_1y3', 'delta_num_compra_var44_1y3',
       'delta_num_trasp_var33_in_1y3', 'delta_num_venta_var44_1y3'],
      dtype='object')

In [13]:
result = pd.Series(selector.scores_, index=X_abs.columns)
result.sort_values(ascending=False).head(10)

delta_imp_aport_var13_1y3       5.309399e+10
delta_num_aport_var13_1y3       5.309399e+10
delta_num_venta_var44_1y3       1.730346e+10
delta_imp_venta_var44_1y3       1.730346e+10
delta_imp_compra_var44_1y3      2.227607e+09
delta_num_compra_var44_1y3      2.227607e+09
delta_imp_trasp_var33_in_1y3    2.059935e+09
delta_num_trasp_var33_in_1y3    2.059935e+09
delta_imp_aport_var17_1y3       1.990651e+09
delta_num_aport_var17_1y3       1.990651e+09
dtype: float64

### F-test (Classification)

F-test method for feature selection is used to compare between different models to check for significance in difference. The higher the F test statistic, the greater the importance of the feature for predicting target variable.

In [14]:
selector = SelectKBest(score_func=f_classif,k=10)
selector.fit(X,y)
columns_selected = X.columns[selector.get_support()]
columns_selected

 190 193 221 223 235 239 245 249 262 263 304 308 316 320 328 350] are constant.
  f = msb / msw


Index(['var15', 'ind_var5', 'ind_var30', 'num_var4', 'num_var5', 'num_var30',
       'num_var35', 'num_var42', 'var36', 'num_meses_var5_ult3'],
      dtype='object')

In [15]:
result = pd.Series(selector.scores_, index=X.columns)
result.sort_values(ascending=False).head(10)

ind_var30              1745.255659
num_meses_var5_ult3    1708.340128
num_var30              1482.100843
num_var42              1425.940751
ind_var5               1418.577470
num_var5               1391.949523
var36                   813.832521
var15                   788.508493
num_var4                492.037964
num_var35               451.888800
dtype: float64

### Mutual Information (Regression)

Mutual information between two random variables is a non-negative value, which measures the dependency between the variables. 

It is equal to zero if and only if two random variables are independent, and higher values indicate higher dependency.

The function relies on nonparametric methods based on entropy estimation from k-nearest neighbors distances

Formula for mutual information:

<b>I(X;Y) = H(X) - H(X|Y)</b>

I(X;Y) : Mutual information for X and Y

H(X) : Entropy for X

H(X|Y) : Conditional entropy for X given Y

Note that mutual information captures all types of dependency (both linear and non-linear)

In [16]:
selector = SelectKBest(mutual_info_regression,k=5)
selector.fit(reg_X,reg_y)
columns_selected = reg_X.columns[selector.get_support()]
columns_selected

Index(['MedInc', 'AveRooms', 'AveOccup', 'Latitude', 'Longitude'], dtype='object')

In [17]:
result = pd.Series(selector.scores_, index= reg_X.columns)
result.sort_values(ascending=False)

Longitude     0.399061
MedInc        0.387696
Latitude      0.369603
AveRooms      0.102759
AveOccup      0.072914
HouseAge      0.031750
AveBedrms     0.024225
Population    0.021267
dtype: float64

### F-test (Regression)

F-test method for feature selection is used to compare between different models to check for significance in difference. The higher the F test statistic, the greater the importance of the feature for predicting target variable.

In [18]:
selector = SelectKBest(score_func=f_regression,k=5)
selector.fit(reg_X,reg_y)
columns_selected = reg_X.columns[selector.get_support()]
columns_selected

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Latitude'], dtype='object')

In [19]:
result = pd.Series(selector.scores_, index=reg_X.columns)
result.sort_values(ascending=False)

MedInc        18556.571631
AveRooms        487.757462
Latitude        438.005453
HouseAge        232.841479
AveBedrms        45.108576
Longitude        43.698976
Population       12.547410
AveOccup         11.635342
dtype: float64

## Wrapper Methods

Wrapper methods evaluate all possible combinations of features and select the combination that produces the best result for specific machine learning algorithms:

Advantage:
- More accurate than filter methods, given all possible combinations of features are considered

Disadvantage: 
- Computationally expensive for very large feature sets
- Features selected may vary between different machine learning algorithms

### Recursive Feature Elimination

The purpose of recursive feature elimination is to select features by recursively considering smaller sets of features.

At first, the estimator is trained on the initial set of features and the importance of each feature is obtained through specific attributes. Then, the least important features are removed from the current set of features and the process is repeated until the desired number of features to select is eventually reached.

In [20]:
selector = ExtraTreesRegressor()
rfe = RFE(selector, n_features_to_select=5)
rfe.fit(reg_X, reg_y)

RFE(estimator=ExtraTreesRegressor(), n_features_to_select=5)

In [21]:
reg_X.columns[rfe.support_]

Index(['MedInc', 'HouseAge', 'AveOccup', 'Latitude', 'Longitude'], dtype='object')

### Sequential Feature Selection (Forwards Selection)

Sequential feature selector provides two option of either forward or backward selection to form a subset of features in a greedy fashion with cross validation.

For forwards selection, the estimator is trained on individual features and the importance of each feature is obtained. Then, the most important feature is selected from the current set of features and the process is repeated for remaining features until the desired number of features is eventually reached.

In [22]:
selector = ExtraTreesRegressor()
sfs = SequentialFeatureSelector(selector, n_features_to_select=5)
rfe.fit(reg_X, reg_y)

RFE(estimator=ExtraTreesRegressor(), n_features_to_select=5)

In [23]:
reg_X.columns[rfe.support_]

Index(['MedInc', 'HouseAge', 'AveOccup', 'Latitude', 'Longitude'], dtype='object')

## Embedded Methods

Embedded methods is a combination of both filter and wrapper methods, where some machine learning algorithms have their own feature selection methods (i.e. feature importance or coefficient importance)

Embedded methods allow machine learning algorithms to be trained and perform feature selection simultaneously.

Advantages:
- Considers interaction of features like wrapper methods
- Faster computation than wrapper methods
- Higher accuracy than filter methods
- Much less prone to overfitting

Disadvantages: Features selected may vary between different machine learning algorithms

### Extra Trees Classifier

For tree-based algorithms like random forest or extra trees, these machine learning algorithms have special attribute that computes the importance of features (feature_importances_).

Seperate models are available for regression and classification problems for tree-based algorithms.

In [24]:
selector = ExtraTreesClassifier().fit(X,y)
result = pd.Series(selector.feature_importances_,index=X.columns)
result.sort_values(ascending=False).head(10)

ID                        0.235080
var38                     0.201304
var15                     0.162464
saldo_medio_var5_ult3     0.023018
saldo_medio_var5_hace3    0.022517
num_var45_ult3            0.020466
num_var45_hace3           0.018390
num_var45_hace2           0.014737
num_var22_ult3            0.012986
num_var45_ult1            0.011662
dtype: float64

In [25]:
model = SelectFromModel(selector, prefit=True, max_features=10, threshold=None)
X_new = model.transform(X)
X_subset = pd.DataFrame(X_new,columns = X.columns[model.get_support()])
X_subset.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      76020 non-null  float64
 1   var15                   76020 non-null  float64
 2   num_var22_ult3          76020 non-null  float64
 3   num_var45_hace2         76020 non-null  float64
 4   num_var45_hace3         76020 non-null  float64
 5   num_var45_ult1          76020 non-null  float64
 6   num_var45_ult3          76020 non-null  float64
 7   saldo_medio_var5_hace3  76020 non-null  float64
 8   saldo_medio_var5_ult3   76020 non-null  float64
 9   var38                   76020 non-null  float64
dtypes: float64(10)
memory usage: 5.8 MB


### Logistic Regression (with l1 penalty)

Logistic regression is a classification algorithm that can be regularized using l1 penalty (Lasso), which shrinks coefficients to zero for less important features as a penalization of using features that contribute less to prediction of class target variables.

In [26]:
classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, n_jobs=-1, verbose=1)
classifier.fit(X,y)
coef_result = pd.Series(np.abs(classifier.coef_[0]),index=X.columns)
coef_result.sort_values(ascending=False)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 9856 epochs took 4964 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 82.7min finished


var38                            1.332948e-07
ID                               8.606053e-08
saldo_var30                      1.576185e-08
saldo_var42                      8.587551e-09
saldo_var12                      7.229116e-09
saldo_var13                      7.174295e-09
saldo_var24                      7.114169e-09
saldo_medio_var12_ult1           6.843875e-09
saldo_var13_corto                5.521354e-09
saldo_medio_var13_corto_ult1     5.519201e-09
saldo_medio_var12_ult3           5.278894e-09
saldo_medio_var12_hace2          4.792109e-09
saldo_medio_var13_corto_hace2    4.384659e-09
saldo_medio_var13_corto_ult3     4.316725e-09
imp_aport_var13_hace3            3.438249e-09
imp_trans_var37_ult1             1.867995e-09
var3                             1.825345e-09
saldo_medio_var5_hace2           1.732560e-09
saldo_var13_largo                1.652303e-09
saldo_var5                       1.202252e-09
saldo_medio_var5_ult1            1.151304e-09
saldo_medio_var5_ult3            1

### Extra Trees Regressor

For tree-based algorithms like random forest or extra trees, these machine learning algorithms have special attribute that computes the importance of features (feature_importances_).

Seperate models are available for regression and classification problems for tree-based algorithms.

In [27]:
selector = ExtraTreesRegressor().fit(reg_X,reg_y)
result = pd.Series(selector.feature_importances_,index=reg_X.columns)
result.sort_values(ascending=False).head(10)

MedInc        0.492047
Longitude     0.116366
Latitude      0.112899
AveOccup      0.106732
HouseAge      0.069487
AveRooms      0.042023
AveBedrms     0.033510
Population    0.026936
dtype: float64

In [28]:
model = SelectFromModel(selector, prefit=True, max_features=None, threshold=0.05)
X_new = model.transform(reg_X)
X_subset = pd.DataFrame(X_new,columns = reg_X.columns[model.get_support()])
X_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MedInc     20640 non-null  float64
 1   HouseAge   20640 non-null  float64
 2   AveOccup   20640 non-null  float64
 3   Latitude   20640 non-null  float64
 4   Longitude  20640 non-null  float64
dtypes: float64(5)
memory usage: 806.4 KB




### Lasso Regression

Lasso regression is a regularized version of linear regression models, which shrinks coefficients to zero for less important features as a penalization of using features that contribute less to prediction of target variables.

Note that for Lasso regression, coefficients of linear regression model needs to be scaled by standard deviation of individual features for consistent unit of measure when identifying feature importance.

More details can be referred to the following link for a complete explanation of coefficient interpretation for linear regression models:

https://scikit-learn.org/stable/auto_examples/feature_selection/plot_select_from_model_diabetes.html#feature-importance-from-coefficients

In [29]:
regressor = Lasso()
regressor.fit(reg_X,reg_y)
coef_result = pd.Series(np.abs(regressor.coef_)*reg_X.std(axis=0),index=reg_X.columns)
coef_result.sort_values(ascending=False)

MedInc        0.276366
HouseAge      0.073185
Population    0.007217
AveRooms      0.000000
AveBedrms     0.000000
AveOccup      0.000000
Latitude      0.000000
Longitude     0.000000
dtype: float64