# Feature Selection

 - There are several benefits of feature selection
 - Some are given below:
     - Reduces overfitting: Less redundant data means less opportunity to make decisions based on noise.
     - Improves Accuracy: Less misleading data means modeling accuracy improves.
     - Reduces Training Time: Less data means that algorithms train faster.
 - In this notebook, we discuss several feature selection algorithms
     - Algorithm 1: Dropping Constant Features using Variance Threshold Technique
     - Algorithm 2: Feature Selection using Pearson's correlation
     - Algorithm 3: Feature Selection using Information Gain
     - Algorithm 4: Feature Selection using RFECV (Recursive Feature Elimination with Cross Validation)

# Algorithm 1 - Dropping Constant Features using Variance Threshold Technique

In [2]:
import pandas as pd
# Make a dataframe for the following data

data = pd.DataFrame({"A":[1,2,4,1,2,4],
                     "B":[4,5,6,7,8,9],
                     "C":[0,0,0,0,0,0],
                     "D":[1,1,1,1,1,1]}
                    )
data

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1
5,4,9,0,1


# Variance Threshold:

 - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
 - class sklearn.feature_selection.VarianceThreshold(threshold=0.0)
 - Feature selector that removes all low-variance features.
 - This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.

In [3]:
from sklearn.feature_selection import VarianceThreshold
# With the default threshold of 0, only zero-variance (i.e. constant) features are removed
var_thresh = VarianceThreshold() # by default the threshold = 0
var_thresh.fit(data)

VarianceThreshold()

In [4]:
var_thresh.get_support()

array([ True,  True, False, False])

In [5]:
print("All features: ", data.columns)
print("Features Selected: ", data.columns[var_thresh.get_support()])

All features:  Index(['A', 'B', 'C', 'D'], dtype='object')
Features Selected:  Index(['A', 'B'], dtype='object')


In [6]:
# Drop the columns rejected by the variance threshold algorithm.
# get_support() returns a boolean mask aligned with data.columns (True = keep),
# so inverting it picks exactly the columns to remove. A single drop call
# replaces the column-by-column loop, which mutated `data` while iterating
# over its own columns. `data` is still modified in place.
support_mask = var_thresh.get_support()
selected_columns = data.columns[support_mask]
data.drop(columns = data.columns[~support_mask], inplace = True)

In [7]:
data

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


# Algorithm 2 - Feature Selection with Pearson's correlation

# Idea

 - Highly correlated features with the target variable are important features 
 - High correlation between features (say over 90% or over 80%) indicates the existence of duplicate features.
 - In case of duplicate features, we do not need to keep all of them; one of them would suffice.

In [None]:
## Let's load the Absenteeism dataset

df = pd.read_excel('absenteeism.xls')
print(df.shape)
df.head()

In [None]:
df.dropna(inplace = True)
print(df.shape)

In [None]:
df.columns

In [None]:
# Separate the independent (X) and dependent (y) features
y = df['Absenteeism time in hours']
X = df.drop(columns = 'Absenteeism time in hours')


In [None]:
X.head()

In [None]:
y.head()

In [None]:
# Let's first drop the feature id as it is a personal identifier
# Let's split the data into train and test set
# Note that correlation will be done only on the training dataset
from sklearn.model_selection import train_test_split
X.drop(columns = "ID", inplace = True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)


In [None]:
# let's import the required libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Let's find the feature correlations

corr = X_train.corr()
corr

# Note that the correlation values lie between -1 and +1
# A correlation value close to -1 indicates a strong negative linear correlation
# A correlation value close to +1 indicates a strong positive linear correlation
# A correlation value close to 0 indicates no linear correlation

In [None]:
# A good way to visualize correlation is using a heatmap
plt.figure(figsize = (12,10))
sns.heatmap(corr, annot = True, cmap = "seismic");
# A link to choose different cmaps: https://matplotlib.org/stable/tutorials/colors/colormaps.html

In [None]:
# Use a boolean mask to hide half of the correlation matrix:
# heatmap() leaves a cell blank wherever the mask is True
import numpy as np
mask1 = np.triu(np.ones_like(corr, dtype=bool))  # True on and above the diagonal (not used below)
mask2 = np.tril(np.ones_like(corr, dtype=bool))  # True on and below the diagonal
plt.figure(figsize = (12,10))
# With mask2 only the cells strictly above the diagonal are drawn
sns.heatmap(corr, annot = True, cmap = "seismic", mask = mask2);

In [None]:
# With the following function we can find highly correlated features.
# For every pair of columns whose absolute correlation exceeds the threshold, it flags
# the later column (in column order); nothing is dropped - the flagged names are returned.

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are computed with ``.corr()``.
    threshold : float
        A column is flagged when the absolute correlation with at least one
        column to its left is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged (redundant) columns.
    """
    strength = dataset.corr().abs()
    redundant = set()
    for position, name in enumerate(strength.columns):
        # Compare only against columns left of the diagonal so that every
        # pair is examined once and the earlier column of a pair is kept.
        if (strength.iloc[position, :position] > threshold).any():
            redundant.add(name)
    return redundant

In [None]:
corr_features = correlation(X_train, 0.9) # threshold used here is 0.9 (90%); 0.85 is also a good value
print(len(corr_features))
print(corr_features)

# Algorithm 3 - Feature Selection using Information Gain

In [None]:
# Let's read a dataset [PIMA Indians Diabetes Dataset]
df = pd.read_csv('diabetes.csv')
print(df.shape)
df.head()

In [None]:
df = df.dropna()
print("Shape: ", df.shape)
df["Outcome"].value_counts()

In [None]:
# Let's separate the independent (X) and dependent (y) variables
y = df['Outcome']
X = df.drop(columns = 'Outcome')
X.head()

In [None]:
# Split the dataset into train and test set
# random_state is fixed (same seed as the split in Algorithm 2) so that the split,
# and every score computed from it below, is reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)

In [None]:
# URL: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html
# Estimate mutual information for a discrete target variable.
# Mutual information (MI) between two random variables is a non-negative value, which measures the dependency between the variables. 
# It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.
from sklearn.feature_selection import mutual_info_classif

mutual_info = mutual_info_classif(X_train, y_train, random_state = 20)
mutual_info


In [None]:
print(len(mutual_info))
print(X_train.shape)

In [None]:
# Label every mutual-information score with its feature name, then rank the features
mutual_info = pd.Series(mutual_info, index = X_train.columns)
mutual_info.sort_values(ascending = False)

In [None]:
plt.figure(figsize = (20, 8))
mutual_info.sort_values(ascending = False).plot.bar();

# Algorithm 4 - Feature Selection using RFECV (Recursive Feature Elimination with Cross Validation)

 - URL: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
 - URL: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
 - Feature ranking with recursive feature elimination.
 

In [None]:
X_train

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import warnings
# NOTE(review): this silences every warning for the rest of the session, not only those raised by the fit below
warnings.filterwarnings('ignore')

# Base estimator used by the recursive elimination; seeded so results are repeatable
rfc = RandomForestClassifier(random_state=101)
# step=1 removes one feature per iteration; each candidate subset is scored by accuracy over 5 stratified folds
rfecv = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train, y_train)

In [None]:
print('Optimal number of features: {}'.format(rfecv.n_features_))

In [None]:
# Plot the mean cross-validated score against the number of features kept.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the mean
# score per feature count now lives in `cv_results_["mean_test_score"]`.
# The old attribute is kept as a fallback so the cell also runs on older versions.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_

plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
plt.ylabel('% Correct Classification', fontsize=14, labelpad=20)
# The x axis starts at 1 and advances by 1, which matches an RFECV fitted with
# step=1 and the default minimum of one selected feature.
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores, color='#303F9F', linewidth=3)
plt.show()

In [None]:
# Positions of the features that RFECV eliminated (support_ is True for the kept ones)
print(np.where(~rfecv.support_)[0])

In [None]:
# Keep only the columns RFECV retained by dropping those whose support flag is False
selected_features = X_train.drop(columns=X_train.columns[np.where(~rfecv.support_)[0]])

In [None]:
rfecv.estimator_.feature_importances_

In [None]:
# Pair every retained feature with the importance reported by the final fitted
# estimator, ordered from most to least important
dset = pd.DataFrame({
    'attr': selected_features.columns,
    'importance': rfecv.estimator_.feature_importances_,
}).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the importances
plt.figure(figsize=(16, 10))
plt.barh(dset['attr'], dset['importance'], color='#1976D2')
plt.title('RFECV - Feature importances', fontsize=20, fontweight='bold', pad=20)
plt.xlabel('Importance', fontsize=14, labelpad=20)
plt.show()


In [None]:
dset