
## Feature Selection

In this workshop, we have two datasets: Students2.csv and bank.csv. 

We will implement the following feature selection approaches.
1. F-test
2. SelectTransform
3. RFE




Apply Feature Selection with F-Test on Linear Regression
Compare the result with selected features

Data set: Students2.csv

## Import data

In [1]:
# Import libraries
import pandas as pd

In [2]:
# Read the file
f = pd.read_csv('Students2.csv')

In [3]:
# Split the columns into Dependent (Y) and independent (X) features
x = f.iloc[:,:-1]
y = f.iloc[:, -1]

In [4]:
# Perform Linear Regression using original dataset
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

In [5]:
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.4, random_state = 1234)

lr.fit(X_train, Y_train)

y_predict = lr.predict(X_test)

In [6]:
# Calculate the RMSE error for the regression
from sklearn.metrics import mean_squared_error
import math

rmse = math.sqrt(mean_squared_error(Y_test, y_predict))
rmse

6.982206715357434

### F-test

In [7]:
# import and perform the f_regression to get the F-Score and P-Values
from sklearn.feature_selection import f_regression as fr
result = fr(x,y)

In [8]:
# Split the result tuple into F_Score and P_Values
f_score = result[0]
p_values = result[1]

In [9]:
# Print the table of Features, F-Score and P-values
columns = list(x.columns)

print ("    Features     ", "F-Score    ", "P-Values")
print ("    -----------  ---------    ---------")

for i in range(0, len(columns)):
    f1 = "%4.2f" % f_score[i]
    p1 = "%2.6f" % p_values[i]
    print("    ", columns[i].ljust(12), f1.rjust(8),"    ", p1.rjust(8))

    Features      F-Score     P-Values
    -----------  ---------    ---------
     Hours          141.91      0.000000
     sHours           4.57      0.041403
     hoursplayed      0.00      0.969907
     income           0.16      0.692200
     distance         0.00      0.955528
     calories         0.40      0.530086


In [10]:
# Perform the Linear Regression with reduced features
X_train_n = X_train[['Hours', 'sHours']]
X_test_n = X_test[['Hours', 'sHours']]

lr1 = LinearRegression()
lr1.fit(X_train_n, Y_train)

y_predict_n = lr1.predict(X_test_n)

In [11]:
# Calculate the RMSE with reduced features
rmse_n = math.sqrt(mean_squared_error(Y_test, y_predict_n))

In [12]:
rmse_n

5.09721728108113

### Select Transforms

In [13]:
# Import pandas, read the file and split into X and Y
import pandas as pd
f = pd.read_csv('Students2.csv')
X = f.iloc[:, :-1]
Y = f.iloc[:,  -1]

In [14]:
# Import various select transforms along with the f_regression mode
from sklearn.feature_selection import SelectKBest,             \
                                      SelectPercentile,        \
                                      GenericUnivariateSelect, \
                                      f_regression

In [15]:
# Implement and print SelectKBest
selectorK = SelectKBest(score_func=f_regression, k=3)
x_k = selectorK.fit_transform(X, Y)

In [16]:
# Get f_score and p_values for the selected features
f_score = selectorK.scores_
p_values = selectorK.pvalues_

In [17]:
# Print the f_score and p_values
# Print the table of Features, F-Score and P-values
columns = list(X.columns)

print ("    Features     ", "F-Score    ", "P-Values")
print ("    -----------  ---------    ---------")

for i in range(0, len(columns)):
    f1 = "%4.2f" % f_score[i]
    p1 = "%2.6f" % p_values[i]
    print("    ", columns[i].ljust(12), f1.rjust(8),"    ", p1.rjust(8))

cols = selectorK.get_support(indices=True)
selectedCols = X.columns[cols].to_list()

print(selectedCols)

    Features      F-Score     P-Values
    -----------  ---------    ---------
     Hours          141.91      0.000000
     sHours           4.57      0.041403
     hoursplayed      0.00      0.969907
     income           0.16      0.692200
     distance         0.00      0.955528
     calories         0.40      0.530086
['Hours', 'sHours', 'calories']


In [18]:
# Implement SelectPercentile
selectorP = SelectPercentile(score_func=f_regression, percentile=50)
x_p = selectorP.fit_transform(X, Y)

In [19]:
# Implement GenericUnivariateSelect with k_best
selectorG1 = GenericUnivariateSelect(score_func=f_regression,
                                     mode='k_best',
                                     param=3)
x_g1 = selectorG1.fit_transform(X,Y)

In [20]:
# Implement GenericUnivariateSelect with percentile
selectorG2 = GenericUnivariateSelect(score_func=f_regression,
                                     mode='percentile',
                                     param=50)
x_g2 = selectorG2.fit_transform(X,Y)

## Recursive Feature Elimination

In [21]:
# -----------------------------------------------------------------
# Implement Recursive Feature Elimination.
# Predict product purchase for the Bank Telemarketing dataset
# -----------------------------------------------------------------

# Import libraries
import pandas as pd

In [22]:
# Read the file
f = pd.read_csv('bank.csv')
f = f.drop("duration", axis = 1) # As we create model for market team, this feature is not known
f.shape

(41188, 20)

In [26]:
# Split the columns into Dependent (Y) and independent (X) features
x = f.iloc[:,:-1]
y = f.iloc[:, -1]

In [27]:
# Create dummy variables
x = pd.get_dummies(x, drop_first=True)
y = pd.get_dummies(y, drop_first=True)from sklearn.feature_selection import RFE

In [28]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.3, random_state = 1234, stratify=y)

In [29]:
# Import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Default Random Forest Object
rfc1 = RandomForestClassifier(random_state=1234)
rfc1.fit(X_train, Y_train)
Y_predict1 = rfc1.predict(X_test)

  


In [30]:
# Score and Evaluate the model 
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(Y_test, Y_predict1)
score1 = rfc1.score(X_test, Y_test)

In [35]:
# Apply Recursive Feature Elimination
from sklearn.feature_selection import RFE
# Get an estimator object
rfc2 = RandomForestClassifier(random_state=1234)
rfe = RFE(estimator = rfc2, n_features_to_select = 30, step = 1)

rfe.fit(x,y)

  y = column_or_1d(y, warn=True)


RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=None,
                                     oob_score=False, random_state=1234,
                                     verbose=0, warm_start=False),
    n_features_to_select=30, step=1, verbose=0)

In [32]:
# Create new Train and Test datasets
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

In [33]:
# Fit the Random Forest classifier to the new train and test with 80 features
rfc2.fit(X_train_rfe, Y_train)

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1234,
                       verbose=0, warm_start=False)

In [34]:
# Test the model with new Test dataset
Y_predict = rfc2.predict(X_test_rfe)

In [32]:
# Score and Evaluate the new model 
from sklearn.metrics import confusion_matrix
cm_rfe = confusion_matrix(Y_test, Y_predict)
score_rfe = rfc2.score(X_test_rfe, Y_test)
score_rfe

0.8943109168892126

In [33]:
# Get column names
columns = list(x.columns)

In [34]:
# Get the ranking of the features. Ranking 1 for selected features
ranking = rfe.ranking_
ranking

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  5, 11,  1,  1,  3,  1,  6,
        1,  8, 16,  1,  1, 20,  4,  1,  1, 22,  1,  1,  2,  1, 23,  7,  1,
       13,  1,  1, 17, 21, 15, 14, 10, 12, 19,  9, 18,  1,  1,  1,  1,  1,
        1])

In [35]:
# Get the feature importance scores
feature_importance = rfc1.feature_importances_

In [36]:
# Create the dataframe of the Features selected, Ranking and their importance
rfe_selected = pd.DataFrame()



In [37]:
rfe_selected = pd.concat([pd.DataFrame(columns), 
                          pd.DataFrame(ranking),
                          pd.DataFrame(feature_importance)], axis=1)


In [38]:
rfe_selected.head()

Unnamed: 0,0,0.1,0.2
0,age,1,0.17081
1,campaign,1,0.083967
2,pdays,1,0.030959
3,previous,1,0.019109
4,emp.var.rate,1,0.022065


In [39]:
rfe_selected.columns = ["Feature Name", "Ranking", "Feature Importance"]


In [40]:
rfe_selected.head()

Unnamed: 0,Feature Name,Ranking,Feature Importance
0,age,1,0.17081
1,campaign,1,0.083967
2,pdays,1,0.030959
3,previous,1,0.019109
4,emp.var.rate,1,0.022065


In [41]:
# Display selected features with Ranking = 1    
rfe_selected[(rfe_selected.Ranking == 1)]['Feature Name']

0                               age
1                          campaign
2                             pdays
3                          previous
4                      emp.var.rate
5                    cons.price.idx
6                     cons.conf.idx
7                         euribor3m
8                       nr.employed
9                   job_blue-collar
12                   job_management
13                      job_retired
15                     job_services
17                   job_technician
20                  marital_married
21                   marital_single
24               education_basic.9y
25            education_high.school
27    education_professional.course
28      education_university.degree
30                  default_unknown
33                      housing_yes
35                         loan_yes
36                contact_telephone
46                  day_of_week_mon
47                  day_of_week_thu
48                  day_of_week_tue
49                  day_of_w