
## Feature Selection

In this workshop, we have two datasets: Students2.csv and bank.csv. 

We will implement the following feature selection approaches.
1. F-test
2. SelectTransform
3. RFE




Apply Feature Selection with F-Test on Linear Regression
Compare the result with selected features

Data set: Students2.csv

## Import data

In [1]:
# Import libraries
import pandas as pd

In [2]:
# Read the file
f = pd.read_csv('Students2.csv')

In [3]:
# Split the columns into Dependent (Y) and independent (X) features
x = f.iloc[:,:-1]
y = f.iloc[:, -1]

In [4]:
# Perform Linear Regression using original dataset
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

In [5]:
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.4, random_state = 1234)

lr.fit(X_train, Y_train)

y_predict = lr.predict(X_test)

In [6]:
# Calculate the RMSE error for the regression
from sklearn.metrics import mean_squared_error
import math

rmse = math.sqrt(mean_squared_error(Y_test, y_predict))
rmse

6.982206715357434

### F-test

In [7]:
# import and perform the f_regression to get the F-Score and P-Values
from sklearn.feature_selection import f_regression as fr
result = fr(x,y)

In [8]:
# Split the result tuple into F_Score and P_Values
f_score = result[0]
p_values = result[1]

In [9]:
# Print the table of Features, F-Score and P-values
columns = list(x.columns)

print ("    Features     ", "F-Score    ", "P-Values")
print ("    -----------  ---------    ---------")

for i in range(0, len(columns)):
    f1 = "%4.2f" % f_score[i]
    p1 = "%2.6f" % p_values[i]
    print("    ", columns[i].ljust(12), f1.rjust(8),"    ", p1.rjust(8))

    Features      F-Score     P-Values
    -----------  ---------    ---------
     Hours          141.91      0.000000
     sHours           4.57      0.041403
     hoursplayed      0.00      0.969907
     income           0.16      0.692200
     distance         0.00      0.955528
     calories         0.40      0.530086


In [10]:
# Perform the Linear Regression with reduced features
X_train_n = X_train[['Hours', 'sHours']]
X_test_n = X_test[['Hours', 'sHours']]

lr1 = LinearRegression()
lr1.fit(X_train_n, Y_train)

y_predict_n = lr1.predict(X_test_n)

In [11]:
# Calculate the RMSE with reduced features
rmse_n = math.sqrt(mean_squared_error(Y_test, y_predict_n))

In [12]:
rmse_n

5.09721728108113

### Select Transforms

In [13]:
# Import pandas, read the file and split into X and Y
import pandas as pd
f = pd.read_csv('Students2.csv')
X = f.iloc[:, :-1]
Y = f.iloc[:,  -1]

In [14]:
# Import various select transforms along with the f_regression mode
from sklearn.feature_selection import SelectKBest,             \
                                      SelectPercentile,        \
                                      GenericUnivariateSelect, \
                                      f_regression

In [15]:
# Implement and print SelectKBest
selectorK = SelectKBest(score_func=f_regression, k=3)
x_k = selectorK.fit_transform(X, Y)

In [16]:
# Get f_score and p_values for the selected features
f_score = selectorK.scores_
p_values = selectorK.pvalues_

In [17]:
# Print the f_score and p_values
# Print the table of Features, F-Score and P-values
columns = list(X.columns)

print ("    Features     ", "F-Score    ", "P-Values")
print ("    -----------  ---------    ---------")

for i in range(0, len(columns)):
    f1 = "%4.2f" % f_score[i]
    p1 = "%2.6f" % p_values[i]
    print("    ", columns[i].ljust(12), f1.rjust(8),"    ", p1.rjust(8))

cols = selectorK.get_support(indices=True)
selectedCols = X.columns[cols].to_list()

print(selectedCols)

    Features      F-Score     P-Values
    -----------  ---------    ---------
     Hours          141.91      0.000000
     sHours           4.57      0.041403
     hoursplayed      0.00      0.969907
     income           0.16      0.692200
     distance         0.00      0.955528
     calories         0.40      0.530086
['Hours', 'sHours', 'calories']


In [18]:
# Implement SelectPercentile
selectorP = SelectPercentile(score_func=f_regression, percentile=50)
x_p = selectorP.fit_transform(X, Y)

In [19]:
# Implement GenericUnivariateSelect with k_best
selectorG1 = GenericUnivariateSelect(score_func=f_regression,
                                     mode='k_best',
                                     param=3)
x_g1 = selectorG1.fit_transform(X,Y)

In [20]:
# Implement GenericUnivariateSelect with percentile
selectorG2 = GenericUnivariateSelect(score_func=f_regression,
                                     mode='percentile',
                                     param=50)
x_g2 = selectorG2.fit_transform(X,Y)

## Recursive Feature Elimination

In [21]:
# -----------------------------------------------------------------
# Implement Recursive Feature Elimination.
# Predict product purchase for the Bank Telemarketing dataset
# -----------------------------------------------------------------

# Import libraries
import pandas as pd

In [22]:
# Read the file
f = pd.read_csv('bank.csv')
f = f.drop("duration", axis = 1) # As we create model for market team, this feature is not known
f.shape

(41188, 20)

In [23]:
f.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [24]:
# Split the columns into Dependent (Y) and independent (X) features
x = f.iloc[:,:-1]
y = f.iloc[:, -1]

In [25]:
f.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [26]:
# Create dummy variables
# get k-1 dummies by removing the first level
x = pd.get_dummies(x, drop_first=True)
y = pd.get_dummies(y, drop_first=True)

In [27]:
from sklearn.feature_selection import RFE
# Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.3, random_state = 1234, stratify=y)

In [28]:
# Import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Default Random Forest Object
rfc1 = RandomForestClassifier(random_state=1234)
rfc1.fit(X_train, Y_train)
Y_predict1 = rfc1.predict(X_test)

  


In [29]:
# Score and Evaluate the model 
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(Y_test, Y_predict1)
score1 = rfc1.score(X_test, Y_test)

In [30]:
X_test.shape

(12357, 52)

In [31]:
rfc1.feature_importances_

array([1.70809903e-01, 8.39670422e-02, 3.09585398e-02, 1.91089531e-02,
       2.20649131e-02, 2.31893481e-02, 2.78846783e-02, 1.21091042e-01,
       6.26615200e-02, 1.40153409e-02, 6.52409605e-03, 5.32901247e-03,
       1.14962225e-02, 7.62594112e-03, 7.65622686e-03, 1.14585079e-02,
       6.14242701e-03, 1.75824848e-02, 5.61703920e-03, 2.60052665e-03,
       2.11730222e-02, 1.76798976e-02, 9.09749915e-04, 7.64070771e-03,
       1.21102433e-02, 1.73136763e-02, 5.71709699e-04, 1.26410579e-02,
       1.78213433e-02, 7.69009624e-03, 1.66857423e-02, 1.39446286e-07,
       3.31006585e-03, 3.89403727e-02, 3.21863249e-03, 2.38703939e-02,
       1.64336689e-02, 2.42264097e-03, 9.09274796e-04, 2.25768711e-03,
       2.98676679e-03, 3.84770285e-03, 4.23807825e-03, 2.12292725e-03,
       5.45740279e-03, 1.87285593e-03, 1.62182135e-02, 1.61434986e-02,
       1.55667196e-02, 1.59937293e-02, 9.71855647e-03, 2.44496615e-02])

In [32]:
# Apply Recursive Feature Elimination
from sklearn.feature_selection import RFE
# Get an estimator object
rfc2 = RandomForestClassifier(n_estimators = 20)
rfe = RFE(estimator = rfc2, n_features_to_select = 30, step = 1)

rfe.fit(x,y)

  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:,

RFE(estimator=RandomForestClassifier(n_estimators=20), n_features_to_select=30)

In [33]:
# Create new Train and Test datasets
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

In [34]:
type(Y_train)

pandas.core.frame.DataFrame

In [35]:
# Fit the Random Forest classifier to the new train and test with 80 features
# .values will give the values in a numpy array (shape: (n,1))
#.ravel will convert that array shape to (n, ) (i.e. flatten it)

rfc2.fit(X_train_rfe, Y_train.values.ravel())

RandomForestClassifier(n_estimators=20)

In [36]:
# Test the model with new Test dataset
Y_predict = rfc2.predict(X_test_rfe)

In [37]:
# Score and Evaluate the new model 
from sklearn.metrics import confusion_matrix
cm_rfe = confusion_matrix(Y_test, Y_predict)
score_rfe = rfc2.score(X_test_rfe, Y_test)
score_rfe

0.8917212915756252

In [38]:
# Get column names
columns = list(x.columns)

In [39]:
# Get the ranking of the features. Ranking 1 for selected features
ranking = rfe.ranking_
ranking

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3, 10,  1,  1,  5,  1,  6,
        1,  8, 15,  1,  1, 20,  4,  1,  1, 22,  1,  1,  2,  1, 23, 13,  1,
        9,  1,  1, 18, 21, 14, 17, 12,  7, 19, 11, 16,  1,  1,  1,  1,  1,
        1])

In [40]:
# Get the feature importance scores
feature_importance = rfc2.feature_importances_

In [41]:
rfc2.feature_importances_

array([0.19554628, 0.0878563 , 0.03253313, 0.01564362, 0.02860215,
       0.03221751, 0.036948  , 0.14323469, 0.06111904, 0.01645635,
       0.01329755, 0.00830553, 0.01225396, 0.01834334, 0.02139305,
       0.017123  , 0.01493613, 0.01958553, 0.01391324, 0.01897367,
       0.01734367, 0.04255997, 0.02450924, 0.01577495, 0.0171443 ,
       0.01626753, 0.01560075, 0.01516285, 0.00890803, 0.01844669])

In [42]:
# Create the dataframe of the Features selected, Ranking and their importance
rfe_selected = pd.DataFrame()



In [43]:
rfe_selected = pd.concat([pd.DataFrame(columns), 
                          pd.DataFrame(ranking),
                          pd.DataFrame(feature_importance)], axis=1)


In [44]:
rfe_selected

Unnamed: 0,0,0.1,0.2
0,age,1,0.195546
1,campaign,1,0.087856
2,pdays,1,0.032533
3,previous,1,0.015644
4,emp.var.rate,1,0.028602
5,cons.price.idx,1,0.032218
6,cons.conf.idx,1,0.036948
7,euribor3m,1,0.143235
8,nr.employed,1,0.061119
9,job_blue-collar,1,0.016456


In [45]:
rfe_selected.columns = ["Feature Name", "Ranking", "Feature Importance"]


In [46]:
rfe_selected.head()

Unnamed: 0,Feature Name,Ranking,Feature Importance
0,age,1,0.195546
1,campaign,1,0.087856
2,pdays,1,0.032533
3,previous,1,0.015644
4,emp.var.rate,1,0.028602


In [47]:
# Display selected features with Ranking = 1    
rfe_selected[(rfe_selected.Ranking == 1)]['Feature Name']

0                               age
1                          campaign
2                             pdays
3                          previous
4                      emp.var.rate
5                    cons.price.idx
6                     cons.conf.idx
7                         euribor3m
8                       nr.employed
9                   job_blue-collar
12                   job_management
13                      job_retired
15                     job_services
17                   job_technician
20                  marital_married
21                   marital_single
24               education_basic.9y
25            education_high.school
27    education_professional.course
28      education_university.degree
30                  default_unknown
33                      housing_yes
35                         loan_yes
36                contact_telephone
46                  day_of_week_mon
47                  day_of_week_thu
48                  day_of_week_tue
49                  day_of_w

Have a nice day!