In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Read the Boston data set directly from its hosted CSV file
DATA_URL = 'http://web.pdx.edu/~gerbing/data/Boston.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(506, 15)

In [4]:
# Preview the first rows to check which columns were read from the file
df.head()

Unnamed: 0.1,Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [10]:
# Remove the leftover row-number column that read_csv names 'Unnamed: ...'.
# Dropping by name and rebinding `df` (rather than dropping position 0 in
# place) keeps this cell idempotent: running it again finds nothing left to
# drop instead of silently removing the first real predictor.
unnamed_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=unnamed_cols)
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


Store the predictor variables in data structure X, and target variable in data structure y

In [13]:
# Names of the columns used as predictors
pred_vars = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat',
]

# Predictor matrix X and target vector y, both taken from df
X = df.loc[:, pred_vars]
y = df.loc[:, 'medv']

View number of predictor variables

In [14]:
# Number of names in the predictor list
len(pred_vars)

13

### Manual Selection

Check uniqueness by looking at the variance inflation factor (VIF).

In [16]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Uniqueness: one VIF per predictor, computed from the raw data matrix of X.
# NOTE(review): X has no intercept column, so these are presumably
# uncentered VIFs and may read higher than the usual ones - confirm whether a
# constant should be added before computing them.
design = X.values
vif = pd.DataFrame({
    'Predictor': X.columns,
    'VIF': [variance_inflation_factor(design, col_idx)
            for col_idx in range(design.shape[1])],
})

# Relevance: correlation of every column of df with the target variable
cr = df.corr()['medv'].round(3)

# Look each predictor up in cr by its label. Integer positions (cr[i]) only
# line up when df's column order matches X's exactly, and positional access
# on a label-indexed Series is deprecated in recent pandas.
vif['Relevance'] = vif['Predictor'].map(cr)
vif

Unnamed: 0,Predictor,VIF,Relevance
0,crim,2.100373,-0.388
1,zn,2.844013,0.36
2,indus,14.485758,-0.484
3,chas,1.152952,0.175
4,nox,73.894947,-0.427
5,rm,77.948283,0.695
6,age,21.38685,-0.377
7,dis,14.699652,0.25
8,rad,15.167725,-0.382
9,tax,61.227274,-0.469


### Automated feature selection

Best when there are many predictor variables

#### Automated Univariate Selection

In [17]:
from sklearn.feature_selection import SelectKBest, f_regression

# Score the predictors against y with f_regression and keep the five best
selector = SelectKBest(score_func=f_regression, k=5)
selector.fit(X, y)

# Boolean mask in X's column order: True where the feature was kept
selected = selector.get_support()
selected

array([False, False,  True, False, False,  True, False, False, False,
        True,  True, False,  True])

Display the selected variables by subsetting the original X data structure

In [18]:
# Keep only the columns flagged True in the univariate selection mask
X2 = X.loc[:, selected]
X2.head()

Unnamed: 0,indus,rm,tax,ptratio,lstat
0,2.31,6.575,296,15.3,4.98
1,7.07,6.421,242,17.8,9.14
2,7.07,7.185,242,17.8,4.03
3,2.18,6.998,222,18.7,2.94
4,2.18,7.147,222,18.7,5.33


#### Automated Multivariate Selection

Recursive feature elimination (RFE). Fits the model, identifies the weakest feature on the basis of that fit, and removes it. This is repeated on the remaining features, pruning the weakest one each round, until only the requested number of features is left.

In [20]:
from sklearn.feature_selection import RFE  # recursive feature elimination
from sklearn.linear_model import LinearRegression

# Model that RFE refits while pruning: an ordinary LinearRegression instance
estimator = LinearRegression()

# Remove one feature per round (step=1) until five remain.
# NOTE(review): X is passed unscaled; if RFE ranks features by coefficient
# size, the result depends on each column's units - confirm this is intended.
selector = RFE(estimator=estimator, n_features_to_select=5, step=1)
selector.fit(X, y)

Print the selection mask (`support_`) and the rank of every feature (`ranking_`), then subset the X data frame to include just the selected feature variables.

In [21]:
# Show the boolean selection mask, then the rank assigned to every feature;
# both arrays follow X's column order
for rfe_result in (selector.support_, selector.ranking_):
    print(rfe_result)

[False False False  True  True  True False  True False False  True False
 False]
[4 6 5 1 1 1 9 1 3 7 1 8 2]


In [22]:
# Keep only the columns that RFE marked as selected
rfe_mask = selector.support_
X2 = X.loc[:, rfe_mask]
X2.head()

Unnamed: 0,chas,nox,rm,dis,ptratio
0,0,0.538,6.575,4.09,15.3
1,0,0.469,6.421,4.9671,17.8
2,0,0.469,7.185,4.9671,17.8
3,0,0.458,6.998,6.0622,18.7
4,0,0.458,7.147,6.0622,18.7


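View the rankings of all the features. The selected features have rank 1; the features that did not make the final cut have higher ranks, and the higher the rank, the earlier the feature was eliminated.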

In [23]:
# Pair every feature name with its RFE rank, then show the table ordered by
# rank and laid out horizontally
rnk = pd.DataFrame({
    'Feature': X.columns,
    'Rank': selector.ranking_,
})
rnk.sort_values(by='Rank').T

Unnamed: 0,3,4,5,7,10,12,8,0,2,1,9,11,6
Feature,chas,nox,rm,dis,ptratio,lstat,rad,crim,indus,zn,tax,black,age
Rank,1,1,1,1,1,2,3,4,5,6,7,8,9
