In [1]:
# Recursive Feature Elimination
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the feature-selection table from disk and preview its first rows.
df = pd.read_csv(
    '../csv_files/p2feature-selection.csv',
    index_col=0,  # the first CSV column holds the row labels, not a feature
)
df.head()

Unnamed: 0,C1,C4,C7,C5p,T4p,T3p,T5p,S1p,S2p,S3p,C2,C3p,Industry_Bins,Y1,Y2
0,0.030756,0.669243,-0.477847,0.056127,0.168742,0.174345,0.310826,-0.14697,0.165628,-0.128062,1.0,0,1,0,1
1,1.279824,-0.643726,-0.78931,2.434275,0.229402,0.099767,-0.016168,-1.24088,0.781633,-0.147676,0.0,1,1,1,0
2,-0.431183,0.410546,-1.277684,-0.231083,0.11815,0.177807,1.10768,2.767209,-0.974221,0.044649,1.0,1,1,1,0
3,0.907828,0.389135,-1.226569,-0.274697,-1.243939,0.233959,1.09421,2.091907,-0.602916,0.697829,1.0,1,1,1,1
4,-0.603838,-1.310426,1.026514,-0.020075,-0.574282,0.023718,-0.357465,-0.364704,0.449384,-0.006421,1.0,0,3,0,1


In [3]:
# Inspect the dtype pandas inferred for every column of the loaded frame.
df.dtypes

C1               float64
C4               float64
C7               float64
C5p              float64
T4p              float64
T3p              float64
T5p              float64
S1p              float64
S2p              float64
S3p              float64
C2               float64
C3p                int64
Industry_Bins      int64
Y1                 int64
Y2                 int64
dtype: object

In [4]:
# Work on the raw NumPy matrix behind the frame. Its last two columns are the
# label columns (Y1, Y2); every column before them is a candidate feature.
array = df.values
X, Y = array[:, :-2], array[:, -1]  # X: all features, Y: final column as target

In [5]:
# Base classifier that RFE refits repeatedly to score the attributes.
model = LogisticRegression()
# Build the RFE selector and keep the 3 best attributes.
# n_features_to_select is passed by keyword: scikit-learn 1.0+ declares it
# keyword-only, so the positional form RFE(model, 3) raises a TypeError there.
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(X, Y)
# Summarize the selection: support_ is the boolean keep-mask over the columns
# of X, ranking_ gives each column's rank (1 means selected).
print(rfe.support_)
print(rfe.ranking_)

[False False False False False False False False  True False  True False
  True]
[11  2  7  3  9 10  5  4  1  8  1  6  1]




In [6]:
# Repeat the selection for every subset size, from a single feature up to all
# columns of X, printing the keep-mask and ranking for each size.
for i in range(1, X.shape[1] + 1):
    print(i)
    # Fresh base classifier for this run, so no fitted state is carried over.
    model = LogisticRegression()
    # Build the RFE selector and keep the i best attributes.
    # n_features_to_select is passed by keyword: scikit-learn 1.0+ declares it
    # keyword-only, so the positional form RFE(model, i) raises a TypeError.
    rfe = RFE(model, n_features_to_select=i)
    rfe = rfe.fit(X, Y)
    # Summarize the selection of the attributes for this subset size.
    print('Model with the best', i, 'features')
    print(rfe.support_)
    print(rfe.ranking_)

1
Model with the best 1 features
[False False False False False False False False False False  True False
 False]
[13  4  9  5 11 12  7  6  3 10  1  8  2]
2
Model with the best 2 features
[False False False False False False False False False False  True False
  True]
[12  3  8  4 10 11  6  5  2  9  1  7  1]
3
Model with the best 3 features
[False False False False False False False False  True False  True False
  True]
[11  2  7  3  9 10  5  4  1  8  1  6  1]
4
Model with the best 4 features
[False  True False False False False False False  True False  True False
  True]
[10  1  6  2  8  9  4  3  1  7  1  5  1]
5
Model with the best 5 features
[False  True False  True False False False False  True False  True False
  True]
[9 1 5 1 7 8 3 2 1 6 1 4 1]
6
Model with the best 6 features
[False  True False  True False False False  True  True False  True False
  True]
[8 1 4 1 6 7 2 1 1 5 1 3 1]
7
Model with the best 7 features
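[False  True False  True False False  True  True  True False  True False
  True]
[7 1 3 1 5 6 1 1 1 4 1 2 1]
8
Model with the best 8 features
[False  True False  True False False  True  True  True False  True  True
  True]
[6 1 2 1 4 5 1 1 1 3 1 1 1]
9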
Model with the best 9 features
[False  True  True  True False False  True  True  True False  True  True
  True]
[5 1 1 1 3 4 1 1 1 2 1 1 1]
10
Model with the best 10 features
[False  True  True  True False False  True  True  True  True  True  True
  True]
[4 1 1 1 2 3 1 1 1 1 1 1 1]
11
Model with the best 11 features
[False  True  True  True  True False  True  True  True  True  True  True
  True]
[3 1 1 1 1 2 1 1 1 1 1 1 1]
12
Model with the best 12 features
[False  True  True  True  True  True  True  True  True  True  True  True
  True]
[2 1 1 1 1 1 1 1 1 1 1 1 1]
13
Model with the best 13 features
[ True  True  True  True  True  True  True  True  True  True  True  True
  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1]


