In [8]:
# Recursive Feature Elimination
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [9]:
# Read the feature-selection CSV; the file's first column becomes the index.
csv_path = '../csv_files/p4feature-selection.csv'
df = pd.read_csv(csv_path, index_col=0)
# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,C1,C4,C6p,C7,C5p,T4p,T3p,T5p,S1p,S2p,S3p,C2,C3p,Industry_Bins,Y1,Y2
0,0.013297,0.671107,1.282356,-0.561408,0.054797,0.168664,0.17437,0.310576,-0.147291,0.165044,-0.128181,1.0,0,1,0,1
1,1.259277,-0.641901,-0.759933,-0.861441,2.432764,0.229323,0.099791,-0.01641,-1.241195,0.781034,-0.147795,0.0,1,1,1,0
2,-0.4475,0.412402,-0.759933,-1.331894,-0.23239,0.118072,0.177832,1.107408,2.766871,-0.974778,0.044531,1.0,1,1,1,0
3,0.888201,0.39099,-0.759933,-1.282655,-0.276002,-1.244015,0.233984,1.093939,2.091573,-0.603481,0.697712,1.0,1,1,1,1
4,-0.619728,-1.30862,1.027869,0.887747,-0.021399,-0.574359,0.023742,-0.357698,-0.365024,0.448793,-0.006539,1.0,0,3,0,1


In [10]:
# Remove the 'C6p' feature before selection.
# NOTE(review): the reason for excluding this column is not recorded here.
# `errors='ignore'` keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns=['C6p'], errors='ignore')
df.head()

Unnamed: 0,C1,C4,C7,C5p,T4p,T3p,T5p,S1p,S2p,S3p,C2,C3p,Industry_Bins,Y1,Y2
0,0.013297,0.671107,-0.561408,0.054797,0.168664,0.17437,0.310576,-0.147291,0.165044,-0.128181,1.0,0,1,0,1
1,1.259277,-0.641901,-0.861441,2.432764,0.229323,0.099791,-0.01641,-1.241195,0.781034,-0.147795,0.0,1,1,1,0
2,-0.4475,0.412402,-1.331894,-0.23239,0.118072,0.177832,1.107408,2.766871,-0.974778,0.044531,1.0,1,1,1,0
3,0.888201,0.39099,-1.282655,-0.276002,-1.244015,0.233984,1.093939,2.091573,-0.603481,0.697712,1.0,1,1,1,1
4,-0.619728,-1.30862,0.887747,-0.021399,-0.574359,0.023742,-0.357698,-0.365024,0.448793,-0.006539,1.0,0,3,0,1


In [11]:
# Show the dtype of every remaining column (bare expression -> rendered as cell output).
df.dtypes

C1               float64
C4               float64
C7               float64
C5p              float64
T4p              float64
T3p              float64
T5p              float64
S1p              float64
S2p              float64
S3p              float64
C2               float64
C3p                int64
Industry_Bins      int64
Y1                 int64
Y2                 int64
dtype: object

In [12]:
# Convert the frame to a plain NumPy array and split it by position:
#   X -> every column except the last two (the features)
#   Y -> the second-to-last column (the target); the last column is not used here
array = df.values
X, Y = array[:, :-2], array[:, -2]

In [13]:
# Base classifier that RFE repeatedly refits while eliminating attributes.
model = LogisticRegression()
# Build the RFE selector keeping 3 attributes. The count is passed by keyword:
# newer scikit-learn releases accept `n_features_to_select` only as a keyword,
# and the keyword form also works on older releases.
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(X, Y)
# Summarize the selection: boolean mask of kept attributes, then their ranking
# (rank 1 marks a selected attribute).
print(rfe.support_)
print(rfe.ranking_)

[False False False False False False False False  True False  True  True
 False]
[ 9  4 10  7  5  3  6  2  1 11  1  1  8]




In [14]:
# Repeat RFE for every subset size from 1 up to the number of feature columns,
# printing which attributes are kept at each size.
n_features = X.shape[1]  # column count; unlike len(X[0]) this also works when X has no rows
for i in range(1, n_features + 1):
    print(i)
    # Fresh base classifier for each run so no fitted state carries over.
    model = LogisticRegression()
    # Build the RFE selector keeping the i best attributes. The count is passed
    # by keyword because newer scikit-learn releases reject it positionally.
    rfe = RFE(model, n_features_to_select=i)
    rfe = rfe.fit(X, Y)
    # Summarize the selection: boolean mask of kept attributes, then their ranking.
    print('Model with the best', i, 'features')
    print(rfe.support_)
    print(rfe.ranking_)

1
Model with the best 1 features
[False False False False False False False False False False False  True
 False]
[11  6 12  9  7  5  8  4  3 13  2  1 10]
2
Model with the best 2 features
[False False False False False False False False False False  True  True
 False]
[10  5 11  8  6  4  7  3  2 12  1  1  9]
3
Model with the best 3 features
[False False False False False False False False  True False  True  True
 False]
[ 9  4 10  7  5  3  6  2  1 11  1  1  8]
4
Model with the best 4 features
[False False False False False False False  True  True False  True  True
 False]
[ 8  3  9  6  4  2  5  1  1 10  1  1  7]
5
Model with the best 5 features
[False False False False False  True False  True  True False  True  True
 False]
[7 2 8 5 3 1 4 1 1 9 1 1 6]
6
Model with the best 6 features
[False  True False False False  True False  True  True False  True  True
 False]
[6 1 7 4 2 1 3 1 1 8 1 1 5]
7
Model with the best 7 features
[False  True False False  True  True False  True  True False  T



Model with the best 9 features
[False  True False  True  True  True  True  True  True False  True  True
 False]
[3 1 4 1 1 1 1 1 1 5 1 1 2]
10
Model with the best 10 features
[False  True False  True  True  True  True  True  True False  True  True
  True]
[2 1 3 1 1 1 1 1 1 4 1 1 1]
11
Model with the best 11 features
[ True  True False  True  True  True  True  True  True False  True  True
  True]
[1 1 2 1 1 1 1 1 1 3 1 1 1]
12
Model with the best 12 features
[ True  True  True  True  True  True  True  True  True False  True  True
  True]
[1 1 1 1 1 1 1 1 1 2 1 1 1]
13
Model with the best 13 features
[ True  True  True  True  True  True  True  True  True  True  True  True
  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1]


