In [1]:
# Recursive Feature Elimination
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the evaluation CSV, taking its first column as the row index,
# then preview the top rows of the resulting frame.
df = pd.read_csv(filepath_or_buffer='../csv_files/evalcodep1.csv', index_col=0)
df.head()

Unnamed: 0,C1,C4,C6,C7,C5p,T4p,T3p,T5p,S1p,S2p,S3p,C2,C3p,Industry_Bins,Y1,Y2
0,0.458312,0.630418,0.673869,0.207673,0.570284,0.441786,0.427704,0.428334,0.428161,0.397992,0.379065,1.0,0,1,0,1
1,1.0,0.384735,0.290706,0.147494,0.698942,0.444796,0.426904,0.424499,0.400971,0.414195,0.378486,0.0,1,1,1,0
2,0.327366,0.582011,0.449107,0.078383,0.555596,0.439275,0.427741,0.437681,0.493771,0.363924,0.384058,1.0,1,1,1,0
3,0.814322,0.578004,0.423274,0.084319,0.553374,0.37169,0.428343,0.437523,0.479331,0.375764,0.401495,1.0,1,1,1,1
4,0.286445,0.259982,0.590068,0.729255,0.566377,0.404918,0.426088,0.420497,0.422876,0.405605,0.382601,1.0,0,3,0,1


In [3]:
# Remove the C2 column from the working frame.
# errors='ignore' keeps this cell idempotent: it rebinds `df`, so running it a
# second time would otherwise raise KeyError because C2 is already gone.
df = df.drop(columns='C2', errors='ignore')
df.head()

Unnamed: 0,C1,C4,C6,C7,C5p,T4p,T3p,T5p,S1p,S2p,S3p,C3p,Industry_Bins,Y1,Y2
0,0.458312,0.630418,0.673869,0.207673,0.570284,0.441786,0.427704,0.428334,0.428161,0.397992,0.379065,0,1,0,1
1,1.0,0.384735,0.290706,0.147494,0.698942,0.444796,0.426904,0.424499,0.400971,0.414195,0.378486,1,1,1,0
2,0.327366,0.582011,0.449107,0.078383,0.555596,0.439275,0.427741,0.437681,0.493771,0.363924,0.384058,1,1,1,0
3,0.814322,0.578004,0.423274,0.084319,0.553374,0.37169,0.428343,0.437523,0.479331,0.375764,0.401495,1,1,1,1
4,0.286445,0.259982,0.590068,0.729255,0.566377,0.404918,0.426088,0.420497,0.422876,0.405605,0.382601,0,3,0,1


In [4]:
# Show the dtype of every column currently in df (displayed as the cell output).
df.dtypes

C1               float64
C4               float64
C6               float64
C7               float64
C5p              float64
T4p              float64
T3p              float64
T5p              float64
S1p              float64
S2p              float64
S3p              float64
C3p                int64
Industry_Bins      int64
Y1                 int64
Y2                 int64
dtype: object

In [5]:
# Convert the frame to a plain NumPy array and split it by column position.
array = df.values
# X: the first 12 columns (positions 0-11) are the features.
# Y: the column at position 14 is the target.
# NOTE(review): per the df.dtypes listing, positions 0-11 are C1..C3p,
# position 12 (Industry_Bins) and 13 (Y1) are left out, and 14 is Y2 —
# confirm that this is the intended selection.
X, Y = array[:, :12], array[:, 14]

In [6]:
# Base classifier that RFE refits on each candidate feature subset.
model = LogisticRegression()
# Build the RFE selector that keeps 3 features.
# n_features_to_select is passed by keyword: positional use was deprecated in
# scikit-learn 0.23 and raises TypeError from 1.0 onwards.
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(X, Y)
# Summarize the selection:
#   support_  - boolean mask, True for each selected feature
#   ranking_  - rank per feature, 1 for the selected ones
print(rfe.support_)
print(rfe.ranking_)

[False  True False False  True False False False False  True False False]
[ 6  1  5  3  1  7  4 10  2  1  9  8]


In [7]:
# Repeat the RFE selection for every subset size, from a single feature up to
# all columns of X, printing the selection mask and ranking for each size.
for i in range(1, X.shape[1] + 1):
    print(i)
    # fresh base classifier for this run
    model = LogisticRegression()
    # build the RFE selector that keeps the best i features; the count is passed
    # by keyword because scikit-learn >= 1.0 rejects it as a positional argument
    rfe = RFE(model, n_features_to_select=i)
    rfe = rfe.fit(X, Y)
    # summarize the selection of the attributes
    print('Model with the best', i, 'features')
    print(rfe.support_)
    print(rfe.ranking_)

1
Model with the best 1 features
[False False False False False False False False False  True False False]
[ 8  2  7  5  3  9  6 12  4  1 11 10]
2
Model with the best 2 features
[False  True False False False False False False False  True False False]
[ 7  1  6  4  2  8  5 11  3  1 10  9]
3
Model with the best 3 features
[False  True False False  True False False False False  True False False]
[ 6  1  5  3  1  7  4 10  2  1  9  8]
4
Model with the best 4 features
[False  True False False  True False False False  True  True False False]
[5 1 4 2 1 6 3 9 1 1 8 7]
5
Model with the best 5 features
[False  True False  True  True False False False  True  True False False]
[4 1 3 1 1 5 2 8 1 1 7 6]
6
Model with the best 6 features
[False  True False  True  True False  True False  True  True False False]
[3 1 2 1 1 4 1 7 1 1 6 5]
7
Model with the best 7 features
[False  True  True  True  True False  True False  True  True False False]
[2 1 1 1 1 3 1 6 1 1 5 4]
8
Model with the best 8 features
