In [9]:
from sklearn.datasets import load_boston
from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score, ShuffleSplit
import numpy as np
import pandas as pd

In [10]:
dataset = pd.read_csv(r'C:\Users\thinkpad\Documents\GitHub\PHBS_MLF_2018\data\1year.arff.csv')
target='class'#二分类的名称
x_columns = [x for x in dataset.columns if x not in [target]]
X = dataset[x_columns]
y = dataset['class']
names = X.columns
Xarray = X.values
yarray = y.values
ranks = {}

#  Univariate feature selection

### Pearson Correlation

In [11]:
f, pval  = f_classif(Xarray, yarray)
ranks = sorted(zip(f,names),reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist1=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist1.append(Rank)
print(Ranklist1)

[23, 1, 2, 27, 60, 5, 19, 63, 29, 13, 32, 15, 47, 21, 26, 18, 64, 20, 41, 55, 44, 39, 43, 22, 12, 24, 36, 17, 7, 56, 42, 6, 16, 9, 35, 33, 46, 14, 61, 25, 37, 62, 54, 53, 48, 38, 34, 45, 51, 10, 3, 8, 30, 31, 11, 57, 4, 58, 59, 50, 52, 49, 40, 28]


### Random Forest Regression Based Ranking

# Logistic model and regularization

### L1 regularization

In [12]:
lr = LogisticRegression(C=1.0, penalty="l1", random_state=0)
lr.fit(Xarray,yarray)
lrcoef = lr.coef_[-1]
ranks = sorted(zip(np.abs(lrcoef),names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist2 =[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist2.append(Rank)
print(Ranklist2)

[5, 35, 6, 32, 63, 14, 8, 18, 22, 62, 4, 20, 23, 11, 61, 15, 29, 9, 19, 54, 49, 38, 27, 12, 41, 31, 60, 40, 13, 43, 34, 45, 33, 30, 2, 25, 59, 10, 46, 17, 36, 16, 58, 57, 48, 28, 53, 1, 24, 21, 7, 3, 42, 39, 64, 50, 37, 47, 44, 55, 51, 56, 26, 52]


### L2 regularization

In [13]:
lr = LogisticRegression(C=1.0, penalty="l2", random_state=0)
lr.fit(Xarray,yarray)
lrcoef = lr.coef_[-1]
ranks = sorted(zip(np.abs(lrcoef),names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist3=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist3.append(Rank)
print(Ranklist3)

[53, 38, 39, 22, 62, 44, 45, 20, 24, 33, 49, 31, 51, 47, 64, 29, 19, 46, 57, 5, 13, 54, 58, 42, 35, 30, 15, 23, 14, 50, 55, 9, 12, 17, 52, 21, 8, 32, 59, 36, 34, 60, 1, 2, 27, 26, 3, 61, 63, 25, 43, 48, 18, 16, 40, 56, 37, 28, 41, 6, 10, 4, 11, 7]


# Random Forest（multivariable）

In [14]:
rf = RandomForestRegressor()
rf.fit(Xarray,yarray)
ranks = sorted(zip(rf.feature_importances_ , names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist4=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist4.append(Rank)
print(Ranklist4)

[49, 59, 58, 57, 36, 19, 16, 64, 5, 48, 4, 24, 25, 9, 35, 47, 50, 15, 43, 38, 33, 6, 53, 21, 11, 34, 3, 42, 8, 46, 18, 63, 56, 1, 27, 45, 17, 14, 31, 32, 28, 2, 39, 10, 30, 7, 23, 44, 41, 40, 51, 62, 52, 55, 37, 20, 29, 22, 26, 12, 13, 61, 54, 60]


# Recursive Feature Elimination 

In [15]:
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(Xarray,yarray)
ranks = sorted(zip(rfe.ranking_ , names), reverse=False)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist5=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist5.append(Rank)
print(Ranklist5)

[1, 33, 15, 31, 61, 34, 7, 19, 23, 32, 16, 26, 8, 6, 64, 13, 21, 2, 20, 42, 43, 3, 51, 24, 27, 14, 55, 39, 17, 57, 9, 49, 11, 25, 30, 22, 52, 4, 62, 28, 35, 59, 45, 44, 48, 29, 41, 60, 63, 18, 5, 54, 37, 38, 58, 53, 36, 10, 40, 50, 47, 56, 12, 46]


# Get the mean absolute rank of each factor

In [17]:
Meanrank=[]
for i in range(64):
    meanrank = (Ranklist1[i]+Ranklist2[i]+Ranklist3[i]+Ranklist4[i]+Ranklist5[i])/5
    Meanrank.append(meanrank)

MeanRank=np.array(Meanrank)
result = sorted(zip(MeanRank , names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

       0       1
0   11.8  Attr29
1   14.8  Attr38
2   16.4  Attr34
3   18.4  Attr18
4   18.8  Attr14
5   19.0   Attr7
6   20.6   Attr9
7   21.0  Attr11
8   21.8  Attr51
9   22.8  Attr50
10  23.2  Attr12
11  23.2   Attr6
12  24.0   Attr3
13  24.2  Attr24
14  24.4  Attr16
15  25.2  Attr25
16  25.6  Attr33
17  25.6  Attr46
18  26.2   Attr1
19  26.6  Attr26
20  27.6  Attr40
21  28.0  Attr22
22  28.6  Attr57
23  28.6  Attr63
24  29.2  Attr35
25  29.2  Attr36
26  30.8  Attr13
27  30.8  Attr47
28  31.6  Attr31
29  32.2  Attr28
..   ...     ...
34  33.8   Attr4
35  34.0  Attr41
36  34.4  Attr32
37  34.6  Attr60
38  34.6  Attr61
39  35.0  Attr52
40  35.8  Attr53
41  35.8  Attr54
42  36.0  Attr19
43  36.4  Attr21
44  36.4  Attr37
45  36.6  Attr17
46  36.8   Attr8
47  37.6  Attr10
48  38.6  Attr64
49  38.8  Attr20
50  39.4  Attr43
51  39.8  Attr42
52  40.2  Attr45
53  42.0  Attr55
54  42.0  Attr59
55  42.2  Attr48
56  45.2  Attr62
57  46.4  Attr23
58  47.2  Attr56
59  48.4  Attr49
60  50.0  Attr

# Get the mean relative rank of each factor

In [18]:
Meanrank_STD=[]

for i in range(64):
    meanrank = (Ranklist1[i]+Ranklist2[i]+Ranklist3[i]+Ranklist4[i]+Ranklist5[i])/5
    std = np.std([Ranklist1[i],Ranklist2[i],Ranklist3[i],Ranklist4[i],Ranklist5[i]])
    meanrank_std = meanrank*std
    Meanrank_STD.append(meanrank_std)

MeanRank_STD=np.array(Meanrank_STD)
result = sorted(zip(MeanRank_STD , names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

              0       1
0     44.403135  Attr29
1    107.517440  Attr41
2    125.708895  Attr12
3    138.330526  Attr38
4    168.164653   Attr9
5    172.192190  Attr34
6    178.527416  Attr40
7    188.541280  Attr26
8    225.892661  Attr50
9    237.308117  Attr24
10   260.365970  Attr46
11   261.433782  Attr36
12   261.896926   Attr7
13   275.315046  Attr30
14   277.004040  Attr18
15   281.473108  Attr14
16   303.406326  Attr25
17   306.936095  Attr16
18   325.263669   Attr6
19   327.997741  Attr28
20   362.035786  Attr57
21   364.699328  Attr11
22   385.920000  Attr45
23   409.061020  Attr53
24   409.747462   Attr4
25   438.829104  Attr33
26   442.098632  Attr59
27   452.498405  Attr54
28   452.768734  Attr51
29   465.920000  Attr21
..          ...     ...
34   511.374618   Attr3
35   520.623786  Attr31
36   522.838505  Attr47
37   524.167912  Attr19
38   560.839371  Attr22
39   565.920000   Attr1
40   570.814155  Attr58
41   578.038094   Attr5
42   615.875642  Attr39
43   617.341480 