In [13]:
from sklearn.datasets import load_boston
from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score, ShuffleSplit
import numpy as np
import pandas as pd

In [14]:
dataset = pd.read_csv(r'C:\Users\thinkpad\Documents\GitHub\PHBS_MLF_2018\data\5year.arff.csv')
target='class'#二分类的名称
x_columns = [x for x in dataset.columns if x not in [target]]
X = dataset[x_columns]
y = dataset['class']
names = X.columns
Xarray = X.values
yarray = y.values
ranks = {}

#  Univariate feature selection

### Pearson Correlation

In [15]:
f, pval  = f_classif(Xarray, yarray)
ranks = sorted(zip(f,names), reverse=False)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist1=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist1.append(Rank)
print(Ranklist1)

[53, 38, 62, 1, 37, 44, 39, 8, 42, 16, 52, 25, 17, 40, 19, 22, 9, 41, 34, 50, 11, 48, 33, 45, 30, 23, 27, 14, 64, 55, 29, 59, 26, 15, 54, 46, 24, 13, 61, 4, 12, 32, 57, 51, 6, 2, 10, 47, 35, 3, 63, 36, 20, 21, 43, 60, 56, 58, 28, 5, 31, 49, 18, 7]


### Random Forest Regression Based Ranking

# Logistic model and regularization

### L1 regularization

In [16]:
lr = LogisticRegression(C=1.0, penalty="l1", random_state=0)
lr.fit(Xarray,yarray)
lrcoef = lr.coef_[-1]
ranks = sorted(zip(np.abs(lrcoef),names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist2 =[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist2.append(Rank)
print(Ranklist2)

[13, 22, 2, 49, 58, 24, 16, 44, 10, 34, 12, 40, 33, 28, 64, 32, 42, 21, 29, 47, 15, 7, 30, 23, 14, 39, 59, 56, 4, 31, 20, 62, 35, 50, 1, 25, 46, 11, 6, 41, 36, 18, 52, 45, 61, 43, 55, 3, 9, 51, 5, 19, 53, 48, 60, 8, 27, 17, 26, 63, 37, 57, 38, 54]


### L2 regularization

In [17]:
lr = LogisticRegression(C=1.0, penalty="l2", random_state=0)
lr.fit(Xarray,yarray)
lrcoef = lr.coef_[-1]
ranks = sorted(zip(np.abs(lrcoef),names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist3=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist3.append(Rank)
print(Ranklist3)

[41, 45, 51, 18, 61, 37, 47, 16, 24, 39, 40, 26, 32, 46, 64, 27, 12, 43, 57, 3, 21, 44, 59, 34, 49, 30, 33, 15, 14, 63, 56, 13, 10, 17, 42, 25, 5, 36, 58, 29, 23, 60, 1, 4, 19, 22, 2, 48, 62, 20, 50, 52, 54, 35, 38, 55, 28, 31, 53, 7, 8, 6, 9, 11]


# Random Forest（multivariable）

In [18]:
rf = RandomForestRegressor()
rf.fit(Xarray,yarray)
ranks = sorted(zip(rf.feature_importances_ , names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist4=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist4.append(Rank)
print(Ranklist4)

[40, 62, 31, 46, 19, 25, 43, 41, 17, 49, 21, 36, 32, 57, 15, 37, 64, 55, 61, 56, 1, 6, 60, 7, 20, 53, 3, 38, 13, 23, 59, 47, 52, 2, 4, 26, 30, 63, 10, 12, 8, 5, 34, 28, 22, 9, 29, 35, 45, 51, 33, 48, 54, 44, 14, 11, 24, 16, 50, 42, 27, 39, 58, 18]


# Recursive Feature Elimination 

In [19]:
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(Xarray,yarray)
ranks = sorted(zip(rfe.ranking_ , names), reverse=False)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist5=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist5.append(Rank)
print(Ranklist5)

[27, 2, 7, 20, 60, 33, 31, 23, 30, 1, 16, 21, 42, 17, 64, 37, 28, 25, 39, 49, 22, 26, 47, 32, 9, 18, 57, 40, 8, 63, 53, 58, 14, 55, 3, 24, 43, 4, 10, 12, 44, 61, 50, 48, 51, 13, 45, 5, 62, 19, 6, 36, 54, 41, 59, 11, 34, 29, 35, 52, 38, 56, 15, 46]


# Get the mean relative rank of each factor

In [20]:
Meanrank=[]
for i in range(64):
    meanrank = (Ranklist1[i]+Ranklist2[i]+Ranklist3[i]+Ranklist4[i]+Ranklist5[i])/5
    Meanrank.append(meanrank)

MeanRank=np.array(Meanrank)
result = sorted(zip(MeanRank , names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

       0       1
0   14.0  Attr21
1   17.8  Attr46
2   19.6  Attr40
3   20.6  Attr29
4   20.8  Attr35
5   24.4  Attr25
6   24.6  Attr41
7   24.6   Attr9
8   25.4  Attr38
9   26.2  Attr22
10  26.4   Attr8
11  26.8   Attr4
12  27.2  Attr64
13  27.4  Attr33
14  27.6  Attr48
15  27.6  Attr63
16  27.8  Attr10
17  27.8  Attr34
18  28.2  Attr11
19  28.2  Attr24
20  28.2  Attr47
21  28.2  Attr61
22  28.8  Attr50
23  29.0  Attr39
24  29.0  Attr56
25  29.2  Attr36
26  29.6  Attr12
27  29.6  Attr37
28  30.2  Attr58
29  30.6   Attr3
..   ...     ...
34  31.8  Attr45
35  32.6  Attr26
36  32.6  Attr28
37  32.6   Attr6
38  33.8   Attr2
39  33.8  Attr57
40  33.8  Attr60
41  34.8   Attr1
42  35.2  Attr42
43  35.2  Attr44
44  35.2   Attr7
45  35.8  Attr27
46  37.0  Attr18
47  37.6  Attr14
48  37.8  Attr54
49  38.2  Attr52
50  38.4  Attr59
51  38.8  Attr43
52  41.0  Attr20
53  41.4  Attr62
54  42.6  Attr49
55  42.8  Attr55
56  43.4  Attr31
57  44.0  Attr19
58  45.2  Attr15
59  45.8  Attr23
60  47.0  Attr

# Get the mean relative rank of each factor

In [21]:
for i in range(64):
    meanrank = (Ranklist1[i]+Ranklist2[i]+Ranklist3[i]+Ranklist4[i]+Ranklist5[i])/5
    std = np.std([Ranklist1[i],Ranklist2[i],Ranklist3[i],Ranklist4[i],Ranklist5[i]])
    meanrank_std = meanrank*std
    Meanrank_STD.append(meanrank_std)

MeanRank_STD=np.array(Meanrank_STD)
result = sorted(zip(MeanRank_STD , names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

           0       1
0   0.912022  Attr29
1   1.010493  Attr35
2   1.085630  Attr39
3   1.208312  Attr56
4   1.231522  Attr46
5   1.331745  Attr34
6   1.346957  Attr60
7   1.364583  Attr48
8   1.386967   Attr3
9   1.419040  Attr22
10  1.425620  Attr51
11  1.456696  Attr40
12  1.471780  Attr47
13  1.500624  Attr45
14  1.505290  Attr64
15  1.508498   Attr4
16  1.516314  Attr50
17  1.534408  Attr38
18  1.641153   Attr2
19  1.675786  Attr10
20  1.683318  Attr17
21  1.726324   Attr8
22  1.726911  Attr27
23  1.745352  Attr41
24  1.745427  Attr25
25  1.791206  Attr63
26  1.808906  Attr33
27  1.831984  Attr21
28  1.840819  Attr42
29  1.871413  Attr43
..       ...     ...
34  2.034907  Attr11
35  2.035109  Attr28
36  2.041671  Attr49
37  2.060738  Attr20
38  2.147502  Attr24
39  2.265770  Attr62
40  2.285350   Attr9
41  2.371998  Attr57
42  2.433976  Attr13
43  2.478713  Attr32
44  2.515576   Attr1
45  2.594165  Attr61
46  2.643314   Attr7
47  2.661180  Attr14
48  2.680388  Attr55
49  2.787276 