In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score, ShuffleSplit
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv(r'C:\Users\thinkpad\Documents\GitHub\PHBS_MLF_2018\data\4year.arff.csv')
target='class'#二分类的名称
x_columns = [x for x in dataset.columns if x not in [target]]
X = dataset[x_columns]
y = dataset['class']
names = X.columns
Xarray = X.values
yarray = y.values
ranks = {}

#  Univariate feature selection

### Pearson Correlation

In [3]:
f, pval  = f_classif(Xarray, yarray)
ranks = sorted(zip(f,names), reverse=False)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist1=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist1.append(Rank)
print(Ranklist1)

[64, 45, 41, 21, 16, 40, 60, 22, 5, 46, 59, 54, 32, 61, 1, 52, 24, 55, 2, 48, 50, 58, 9, 3, 47, 53, 37, 26, 62, 38, 7, 28, 14, 8, 63, 25, 43, 44, 56, 18, 35, 36, 17, 19, 12, 23, 29, 51, 42, 20, 39, 15, 34, 27, 49, 57, 13, 4, 31, 30, 6, 11, 33, 10]


### Random Forest Regression Based Ranking

# Logistic model and regularization

### L1 regularization

In [4]:
lr = LogisticRegression(C=1.0, penalty="l1", random_state=0)
lr.fit(Xarray,yarray)
lrcoef = lr.coef_[-1]
ranks = sorted(zip(np.abs(lrcoef),names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist2 =[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist2.append(Rank)
print(Ranklist2)

[64, 7, 10, 27, 59, 44, 5, 60, 17, 9, 1, 23, 34, 6, 61, 18, 35, 4, 22, 48, 33, 63, 62, 42, 28, 20, 56, 39, 16, 38, 25, 50, 32, 29, 3, 15, 46, 11, 14, 13, 41, 21, 58, 52, 45, 12, 49, 2, 24, 30, 8, 26, 53, 40, 57, 19, 37, 43, 36, 47, 51, 54, 31, 55]


### L2 regularization

In [5]:
lr = LogisticRegression(C=1.0, penalty="l2", random_state=0)
lr.fit(Xarray,yarray)
lrcoef = lr.coef_[-1]
ranks = sorted(zip(np.abs(lrcoef),names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist3=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist3.append(Rank)
print(Ranklist3)

[57, 38, 47, 15, 59, 54, 56, 12, 28, 37, 50, 29, 44, 55, 61, 25, 11, 51, 64, 3, 32, 53, 62, 45, 42, 27, 48, 23, 14, 24, 58, 2, 10, 13, 52, 26, 6, 36, 22, 30, 31, 60, 4, 17, 40, 20, 1, 63, 46, 19, 41, 33, 18, 16, 43, 21, 49, 34, 35, 5, 8, 39, 9, 7]


# Random Forest（multivariable）

In [6]:
rf = RandomForestRegressor()
rf.fit(Xarray,yarray)
ranks = sorted(zip(rf.feature_importances_ , names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist4=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist4.append(Rank)
print(Ranklist4)

[58, 48, 34, 30, 19, 17, 61, 59, 32, 57, 45, 53, 21, 63, 25, 36, 52, 60, 54, 39, 1, 6, 47, 11, 22, 26, 3, 51, 4, 46, 43, 56, 41, 2, 8, 16, 28, 23, 13, 15, 24, 5, 37, 18, 44, 9, 38, 33, 31, 50, 35, 64, 49, 40, 12, 10, 20, 7, 42, 29, 14, 55, 62, 27]


# Recursive Feature Elimination 

In [7]:
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(Xarray,yarray)
ranks = sorted(zip(rfe.ranking_ , names), reverse=False)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist5=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist5.append(Rank)
print(Ranklist5)

[58, 4, 17, 25, 57, 36, 6, 20, 13, 1, 2, 28, 47, 3, 63, 11, 19, 7, 64, 42, 30, 18, 61, 38, 27, 12, 54, 34, 10, 45, 59, 53, 24, 32, 5, 14, 39, 15, 23, 8, 40, 60, 49, 48, 50, 9, 37, 62, 55, 44, 16, 29, 43, 33, 52, 22, 35, 21, 31, 41, 51, 56, 26, 46]


# Get the mean relative rank of each factor

In [9]:
Meanrank=[]
for i in range(64):
    meanrank = (Ranklist1[i]+Ranklist2[i]+Ranklist3[i]+Ranklist4[i]+Ranklist5[i])/5
    Meanrank.append(meanrank)

MeanRank=np.array(Meanrank)
result = sorted(zip(MeanRank , names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

       0       1
0   14.6  Attr46
1   16.8  Attr34
2   16.8  Attr40
3   19.0   Attr9
4   19.2  Attr36
5   21.2  Attr29
6   21.8  Attr58
7   23.6   Attr4
8   24.2  Attr33
9   25.6  Attr39
10  25.8  Attr38
11  25.8  Attr56
12  26.0  Attr61
13  26.2  Attr35
14  27.6  Attr26
15  27.8  Attr24
16  27.8  Attr51
17  28.2  Attr17
18  28.4  Attr16
19  28.4   Attr2
20  29.0  Attr64
21  29.2  Attr21
22  29.8   Attr3
23  30.0  Attr10
24  30.4  Attr60
25  30.8  Attr44
26  30.8  Attr47
27  30.8  Attr57
28  31.2  Attr54
29  31.4  Attr11
..   ...     ...
34  33.2  Attr25
35  33.4  Attr52
36  34.2  Attr41
37  34.6  Attr28
38  34.6   Attr8
39  35.0  Attr59
40  35.4  Attr18
41  35.6  Attr13
42  36.0  Attr20
43  36.4  Attr42
44  37.4  Attr12
45  37.6  Attr14
46  37.6   Attr7
47  37.8  Attr32
48  38.2  Attr30
49  38.2  Attr45
50  38.2   Attr6
51  38.4  Attr31
52  39.4  Attr53
53  39.6  Attr22
54  39.6  Attr27
55  39.6  Attr49
56  41.2  Attr19
57  42.0   Attr5
58  42.2  Attr15
59  42.2  Attr48
60  42.6  Attr

# Get the mean relative rank of each factor

In [10]:
Meanrank_STD=[]

for i in range(64):
    meanrank = (Ranklist1[i]+Ranklist2[i]+Ranklist3[i]+Ranklist4[i]+Ranklist5[i])/5
    std = np.std([Ranklist1[i],Ranklist2[i],Ranklist3[i],Ranklist4[i],Ranklist5[i]])
    meanrank_std = meanrank*std
    Meanrank_STD.append(meanrank_std)

MeanRank_STD=np.array(Meanrank_STD)
result = sorted(zip(MeanRank_STD , names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

              0       1
0     84.931351  Attr46
1     99.692198  Attr36
2    122.720000   Attr4
3    123.637043  Attr40
4    141.739197  Attr59
5    187.321115   Attr9
6    188.070812   Attr1
7    197.469690  Attr34
8    213.469376  Attr41
9    275.667638  Attr33
10   281.423308  Attr54
11   300.204368  Attr30
12   318.027581  Attr25
13   321.993923  Attr38
14   328.421178  Attr58
15   329.833568  Attr13
16   345.029840  Attr28
17   369.394952  Attr51
18   380.279527  Attr26
19   394.624812  Attr57
20   400.956006  Attr17
21   402.693777  Attr39
22   408.008243  Attr50
23   409.038881  Attr16
24   417.099113  Attr56
25   420.338481   Attr3
26   434.229845  Attr49
27   437.422092  Attr60
28   441.123047  Attr29
29   461.840256  Attr21
..          ...     ...
34   491.221781  Attr53
35   497.630858  Attr12
36   499.074061  Attr47
37   515.006274  Attr45
38   535.120173  Attr61
39   539.629894   Attr2
40   548.655366  Attr52
41   551.513719  Attr63
42   552.371976  Attr64
43   607.109875 