In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score, ShuffleSplit
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv(r'C:\Users\thinkpad\Documents\GitHub\PHBS_MLF_2018\data\2year.arff.csv')
target='class'#二分类的名称
x_columns = [x for x in dataset.columns if x not in [target]]
X = dataset[x_columns]
y = dataset['class']
names = X.columns
Xarray = X.values
yarray = y.values
ranks = {}

#  Univariate feature selection

### Pearson Correlation

In [3]:
f, pval  = f_classif(Xarray, yarray)
ranks = sorted(zip(f,names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist1=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist1.append(Rank)
print(Ranklist1)

[15, 8, 6, 45, 18, 3, 31, 56, 54, 10, 33, 27, 50, 30, 13, 20, 57, 25, 61, 29, 39, 28, 59, 14, 5, 23, 21, 19, 11, 63, 60, 53, 47, 55, 22, 58, 40, 9, 36, 62, 16, 34, 35, 37, 49, 38, 48, 26, 24, 44, 7, 51, 2, 1, 12, 41, 43, 32, 46, 64, 52, 17, 42, 4]


### Random Forest Regression Based Ranking

# Logistic model and regularization

### L1 regularization

In [4]:
lr = LogisticRegression(C=1.0, penalty="l1", random_state=0)
lr.fit(Xarray,yarray)
lrcoef = lr.coef_[-1]
ranks = sorted(zip(np.abs(lrcoef),names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist2 =[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist2.append(Rank)
print(Ranklist2)

[4, 6, 17, 31, 56, 24, 60, 44, 14, 64, 16, 27, 30, 63, 58, 21, 50, 62, 61, 43, 12, 2, 18, 5, 20, 33, 54, 36, 29, 39, 26, 55, 28, 35, 3, 13, 53, 7, 10, 9, 41, 19, 47, 51, 49, 8, 45, 1, 23, 38, 15, 32, 40, 34, 59, 11, 37, 22, 46, 57, 42, 52, 25, 48]


### L2 regularization

In [5]:
lr = LogisticRegression(C=1.0, penalty="l2", random_state=0)
lr.fit(Xarray,yarray)
lrcoef = lr.coef_[-1]
ranks = sorted(zip(np.abs(lrcoef),names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist3=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist3.append(Rank)
print(Ranklist3)

[57, 38, 43, 21, 42, 55, 54, 17, 29, 37, 46, 35, 39, 53, 64, 32, 16, 50, 51, 5, 28, 48, 49, 45, 41, 33, 20, 19, 22, 59, 56, 2, 14, 18, 47, 27, 6, 36, 62, 30, 34, 60, 1, 3, 24, 25, 4, 63, 61, 23, 40, 44, 13, 12, 31, 58, 52, 15, 26, 7, 8, 9, 11, 10]


# Random Forest（multivariable）

In [6]:
rf = RandomForestRegressor()
rf.fit(Xarray,yarray)
ranks = sorted(zip(rf.feature_importances_ , names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist4=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist4.append(Rank)
print(Ranklist4)

[50, 61, 49, 43, 5, 14, 64, 62, 23, 58, 48, 47, 32, 59, 11, 41, 60, 63, 44, 21, 17, 8, 39, 18, 25, 52, 2, 56, 26, 35, 42, 46, 27, 1, 33, 22, 20, 53, 24, 9, 19, 3, 34, 4, 38, 7, 31, 54, 30, 28, 51, 57, 37, 55, 15, 10, 13, 6, 29, 16, 12, 45, 40, 36]


# Recursive Feature Elimination 

In [7]:
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(Xarray,yarray)
ranks = sorted(zip(rfe.ranking_ , names), reverse=False)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:,1]
Ranklist5=[]
for name in names:
    rank = Ranks[(Ranks.namename==name)].index.tolist()
    Rank = rank[-1]+1
    Ranklist5.append(Rank)
print(Ranklist5)

[57, 38, 42, 8, 45, 56, 54, 20, 25, 37, 46, 33, 34, 53, 64, 28, 15, 52, 49, 1, 26, 50, 48, 43, 41, 31, 30, 19, 2, 58, 51, 17, 3, 10, 47, 23, 12, 36, 62, 27, 32, 60, 7, 6, 14, 22, 16, 63, 61, 21, 40, 44, 24, 11, 39, 59, 55, 4, 29, 35, 9, 18, 5, 13]


# Get the mean absolute rank of each factor

In [9]:
Meanrank=[]
for i in range(64):
    meanrank = (Ranklist1[i]+Ranklist2[i]+Ranklist3[i]+Ranklist4[i]+Ranklist5[i])/5
    Meanrank.append(meanrank)

MeanRank=np.array(Meanrank)
result = sorted(zip(MeanRank , names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

       0       1
0   15.8  Attr58
1   18.0  Attr29
2   19.8  Attr20
3   20.0  Attr46
4   20.2  Attr44
5   22.2  Attr64
6   22.6  Attr54
7   23.2  Attr53
8   23.8  Attr33
9   23.8  Attr34
10  24.4  Attr21
11  24.6  Attr61
12  24.6  Attr63
13  24.8  Attr43
14  25.0  Attr24
15  25.4  Attr27
16  26.2  Attr37
17  26.4  Attr25
18  27.2  Attr22
19  27.4  Attr40
20  28.2  Attr38
21  28.2  Attr62
22  28.4  Attr16
23  28.4  Attr41
24  28.6  Attr36
25  28.8  Attr47
26  29.0   Attr9
27  29.6   Attr4
28  29.8  Attr28
29  30.2   Attr2
..   ...     ...
34  31.2  Attr55
35  31.4   Attr3
36  33.2   Attr5
37  33.8  Attr12
38  34.4  Attr26
39  34.6  Attr32
40  34.8  Attr45
41  35.2  Attr42
42  35.2  Attr59
43  35.8  Attr56
44  35.8  Attr60
45  36.6   Attr1
46  37.0  Attr13
47  37.8  Attr11
48  38.8  Attr39
49  39.6  Attr17
50  39.8  Attr49
51  39.8   Attr8
52  40.0  Attr57
53  41.2  Attr10
54  41.4  Attr48
55  42.0  Attr15
56  42.6  Attr23
57  45.6  Attr52
58  47.0  Attr31
59  50.4  Attr18
60  50.8  Attr

# Get the mean relative rank of each factor

In [10]:
Meanrank_STD=[]

for i in range(64):
    meanrank = (Ranklist1[i]+Ranklist2[i]+Ranklist3[i]+Ranklist4[i]+Ranklist5[i])/5
    std = np.std([Ranklist1[i],Ranklist2[i],Ranklist3[i],Ranklist4[i],Ranklist5[i]])
    meanrank_std = meanrank*std
    Meanrank_STD.append(meanrank_std)

MeanRank_STD=np.array(Meanrank_STD)
result = sorted(zip(MeanRank_STD , names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

              0       1
0    163.711180  Attr58
1    181.076779  Attr29
2    218.956449  Attr16
3    228.162833  Attr21
4    230.824609  Attr46
5    247.825694  Attr12
6    264.750449  Attr13
7    267.985265  Attr41
8    272.297085  Attr50
9    306.331021  Attr20
10   312.785227  Attr59
11   328.227199  Attr26
12   332.464985  Attr53
13   352.175670  Attr33
14   359.505580  Attr25
15   359.877455  Attr19
16   366.399638  Attr63
17   374.226421  Attr64
18   381.080703  Attr52
19   389.507895   Attr9
20   402.181551  Attr24
21   403.069729  Attr44
22   409.978700   Attr4
23   430.333274  Attr27
24   436.913125  Attr28
25   438.728753  Attr43
26   439.766036  Attr54
27   440.328557  Attr36
28   456.910414  Attr34
29   457.745124  Attr61
..          ...     ...
34   484.456750  Attr47
35   497.280434  Attr38
36   504.896320  Attr35
37   510.790907  Attr51
38   527.146058   Attr3
39   531.475251  Attr40
40   534.021519  Attr55
41   538.092727  Attr22
42   568.680578  Attr31
43   582.230942 