In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score, ShuffleSplit
import numpy as np
import pandas as pd

In [2]:
# Load the raw table, then separate the feature columns from the label column.
# NOTE(review): the path below is absolute and machine-specific; the notebook
# only runs where this exact file exists.
dataset = pd.read_csv(r'C:\Users\thinkpad\Documents\GitHub\PHBS_MLF_2018\data\3year.arff.csv')

target = 'class'  # name of the binary class label column
x_columns = [column for column in dataset.columns if column != target]

X = dataset[x_columns]
y = dataset[target]
names = X.columns

# Plain arrays of the features and the label, used by the fitting cells.
Xarray = X.values
yarray = y.values

# Placeholder; each ranking cell rebinds `ranks` to its own sorted list.
ranks = {}

#  Univariate feature selection

### Pearson Correlation (scored with the ANOVA F-test, `f_classif`)

In [3]:
# Score every feature against the label with the univariate F-test.
f, pval = f_classif(Xarray, yarray)

# (score, name) pairs ordered from the highest score down; ties fall back to
# comparing the names, also in descending order.
ranks = list(zip(f, names))
ranks.sort(reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:, 1]

# 1-based position of each feature in that ordering, reported in the original
# column order of `names`.
position_by_name = {
    feature: position
    for position, feature in enumerate(Ranks['namename'], start=1)
}
Ranklist1 = [position_by_name[name] for name in names]
print(Ranklist1)

[14, 5, 7, 54, 46, 10, 13, 31, 60, 4, 15, 21, 59, 12, 32, 28, 30, 17, 37, 58, 44, 2, 36, 9, 3, 29, 25, 34, 11, 23, 35, 64, 18, 38, 1, 19, 45, 6, 62, 43, 50, 56, 49, 55, 51, 47, 39, 16, 48, 53, 8, 42, 33, 26, 20, 57, 22, 40, 63, 41, 27, 24, 52, 61]


### Random Forest Regression Based Ranking

# Logistic model and regularization

### L1 regularization

In [4]:
# Rank features by the absolute size of their L1-penalised logistic
# regression coefficient (largest magnitude = rank 1).
#
# The solver is pinned to "liblinear": the L1 penalty is not supported by the
# "lbfgs" solver that newer scikit-learn releases use by default, so leaving
# `solver` unset makes this cell raise a ValueError there. "liblinear" was the
# default when this notebook was written, so results are unchanged on old
# installations.
# NOTE(review): the features are fed in unscaled, so coefficient magnitudes
# depend on each feature's units - confirm this is intended for the ranking.
lr = LogisticRegression(C=1.0, penalty="l1", solver="liblinear", random_state=0)
lr.fit(Xarray, yarray)

# Last row of the coefficient matrix (presumably the only row for a binary
# label - verify if the target ever has more than two classes).
lrcoef = lr.coef_[-1]

# (|coefficient|, name) pairs ordered from the largest magnitude down.
ranks = sorted(zip(np.abs(lrcoef), names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:, 1]

# 1-based position of each feature in that ordering, reported in the original
# column order of `names` (single pass instead of one DataFrame filter per name).
position_by_name = {
    feature: position
    for position, feature in enumerate(Ranks['namename'], start=1)
}
Ranklist2 = [position_by_name[name] for name in names]
print(Ranklist2)

[6, 19, 14, 26, 58, 24, 60, 49, 16, 8, 15, 28, 22, 64, 59, 33, 43, 63, 12, 45, 3, 1, 36, 11, 20, 29, 56, 37, 23, 39, 10, 50, 21, 34, 4, 17, 55, 13, 62, 9, 40, 30, 46, 48, 52, 5, 53, 2, 61, 32, 7, 31, 41, 35, 57, 25, 38, 27, 44, 51, 42, 54, 18, 47]


### L2 regularization

In [5]:
# Same ranking as the L1 cell, but with an L2-penalised logistic regression.
lr = LogisticRegression(C=1.0, penalty="l2", random_state=0)
lr.fit(Xarray, yarray)
lrcoef = lr.coef_[-1]  # last row of the fitted coefficient matrix

# Pair each absolute coefficient with its feature name, largest first.
coef_magnitudes = np.abs(lrcoef)
ranks = sorted(zip(coef_magnitudes, names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:, 1]

# Look up where each feature landed (1-based), keeping the column order of `names`.
position_by_name = {}
for position, feature in enumerate(Ranks['namename'], start=1):
    position_by_name[feature] = position
Ranklist3 = [position_by_name[name] for name in names]
print(Ranklist3)

[61, 40, 44, 19, 46, 50, 57, 12, 33, 38, 49, 30, 39, 56, 36, 31, 11, 55, 51, 5, 16, 52, 48, 45, 41, 34, 10, 20, 26, 62, 53, 6, 14, 18, 54, 29, 8, 37, 59, 32, 21, 60, 1, 2, 27, 23, 3, 64, 47, 24, 43, 42, 22, 17, 15, 63, 58, 25, 35, 4, 9, 28, 13, 7]


# Random Forest (multivariate)

In [6]:
# Rank features by random-forest importance (largest importance = rank 1).
#
# A fixed random_state makes the forest, and therefore the ranking, repeatable;
# without it every run of this cell produces a different Ranklist4.
# NOTE(review): a regressor is fitted on the class label here - confirm that a
# classifier was not intended.
rf = RandomForestRegressor(random_state=0)
rf.fit(Xarray, yarray)

# (importance, name) pairs ordered from the most important feature down.
ranks = sorted(zip(rf.feature_importances_, names), reverse=True)
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:, 1]

# 1-based position of each feature in that ordering, reported in the original
# column order of `names` (single pass instead of one DataFrame filter per name).
position_by_name = {
    feature: position
    for position, feature in enumerate(Ranks['namename'], start=1)
}
Ranklist4 = [position_by_name[name] for name in names]
print(Ranklist4)

[63, 60, 57, 47, 11, 12, 59, 52, 36, 53, 50, 48, 27, 64, 15, 34, 58, 61, 55, 25, 5, 4, 62, 9, 20, 14, 2, 54, 16, 39, 49, 42, 28, 1, 19, 22, 17, 31, 21, 18, 10, 23, 44, 24, 45, 3, 35, 40, 46, 29, 30, 56, 32, 43, 7, 6, 26, 8, 38, 37, 13, 51, 41, 33]


# Recursive Feature Elimination 

In [7]:
# Recursive feature elimination down to 5 features, driven by the `lr`
# estimator that an earlier cell left bound to that name.
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(Xarray, yarray)

# (RFE rank, name) pairs in ascending order; equal RFE ranks are ordered by name.
ranks = sorted(zip(rfe.ranking_, names))
Ranks = pd.DataFrame(ranks)
Ranks['namename'] = Ranks.iloc[:, 1]

# Turn the ordering into a 1-based position per feature, listed in the
# original column order of `names`.
position_by_name = dict(
    (feature, position)
    for position, feature in enumerate(Ranks['namename'], start=1)
)
Ranklist5 = [position_by_name[name] for name in names]
print(Ranklist5)

[54, 2, 57, 21, 59, 56, 25, 33, 27, 1, 30, 17, 16, 24, 58, 26, 35, 22, 50, 40, 3, 9, 15, 10, 11, 18, 51, 29, 12, 55, 60, 49, 14, 36, 4, 34, 52, 8, 61, 5, 37, 62, 42, 41, 45, 6, 48, 64, 32, 19, 7, 23, 31, 28, 53, 63, 38, 20, 39, 46, 43, 47, 13, 44]


# Get the mean relative rank of each factor

In [9]:
# Average each feature's rank over the five selection methods and list the
# features from the best (lowest) mean rank to the worst.
#
# The number of features and the number of methods are taken from the rank
# lists themselves rather than hard-coded, so the cell keeps working if the
# dataset gains or loses columns.
rank_lists = [Ranklist1, Ranklist2, Ranklist3, Ranklist4, Ranklist5]
Meanrank = [
    sum(feature_ranks) / len(rank_lists)
    for feature_ranks in zip(*rank_lists)
]

MeanRank = np.array(Meanrank)

# Ascending sort on (mean rank, name); equal means are ordered by name.
result = sorted(zip(MeanRank, names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

       0       1
0   13.6  Attr22
1   14.2  Attr21
2   16.4  Attr35
3   16.8  Attr24
4   16.8  Attr46
5   17.6  Attr29
6   19.0  Attr25
7   19.0  Attr33
8   19.0  Attr38
9   19.0  Attr51
10  20.8  Attr10
11  21.4  Attr40
12  24.0  Attr58
13  24.2  Attr36
14  24.8  Attr26
15  25.2   Attr2
16  25.4  Attr34
17  26.8  Attr61
18  27.4  Attr63
19  28.8  Attr12
20  28.8  Attr27
21  29.8  Attr54
22  30.4  Attr16
23  30.4  Attr55
24  30.4   Attr6
25  31.4  Attr50
26  31.6  Attr41
27  31.8  Attr11
28  31.8  Attr53
29  32.6  Attr13
..   ...     ...
34  34.8  Attr28
35  35.4  Attr17
36  35.4  Attr37
37  35.4   Attr8
38  35.6  Attr47
39  35.8   Attr3
40  35.8  Attr60
41  36.4  Attr43
42  36.4  Attr57
43  37.2  Attr48
44  38.4  Attr64
45  38.8  Attr52
46  39.4  Attr23
47  39.6   Attr1
48  40.0  Attr15
49  40.8  Attr62
50  41.0  Attr19
51  41.4  Attr31
52  42.2  Attr32
53  42.8  Attr56
54  42.8   Attr7
55  43.6  Attr18
56  43.6  Attr30
57  43.8  Attr59
58  44.0  Attr14
59  44.0  Attr45
60  44.0   Att

# Get the mean rank of each factor weighted by the standard deviation of its ranks across methods

In [10]:
# Score each feature by (mean rank) x (standard deviation of its ranks) over
# the five selection methods, then list features from the lowest score up.
# A low score needs both a good average rank and agreement between methods.
#
# The number of features and the number of methods are taken from the rank
# lists themselves rather than hard-coded.
rank_lists = [Ranklist1, Ranklist2, Ranklist3, Ranklist4, Ranklist5]

Meanrank_STD = []
for feature_ranks in zip(*rank_lists):
    meanrank = sum(feature_ranks) / len(feature_ranks)
    std = np.std(feature_ranks)  # NumPy default: population std (ddof=0)
    meanrank_std = meanrank * std
    Meanrank_STD.append(meanrank_std)

MeanRank_STD = np.array(Meanrank_STD)

# Ascending sort on (score, name); equal scores are ordered by name.
result = sorted(zip(MeanRank_STD, names), reverse=False)
Result = pd.DataFrame(result)
print(Result)

             0       1
0    91.402442  Attr16
1    99.091877  Attr33
2   104.775448  Attr29
3   154.121895  Attr36
4   186.511829  Attr26
5   192.278273  Attr53
6   222.428394  Attr21
7   237.207433  Attr24
8   239.430157  Attr38
9   241.232668  Attr25
10  248.953008  Attr58
11  260.745743  Attr54
12  263.797935  Attr22
13  280.796308  Attr46
14  282.583085  Attr51
15  304.632589  Attr40
16  307.715567  Attr12
17  325.133878  Attr35
18  358.347103  Attr34
19  366.506740  Attr50
20  379.160808  Attr61
21  389.635694  Attr28
22  395.510809  Attr45
23  430.356474  Attr49
24  433.399334  Attr10
25  434.421406  Attr52
26  439.224688  Attr59
27  440.654702  Attr63
28  450.718367  Attr41
29  456.729354  Attr57
..         ...     ...
34  503.941940  Attr62
35  508.874230   Attr8
36  546.858393  Attr17
37  553.804043   Attr2
38  582.411956   Attr6
39  593.632659  Attr60
40  596.284796  Attr30
41  611.195476  Attr23
42  620.210524  Attr27
43  622.989828  Attr47
44  624.673770  Attr55
45  629.149