##  数据归一化

In [None]:
from sklearn.preprocessing import MinMaxScaler
data = [[-1,2],[-0.5,6],[0,10],[1,10]]

In [2]:
import pandas as pd
pd.DataFrame(data)

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,10


In [3]:
scaler = MinMaxScaler()
scaler = scaler.fit(data)
res = scaler.transform(data)
res

array([[0.  , 0.  ],
       [0.25, 0.5 ],
       [0.5 , 1.  ],
       [1.  , 1.  ]])

In [4]:
res_ = scaler.fit_transform(data)
scaler.inverse_transform(res)
#归一化到（0，1）以外的范围
data = [[-1,2],[-0.5,6],[0,10],[1,10]]
scaler = MinMaxScaler(feature_range=[5,10])
result = scaler.fit_transform(data)
result

array([[ 5.  ,  5.  ],
       [ 6.25,  7.5 ],
       [ 7.5 , 10.  ],
       [10.  , 10.  ]])

## 使用Numpy实现归一化

In [5]:
import numpy as np
X = np.array([[-1,2],[-0.5,6],[0,10],[1,10]])
X

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 10. ]])

In [6]:
X_nor = (X-X.min(axis=0))/(X.max(axis=0)-X.min(axis=0))
X_nor

array([[0.  , 0.  ],
       [0.25, 0.5 ],
       [0.5 , 1.  ],
       [1.  , 1.  ]])

##  数据标准化

In [7]:
from sklearn.preprocessing import StandardScaler
data = [[-1,2],[-0.5,6],[0,10],[1,10]]

In [8]:
scaler = StandardScaler()
scaler.fit(data)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [9]:
scaler.mean_

array([-0.125,  7.   ])

In [10]:
scaler.var_

array([ 0.546875, 11.      ])

In [11]:
x_std = scaler.transform(data)
x_std

array([[-1.18321596, -1.50755672],
       [-0.50709255, -0.30151134],
       [ 0.16903085,  0.90453403],
       [ 1.52127766,  0.90453403]])

In [12]:
x_std.mean()

0.0

In [13]:
x_std.var()

1.0

## 缺失值

In [18]:
import pandas as pd
data = pd.read_csv('Narrativedata.csv',index_col=0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [20]:
Age = data.loc[:,'Age'].values.reshape(-1,1)
Age[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [nan],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [nan],
       [31.],
       [nan]])

In [29]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer()
imp_median = SimpleImputer(strategy='median')
imp_0 = SimpleImputer(strategy='constant',fill_value=0)

In [30]:
imp_mean = imp_mean.fit_transform(Age)
imp_median = imp_median.fit_transform(Age)
imp_0 = imp_0.fit_transform(Age)

In [31]:
imp_mean[:20]

array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [29.69911765],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ],
       [ 4.        ],
       [58.        ],
       [20.        ],
       [39.        ],
       [14.        ],
       [55.        ],
       [ 2.        ],
       [29.69911765],
       [31.        ],
       [29.69911765]])

In [33]:
Embarked = data.loc[:,'Embarked'].values.reshape(-1,1)
imp_mode = SimpleImputer(strategy='most_frequent')
data.loc[:,'Embarked'] = imp_mode.fit_transform(Embarked)

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  891 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [35]:
data.loc[:,'Age'] = imp_median
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  891 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [42]:
data_ = pd.read_csv(r'Narrativedata.csv',index_col=0)
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [37]:
data_.loc[:,'Age'] = data.loc[:,'Age'].fillna(data_.loc[:,'Age'].median())

In [38]:
data_.loc[:,'Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

##  编码与哑变量

In [43]:
data_ = pd.read_csv(r'Narrativedata.csv',index_col=0)
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [49]:
from sklearn.preprocessing import LabelEncoder
y = data.iloc[:,-1]
le = LabelEncoder()
label = le.fit_transform(y)
data.iloc[:,-1] = label
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [47]:
le.classes_

array(['No', 'Unknown', 'Yes'], dtype=object)

In [50]:
from sklearn.preprocessing import OrdinalEncoder
data_ = data.copy()
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [51]:
OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [52]:
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,0
1,38.0,0.0,0.0,2
2,26.0,0.0,2.0,2
3,35.0,0.0,2.0,2
4,35.0,1.0,2.0,0


In [53]:
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [55]:
from sklearn.preprocessing import OneHotEncoder
X = data.iloc[:,1:-1]

In [56]:
enc = OneHotEncoder(categories='auto').fit(X)
res = enc.transform(X).toarray()
res

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [57]:
pd.DataFrame(enc.inverse_transform(res))

Unnamed: 0,0,1
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [58]:
enc.get_feature_names()

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [59]:
newdata = pd.concat([data,pd.DataFrame(res)],axis=1)

In [60]:
newdata.head()

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4
0,22.0,male,S,0,0.0,1.0,0.0,0.0,1.0
1,38.0,female,C,2,1.0,0.0,1.0,0.0,0.0
2,26.0,female,S,2,1.0,0.0,0.0,0.0,1.0
3,35.0,female,S,2,1.0,0.0,0.0,0.0,1.0
4,35.0,male,S,0,0.0,1.0,0.0,0.0,1.0


In [61]:
newdata.drop(['Sex','Embarked'],axis=1,inplace=True)

In [62]:
newdata.head()

Unnamed: 0,Age,Survived,0,1,2,3,4
0,22.0,0,0.0,1.0,0.0,0.0,1.0
1,38.0,2,1.0,0.0,1.0,0.0,0.0
2,26.0,2,1.0,0.0,0.0,0.0,1.0
3,35.0,2,1.0,0.0,0.0,0.0,1.0
4,35.0,0,0.0,1.0,0.0,0.0,1.0


In [63]:
newdata.columns = ['Age','Survived','Female','Male','Embarked_C','Embarked_Q','Embarked_S']

In [64]:
newdata.head()

Unnamed: 0,Age,Survived,Female,Male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,0,0.0,1.0,0.0,0.0,1.0
1,38.0,2,1.0,0.0,1.0,0.0,0.0
2,26.0,2,1.0,0.0,0.0,0.0,1.0
3,35.0,2,1.0,0.0,0.0,0.0,1.0
4,35.0,0,0.0,1.0,0.0,0.0,1.0


## 连续型特征处理：二值化与分段（分箱）

In [66]:
data2 = data.copy()

In [69]:
#二值化
from sklearn.preprocessing import Binarizer
X = data2.iloc[:,0].values.reshape(-1,1)
transformer = Binarizer(threshold=30).fit_transform(X)
transformer

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],

In [70]:
#分箱
#参数 n_bins:分箱个数  encode:编码方式（onehot、ordinal） strategy：定义箱宽（uniform等宽
                                                             #quantile等位 kmeans聚类）
from sklearn.preprocessing import KBinsDiscretizer
x = data.iloc[:,0].values.reshape(-1,1)
est = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='uniform')
est.fit_transform(x)

set(est.fit_transform(x).ravel())

{0.0, 1.0, 2.0}

In [71]:
est = KBinsDiscretizer(n_bins=3,encode='onehot',strategy='uniform')
est.fit_transform(x).toarray()

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

#  特征工程

## 第一步：理解业务
   选择特征方法： 过滤法、嵌入法、包装法、降维法

In [72]:
import pandas as pd
data = pd.read_csv('digit recognizor.csv')

In [73]:
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
X = data.iloc[:,1:]
Y = data.iloc[:,0]

In [75]:
X.shape

(42000, 784)

这个数据量相对夸张，使用该数据集体现特征工程的重要性

### 过滤法

   通过各种统计检验方法检验

In [78]:
#方差过滤
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold()#删除方差为0的特征
x_var0 = selector.fit_transform(X)
x_var0.shape

(42000, 708)

In [79]:
import numpy as np

#使用方差中位数过滤掉一半的特征
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
X_fsvar.shape

(42000, 392)

In [80]:
#若特征是伯努利随机变量，即二分类，假设p=0.8，即某种分类占到80%以上时删除特征
X_bvar = VarianceThreshold(.8*(1-0.8)).fit_transform(X)
X_bvar.shape

(42000, 685)

###  方差过滤对模型的影响
###  KNN和随机森林在不同方差过滤效果下的对比见课件

可见方差过滤法对于最近邻算法KNN、单棵决策树、SVM、神经网络、回归算法等需要遍历特征或升维的算法有着巨大的时间提升。

### 相关性检验

### 卡方过滤

卡方过滤是专门针对离散型标签的相关性过滤 计算非负特征与标签之间的卡方统计量来选出前K个。

In [82]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [83]:
X_fschi = SelectKBest(chi2,k=300).fit_transform(X_fsvar,Y)

In [84]:
X_fschi.shape

(42000, 300)

In [86]:
cross_val_score(RFC(n_estimators=10,random_state=0),X_fschi,Y,cv=5).mean()

0.9344761904761905

### K值的选择方法
### 卡方检验会返回卡方值和P值两个统计量，P值一般用0.01或0.05作为显著性水平的边界

### P值<=0.05或0.01时，数据的差异不是自然形成的，两组数据是相关的，且拒绝"两组数据是互相独立的"的原假设

### P值>0.05或0.01时，这些差异是很自然的样本误差，两组数据是独立的，且接受原假设

In [88]:
#获得卡方值
chivalue,pvalues_chi = chi2(X_fschi,Y)
pvalues_chi

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [91]:
#K取多少？ 我们想要消除所有p值大于0.05或0.01的特征
k = chivalue.shape[0] - (pvalues_chi > 0.05).sum()
k

300

##  F检验 
###  用来捕捉每个特征与标签之间的线性关系,在使用F检验前将数据转换成服从正态分布的方式
### 和卡方一样，我们希望选取p值<0.05或0.01的特征，此时这些特征与标签有显著线性关系

In [93]:
from sklearn.feature_selection import f_classif
F,pvalues_f = f_classif(X_fsvar,Y)
pvalues_f

array([0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 4.71193533e-220,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0

In [94]:
k = F.shape[0] - (pvalues_f > 0.05).sum()
k

392

## 互信息法

In [95]:
from sklearn.feature_selection import mutual_info_classif as MIC
res = MIC(X_fsvar,Y)
res

array([0.0710571 , 0.0905134 , 0.10234554, 0.1137118 , 0.11308373,
       0.10669574, 0.08646292, 0.05787499, 0.0727058 , 0.09526811,
       0.1229851 , 0.14416091, 0.1587147 , 0.16354452, 0.15512613,
       0.12895191, 0.09039754, 0.06511514, 0.0437595 , 0.01998074,
       0.06823193, 0.09763002, 0.12315382, 0.14446035, 0.1753956 ,
       0.2057214 , 0.22486789, 0.2368541 , 0.21230198, 0.18122938,
       0.14507989, 0.10216141, 0.07508401, 0.06290319, 0.0435529 ,
       0.03634107, 0.0574674 , 0.07920195, 0.10285394, 0.12582467,
       0.13474193, 0.14972095, 0.16627092, 0.1809702 , 0.18815739,
       0.17381212, 0.15199213, 0.13632227, 0.11856176, 0.09239249,
       0.08307583, 0.0673328 , 0.05874591, 0.04994503, 0.06721987,
       0.06861094, 0.0883413 , 0.0996729 , 0.11039987, 0.10658101,
       0.10457963, 0.11292875, 0.11967756, 0.11671268, 0.11314019,
       0.11730469, 0.11525121, 0.10881448, 0.09323867, 0.07917485,
       0.07260919, 0.04304431, 0.04952909, 0.06601812, 0.07465

In [97]:
k = res.shape[0] - sum(res <= 0)
k

392

## 过滤法总结
### 建议先使用方差过滤，再使用互信息法来捕捉相关性

# 嵌入法Embedded

In [98]:
#参数 estimator:带feature_importances_或者coef_属性的模型都可使用
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
RFC_ = RFC(n_estimators = 10,random_state=0)

In [99]:
X_embedded = SelectFromModel(RFC_,threshold=0.005).fit_transform(X,Y)
#780个特征 分配后每个特征的特征重要性都会很低，所以0.005是个很大的阈值

In [100]:
X_embedded.shape

(42000, 47)

In [101]:
#threshold的选择： 学习曲线
import numpy as np
import matplotlib.pyplot as plt

RFC_.fit(X,Y).feature_importances_
threshold = np.linspace(0,(RFC_.fit(X,Y).feature_importances_).max(),20)

score = []
for i in threshold:
    x_embedded = SelectFromModel(RFC_,threshold=i).fit_transform(X,Y)
    once = cross_val_score(RFC_,x_embedded,Y,cv=5).mean()
    score.append(once)
plt.plot(threshold,score)
plt.show()

由于嵌入法比方差过滤更加具体到模型的表现，所以得出的特征训练的模型效果高于方差过滤。

## 总结

嵌入法下，我们很容易就能够实现特征选择的目标：减少计算量，提升模型表现。因此，比起要思考很多统计量的过滤法来说，嵌入法可能是更有效的一种方法。只是计算会比过滤法慢很多，所以大型数据中还是会优先考虑过滤法。结合了过滤法和嵌入法，产生了包装法

# 包装法wrapper
包装法的效果是所有特征选择方法中最利于提升模型表现的，它可以使用很少的特征达到很优秀的效果。在特征数目相同时，包装法和嵌入法的效果可以匹敌，且比嵌入法更快，虽然也有十分庞大的计算量不适用与太大型的数据。

In [102]:
from sklearn.feature_selection import RFE
RFC_ = RFC(n_estimators=10,random_state=0)
selector = RFE(RFC_,n_features_to_select=340,step=50).fit(X,Y)
selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False,  True,
        True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True,  True,  True,
        True,  True,

In [103]:
selector.support_.sum()

340

In [104]:
selector.ranking_

array([10,  9,  8,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  6,  6,
        5,  6,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  6,  7,  7,
        7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  5,  4,
        4,  5,  3,  4,  4,  4,  5,  4,  5,  7,  6,  7,  7,  7,  8,  8,  8,
        8,  8,  8,  8,  8,  6,  7,  4,  3,  1,  2,  3,  3,  1,  1,  1,  1,
        1,  3,  3,  4,  5,  5,  5,  8,  8,  9,  9,  9,  9,  8,  9,  9,  4,
        4,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  4,
        5,  5,  9,  9, 10, 10, 10, 10,  7,  4,  4,  3,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  5,  8, 10, 10, 10,
       10,  9,  4,  4,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  3,  4, 10, 10, 10, 10,  9,  7,  4,  3,  2,  2,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        4,  4, 10,  9, 10

In [108]:
X_wrapper = selector.transform(X)
cross_val_score(RFC_,X_wrapper,Y,cv=5).mean()

0.9379761904761905

# 特征选择总结
过滤法：通过各种统计检验方法选择重要的特征

嵌入法：具体到模型的表现和特征处理算法的选择，依赖于算法自身的feature_importances或coef_

包装法：同样依赖于嵌入法依赖的两个属性。不同的是我们往往使用一个目标函数（RFE，递归特征消除法）来帮助选择特征，而不是自己输入某个指标或阈值。每过一轮舍弃一些特征

经验来说，过滤法更快速，但更粗糙。包装法和嵌入法更准确，比较适合具体到算法去调整，但计算量较大，运行时间长。数据量很大时，优先使用方差过滤和互信息法调整，再用其他特征选择方法。逻辑回归时，优先使用嵌入法。SVM时，优先使用包装法。迷茫时从过滤法走起。