# 特征选择

## 1. 去除方差小的特征

In [1]:
from sklearn.feature_selection import VarianceThreshold

# Toy dataset: 6 samples, each described by 3 binary (0/1) features.
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]

# A binary feature that equals one value with probability p has variance
# p * (1 - p). Using p = 0.8 as the cut-off therefore removes every feature
# that takes the same value in more than 80% of the samples.
p = .8
sel = VarianceThreshold(threshold=p * (1 - p))

# Last expression of the cell: the data with low-variance columns removed.
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

## 2. 基于单变量统计特征选择

In [5]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Load the iris dataset and pull out the feature matrix and the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Show the shape and the first five rows of the full feature matrix.
print('原始特征：')
print(X.shape)
print(X[:5])

print()

# Univariate selection: score every feature against the labels with the
# chi-squared statistic and keep only the 2 best-scoring columns.
selector = SelectKBest(chi2, k=2)
X_new = selector.fit_transform(X, y)

# Show the shape and the first five rows of the reduced feature matrix.
print('选取的特征：')
print(X_new.shape)
print(X_new[:5])

原始特征：
(150, 4)
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]

选取的特征：
(150, 2)
[[ 1.4  0.2]
 [ 1.4  0.2]
 [ 1.3  0.2]
 [ 1.5  0.2]
 [ 1.4  0.2]]


## 3. 基于模型的特征选择

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

# Load the iris dataset: X is the feature matrix, y holds the class labels.
iris = load_iris()
X, y = iris.data, iris.target
print('原始特征：')
print(X.shape)
print(X[:5, :])

# Fit a random forest; its feature_importances_ serve as per-feature scores.
# random_state is pinned because the forest is randomized: without a seed the
# importances (and potentially the selected columns) change on every run,
# so the notebook would not be reproducible under Restart & Run All.
clf = RandomForestClassifier(random_state=0)
clf = clf.fit(X, y)
print('特征得分：')
print(clf.feature_importances_)

# Select features with the already-fitted forest (prefit=True skips refitting).
# No threshold is given, so SelectFromModel falls back to its default for
# tree ensembles: keep features whose importance is at least the mean importance.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print('选取的特征：')
print(X_new.shape)
print(X_new[:5, :])

原始特征：
(150, 4)
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]
特征得分：
[ 0.13665984  0.02527954  0.52110549  0.31695513]
选取的特征：
(150, 2)
[[ 1.4  0.2]
 [ 1.4  0.2]
 [ 1.3  0.2]
 [ 1.5  0.2]
 [ 1.4  0.2]]
