# 特征选择

## Filter - 过滤法

### 数据读取

In [None]:
import pandas as pd

# Load the dataset. index_col=0 tells pandas to use the first CSV column as
# the row index instead of generating a new 0..n-1 index.
raw_df = pd.read_csv("data-test-1.csv", index_col=0)
raw_df.head()

In [1]:
# Split the frame into target and predictors: "rating" is the label column,
# every other column stays in the feature matrix.
labels = raw_df["rating"]
raw_data = raw_df.drop(columns=["rating"])

### 无量纲化

In [ ]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler output is re-wrapped in a DataFrame
# without passing index/columns, so `data` carries default integer labels
# rather than the row/column labels of raw_data.
sc_X = StandardScaler(copy=True)
scaled_values = sc_X.fit_transform(raw_data)
data = pd.DataFrame(scaled_values)
data

### 方差选择法

In [ ]:

from sklearn.feature_selection import VarianceThreshold

# Variance filter, applied to the raw (unscaled) data on purpose: after
# standardisation every feature would have the same variance, so a variance
# threshold could no longer tell them apart.
# Features whose variance is not above 0.01 are dropped.
var_selector = VarianceThreshold(threshold=0.01)
data_after_var = var_selector.fit_transform(raw_data, labels)
data_after_var.shape

In [ ]:
# Attach the labels to the selected features and persist the result.
# The selected features are built on labels.index so that the column-wise
# concat pairs rows by position; a default 0..n-1 index would be aligned
# against the CSV index of `labels` and could produce misaligned rows / NaNs.
selected_features = pd.DataFrame(data_after_var, index=labels.index)
data_after_var = pd.concat([selected_features, labels], axis=1)
# NOTE(review): the output file name has no ".csv" extension (unlike
# "after_pca.csv" further down); kept as-is in case other code reads it.
data_after_var.to_csv("data_after_var")

### 皮尔森相关系数

In [2]:
# Keep only the three count columns used for the correlation analysis.
numeric_columns = ["suspectedCount", "curedCount", "deadCount"]
data_numerical = raw_data.loc[:, numeric_columns]
data_numerical

NameError: name 'raw_data' is not defined

In [ ]:
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

# Print the pearsonr result of each count column against the labels,
# in the same order as before.
for column_name in ("suspectedCount", "curedCount", "deadCount"):
    print(pearsonr(data_numerical[column_name], labels))

In [3]:
import numpy as np


def _abs_pearson_scores(X, y):
    """Score each feature column by the absolute Pearson correlation with y.

    SelectKBest keeps the features with the highest scores, so the absolute
    value is used: a strongly negative correlation is as informative as a
    strongly positive one and must not be ranked last.

    Args:
        X: 2-D feature matrix, one column per feature.
        y: target values, one per row of X.

    Returns:
        1-D array with one score per feature column.
    """
    columns = np.asarray(X).T
    return np.array([abs(pearsonr(column, y)[0]) for column in columns])


# Keep the two count features most correlated with the labels.
selector = SelectKBest(_abs_pearson_scores, k=2)
selected_values = selector.fit_transform(data_numerical, labels)
# Column names are passed as a list: a set has no defined order and is
# rejected by recent pandas versions.
data_numerical = pd.DataFrame(selected_values, columns=["feature1", "feature2"])
data_numerical

NameError: name 'SelectKBest' is not defined

# 降维

## PCA实现

In [ ]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share (here 85%) of the variance.
pca = PCA(n_components=0.85)
principal_components = pca.fit_transform(data)
data_after_pca = pd.DataFrame(principal_components)
data_after_pca

## 可视化

In [ ]:
# Integer class labels as a one-column frame named "rating".
# (The column name used to be passed as a set, which has no defined order
# and is rejected by recent pandas versions.)
scaled_labels = labels.astype("int").to_frame(name="rating")

# data_after_pca carries a default 0..n-1 index while the labels keep the
# index read from the CSV; give the components the labels' index so the
# column-wise concat pairs rows by position instead of producing NaNs.
pca_features = data_after_pca.copy()
pca_features.index = scaled_labels.index

# Use the integer labels built above (previously computed but never used).
data_after_pca_withlabels = pd.concat([pca_features, scaled_labels], axis=1)
data_after_pca_withlabels.to_csv("after_pca.csv")
data_after_pca_withlabels

In [ ]:
import matplotlib.pyplot as plt


def draw_graph(X, classes=(0, 1)):
    """Scatter-plot columns 0 and 1 of ``X``, one series per rating class.

    The function now plots the frame it is given instead of silently reading
    the global ``data_after_pca_withlabels``.

    Args:
        X: DataFrame with columns ``0`` and ``1`` (used as x / y coordinates)
            and a ``"rating"`` column holding the class of each row.
        classes: rating values to plot, one scatter series each. Defaults to
            ``(0, 1)``, the two values that were previously hard-coded.
    """
    for cls in classes:
        members = X.loc[X["rating"] == cls]
        plt.scatter(members[0], members[1], alpha=0.8, label=str(cls))
    plt.legend()
    plt.show()


# Pass the labelled PCA frame: the scaled `data` frame has no "rating" column.
draw_graph(data_after_pca_withlabels)