In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline

In [None]:
# Configure matplotlib text rendering for the Chinese titles/labels used in the plots:
# prefer the SimHei font for sans-serif text (assumes SimHei is installed on this machine)
# and turn off the Unicode minus sign for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the three-class wine dataset as a (features, target) pair of arrays.
(features, target) = load_wine(return_X_y=True)
# Last expression of the cell: display both array shapes as a sanity check.
(features.shape, target.shape)

In [None]:
# Seed for the train/test split so the partition is reproducible across runs.
RANDOM_STATE = 42

# Split the data 7:3 into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=RANDOM_STATE,
)

In [None]:
# Baseline without PCA: standardize the features, then fit logistic regression.
raw_clf = make_pipeline(StandardScaler(), LogisticRegression())
# fit() returns the fitted pipeline itself, so prediction can be chained onto it.
pred_test_raw = raw_clf.fit(X_train, y_train).predict(X_test)

In [None]:
# PCA without any preprocessing: project the raw (unscaled) features onto
# two principal components, then fit logistic regression on the projection.
unscaled_steps = [PCA(n_components=2), LogisticRegression()]
unscaled_clf = make_pipeline(*unscaled_steps)
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

In [None]:
# PCA with preprocessing: standardize the features first, reduce to two
# principal components, then fit logistic regression.
std_clf = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(),
).fit(X_train, y_train)  # fit() returns the fitted pipeline
pred_test_std = std_clf.predict(X_test)

In [None]:
# Report the test-set classification accuracy of each of the three pipelines.
#
# The original cell used the Python 2 print statement (`print u'...', value`), which is a
# SyntaxError on Python 3. Each message is now built as ONE string and passed to print()
# in parentheses: that form is valid on Python 3 and, because only a single object is
# printed, produces exactly the same text on Python 2 (no tuple repr).
for label, predictions in (
    (u'\n不使用PCA，预测准确率:', pred_test_raw),
    (u'\n使用PCA但不预处理，预测准确率:', pred_test),
    (u'\n使用PCA且预处理，预测准确率:', pred_test_std),
):
    accuracy = metrics.accuracy_score(y_test, predictions)
    # A single space separates label and value, matching the old print-statement output.
    print(u'{} {:.2%}'.format(label, accuracy))

In [None]:
# Pull the fitted PCA step out of each pipeline. make_pipeline names every step after
# its lowercased class name, hence the 'pca' key.
pca, pca_std = (clf.named_steps['pca'] for clf in (unscaled_clf, std_clf))

In [None]:
# Print the first principal component of each fitted PCA. Note that a component is an
# axis in feature space: the direction along which the data has the largest variance.
#
# The original cell used the Python 2 print statement, which is a SyntaxError on
# Python 3. Header and array are printed by separate single-argument print() calls,
# which run on both interpreters and reproduce the Python 2 layout (header line, then
# the array starting at column 0 on the following line).
print(u'\n未预处理第一主成分:')
print(pca.components_[0])
print(u'\n预处理第一主成分:')
print(pca_std.components_[0])

In [None]:
# Project the training data with each fitted PCA so it can be plotted.
X_train_nostd = pca.transform(X_train)
# The standardized pipeline's PCA was fitted on scaled data, so apply its scaler first.
scaler = std_clf.named_steps['standardscaler']
X_train_std = pca_std.transform(scaler.transform(X_train))

FIG_SIZE = (10, 7)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)

# (class label, colour, marker) used to draw each of the three classes.
class_styles = [(0, 'blue', '^'), (1, 'red', 's'), (2, 'green', 'o')]

# Left panel: PCA on unscaled data; right panel: PCA after standardization.
panels = [
    (ax1, X_train_nostd, u'PCA降维后的训练集'),
    (ax2, X_train_std, u'特征放缩+PCA降维后的训练集'),
]

for ax, projected, title in panels:
    for cls, colour, marker in class_styles:
        # Boolean mask selecting the training rows that belong to this class.
        in_class = y_train == cls
        ax.scatter(projected[in_class, 0], projected[in_class, 1],
                   color=colour,
                   label='class %s' % cls,
                   alpha=0.5,
                   marker=marker)
    ax.set_title(title)
    ax.set_xlabel(u'第一主成分')
    ax.set_ylabel(u'第二主成分')
    ax.legend(loc='upper right')
    ax.grid()

plt.tight_layout()
