# MNIST datasets

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml  # 从官方网站中下载数据

In [2]:
# Fetch the "mnist_784" dataset from OpenML.
# The version is pinned explicitly: without it fetch_openml resolves the
# floating "active" version, so a later re-run could silently load different
# data (and emit a multiple-active-versions warning).
mnist = fetch_openml("mnist_784", version=1)



In [4]:
# Show which fields the fetched dataset object exposes.
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [16]:
# Pull the feature matrix (X) and the label vector (Y) out of the dataset object.
X, Y = mnist.data, mnist.target

In [17]:
# Data conversion: copy the features and labels into float NumPy arrays and
# split them positionally into a training part and a held-out test part.
# The first N_TRAIN rows are used for training, everything after for testing.
N_TRAIN = 60000  # single source of truth for the split point

x_train = np.array(X[:N_TRAIN], dtype=float)
y_train = np.array(Y[:N_TRAIN], dtype=float)
x_test = np.array(X[N_TRAIN:], dtype=float)
y_test = np.array(Y[N_TRAIN:], dtype=float)

# Fail fast if the download was truncated: otherwise the error would only
# surface much later, inside the (slow) fit/score cells.
assert len(x_train) == N_TRAIN and len(x_test) > 0, "unexpected dataset size"

## 使用KNN算法进行分类识别

In [21]:
from sklearn.neighbors import KNeighborsClassifier

In [24]:
# Baseline: k-nearest-neighbours classifier trained on the un-reduced features.
# n_jobs=-1 lets scikit-learn use all available CPU cores for the neighbour search.
knn = KNeighborsClassifier(n_jobs=-1)
# %time prints the wall-clock cost of fitting; the fitted estimator is displayed as the cell output.
%time knn.fit(x_train, y_train)

Wall time: 19.2 s


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [26]:
# Mean accuracy of the baseline classifier on the held-out test split (timed).
%time knn_score = knn.score(x_test, y_test)
# Display the score as the cell output.
knn_score

Wall time: 2min 29s


0.9688

## 使用PCA进行降维处理

In [27]:
from sklearn.decomposition import PCA

In [28]:
# A float n_components in (0, 1) asks PCA to keep just enough components to
# explain that fraction of the variance (here 90%).
pca = PCA(n_components=0.9)
# Fit on the training split only, so the test split does not influence the projection.
%time pca.fit(x_train)

Wall time: 4.87 s


PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [29]:
# Project both splits into the reduced space using the already-fitted PCA.
x_train_de, x_test_de = (pca.transform(split) for split in (x_train, x_test))

In [30]:
# Same classifier configuration as the baseline, but trained on the PCA-reduced features.
knn_pca = KNeighborsClassifier(n_jobs=-1)
# %time prints the wall-clock cost of fitting; the fitted estimator is displayed as the cell output.
%time knn_pca.fit(x_train_de, y_train)

Wall time: 368 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [32]:
# Mean accuracy on the PCA-reduced test split (timed), for comparison with knn_score.
%time score_pca = knn_pca.score(x_test_de, y_test)
# Display the score as the cell output.
score_pca 

Wall time: 16.5 s


0.9728

## 总结：
* 1、使用PCA降维（保留90%的方差）之后，score变化很小（0.9688 → 0.9728），但计算速度提升了很多倍（预测耗时从约2分29秒降到约16.5秒）
* 2、PCA处理之后score可能会略有上升，因为PCA丢弃了方差较小的成分，具有一定的降噪作用