In [1]:
import numpy as np
import scipy.io as io
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the MNIST handwritten-digit dataset from the local MATLAB .mat file.
mnist = io.loadmat(file_name="./mnist/mnist-original.mat")

In [3]:
# List the top-level entries of the loaded .mat dictionary to find the data/label arrays.
mnist.keys()

dict_keys(['__header__', '__version__', '__globals__', 'mldata_descr_ordering', 'data', 'label'])

In [4]:
# Shape of the raw pixel array as stored in the file; the recorded run shows (784, 70000).
mnist['data'].shape

(784, 70000)

In [5]:
# Shape of the raw label array; the recorded run shows a single row, (1, 70000).
mnist['label'].shape

(1, 70000)

In [6]:
# The file stores both arrays with the 70000 samples along the second axis;
# transpose so that the first axis indexes samples.
# Note that y stays two-dimensional: (1, 70000) becomes a (70000, 1) column.
X = mnist['data'].T
y = mnist['label'].T

In [7]:
# Confirm the transpose: the first axis now has one entry per sample (70000 rows, 784 columns).
X.shape

(70000, 784)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

In [9]:
# Training features: 80% of the 70000 samples (56000 rows in the recorded run).
X_train.shape

(56000, 784)

In [10]:
# Training labels; note they are a 2-D column (n, 1) rather than a flat 1-D array.
y_train.shape

(56000, 1)

In [11]:
# Test features: the remaining 20% of the samples (14000 rows in the recorded run).
X_test.shape

(14000, 784)

In [12]:
# Test labels; same 2-D column layout (n, 1) as the training labels.
y_test.shape

(14000, 1)

## 使用KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
%time knn_clf.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


CPU times: user 16 s, sys: 95.7 ms, total: 16.1 s
Wall time: 16 s


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [14]:
# Accuracy on the held-out test set using all 784 raw features. Timed because
# scoring dominated the recorded run (about 17m49s, versus about 16s to fit).
%time knn_clf.score(X_test, y_test)

CPU times: user 17min 49s, sys: 55.1 ms, total: 17min 49s
Wall time: 17min 49s


0.9716428571428571

## 使用PCA降维

In [15]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to reach that fraction of explained variance (here 95%).
pca = PCA(n_components=0.95)

In [16]:
# Fit the projection on the training split only, so no information from the test split is used.
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [17]:
# Project both splits with the PCA that was fitted on the training data.
X_train_reduction, X_test_reduction = map(pca.transform, (X_train, X_test))

In [18]:
# Number of components kept for the 0.95 threshold (154 in the recorded run, down from 784).
X_train_reduction.shape

(56000, 154)

In [19]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
%time knn_clf.fit(X_train_reduction, y_train)

  """Entry point for launching an IPython kernel.


CPU times: user 1.22 s, sys: 4.04 ms, total: 1.23 s
Wall time: 1.23 s


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [20]:
# Same evaluation on the PCA-reduced features. In the recorded run this took
# about 3 minutes and scored 0.9731, versus 0.9716 on the raw features.
%time knn_clf.score(X_test_reduction, y_test)

CPU times: user 2min 59s, sys: 19.9 ms, total: 2min 59s
Wall time: 2min 59s


0.9730714285714286

In [23]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()
%time rf_clf.fit(X_train_reduction, y_train)

  """Entry point for launching an IPython kernel.


CPU times: user 1min 24s, sys: 56 ms, total: 1min 24s
Wall time: 1min 24s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
# Random-forest accuracy on the PCA-reduced test set (0.9491 in the recorded run,
# below the 0.9731 obtained by k-NN on the same features).
%time rf_clf.score(X_test_reduction, y_test)

CPU times: user 473 ms, sys: 8 ms, total: 481 ms
Wall time: 483 ms


0.9490714285714286