In [1]:
import numpy as np
from mnist import MNIST

In [2]:
# Create the MNIST loader pointed at the data directory (relative path, so it
# resolves against the notebook's working directory).
mnist_loader = MNIST("../data/mnist/")
# NOTE(review): presumably makes the loader read the gzip-compressed files —
# confirm against the `mnist` package documentation.
mnist_loader.gz = True

In [3]:
# Load both splits; each call is unpacked into (features, labels).
# Names with a leading underscore hold the test split throughout the notebook.
X, y = mnist_loader.load_training()
_X, _y = mnist_loader.load_testing()

In [4]:
# Convert the loader's output to NumPy arrays: float32 feature matrices and
# int16 label vectors, training split first, then the test split.
X, y = np.array(X, dtype=np.float32), np.array(y, dtype=np.int16)
_X, _y = np.array(_X, dtype=np.float32), np.array(_y, dtype=np.int16)

### Standardize

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
# Scaler instance (note the variable is spelled `scalar`); it is fitted on the
# training features in the next cell.
scalar = StandardScaler()

In [6]:
# Learn the per-feature statistics from the training split only, then apply the
# same fitted transform to both splits so the test set never influences scaling.
scalar.fit(X)
X_scaled, _X_scaled = scalar.transform(X), scalar.transform(_X)

### Transform (PCA via eigendecomposition of the covariance matrix)

In [7]:
# Feature-by-feature covariance matrix. np.cov treats rows as variables by
# default, hence the transpose of the (samples, features) matrix.
sigma = np.cov(X_scaled.T)

In [8]:
# Sanity check: expect a square matrix with one row/column per feature.
sigma.shape

(784, 784)

In [9]:
# A covariance matrix is symmetric, so use the symmetric (Hermitian) solver.
# eigh guarantees real eigenvalues and orthonormal eigenvectors; the general
# eig routine gives no such guarantee and may return complex-typed results.
# eigh returns eigenvalues in ascending order (eig returns them unordered);
# the ranking cell that follows sorts them explicitly, so it works either way.
# Eigenvector signs may differ from eig's, which does not change the subspace.
eig_vals, eig_vectors = np.linalg.eigh(sigma)

In [10]:
# Order the eigenvalues from largest to smallest.
eig_vals_sorted = sorted(eig_vals, reverse=True)
# Take the 20 leading values and their column indices first; the leading 5 are
# simply a prefix of those.
top_20 = eig_vals_sorted[:20]
top_20_args = np.argsort(eig_vals)[::-1][:20]
top_5 = top_20[:5]
top_5_args = top_20_args[:5]

In [24]:
# Project the standardized train/test features onto the selected eigenvectors
# (columns of eig_vectors picked by the top-k index arrays).
X_transformed_5 = X_scaled @ eig_vectors[:, top_5_args]
_X_transformed_5 = _X_scaled @ eig_vectors[:, top_5_args]
X_transformed_20 = X_scaled @ eig_vectors[:, top_20_args]
_X_transformed_20 = _X_scaled @ eig_vectors[:, top_20_args]

In [25]:
# Shape before projection, for comparison with the reduced matrix.
X_scaled.shape

(60000, 784)

In [26]:
# Shape after projection onto the 20 selected eigenvectors.
X_transformed_20.shape

(60000, 20)

### Predict

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

#### Logistic Regression (LR)

In [28]:
# L2-penalized logistic regression. This single estimator object is fitted on
# the 20-component features and later refitted on the 5-component features.
mnist_lr = LogisticRegression(penalty='l2')

In [29]:
%%time
# Fit on the 20-component projection of the training set (timed by %%time,
# which must remain the first line of the cell).
mnist_lr.fit(X_transformed_20, y)

CPU times: user 12.5 s, sys: 9.85 ms, total: 12.5 s
Wall time: 12.7 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
# Accuracy is the fraction of predictions equal to the true label. np.mean over
# the boolean match array is vectorized; the built-in sum() would iterate the
# NumPy array element by element in Python.
yhat = mnist_lr.predict(X_transformed_20)
print('20 - Train acc:', np.mean(yhat == y))
_yhat = mnist_lr.predict(_X_transformed_20)
print('20 - Test acc:', np.mean(_yhat == _y))

20 - Train acc: 0.84965
20 - Test acc: 0.8582


In [32]:
%%time
# Refit the same estimator on the 5-component projection; after this cell
# mnist_lr holds the 5-component fit, not the 20-component one.
mnist_lr.fit(X_transformed_5, y)

CPU times: user 2.78 s, sys: 72 µs, total: 2.78 s
Wall time: 2.8 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [33]:
# Evaluate the 5-component model. The labels say "5" to match the features
# actually used (they previously read "20", copied from the cell above).
# np.mean over the boolean match array gives the accuracy without a Python loop.
yhat = mnist_lr.predict(X_transformed_5)
print('5 - Train acc:', np.mean(yhat == y))
_yhat = mnist_lr.predict(_X_transformed_5)
print('5 - Test acc:', np.mean(_yhat == _y))

20 - Train acc: 0.6415666666666666
20 - Test acc: 0.6459


#### Decision Tree (DT)

In [34]:
# Decision tree with a fixed random_state seed; all other settings are left at
# the library defaults. The same object is fitted on 20 and then 5 components.
mnist_tree = DecisionTreeClassifier(random_state=666)

In [35]:
%%time
# 20 features: fit the tree on the 20-component projection of the training set.
mnist_tree.fit(X_transformed_20, y)

CPU times: user 3.22 s, sys: 0 ns, total: 3.22 s
Wall time: 3.23 s


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=666,
            splitter='best')

In [36]:
# Accuracy of the 20-component tree on both splits. np.mean over the boolean
# match array replaces the built-in sum(), which would loop over the NumPy
# array in Python. A large train/test gap here indicates overfitting.
yhat = mnist_tree.predict(X_transformed_20)
print('20 - Train acc:', np.mean(yhat == y))
_yhat = mnist_tree.predict(_X_transformed_20)
print('20 - Test acc:', np.mean(_yhat == _y))

20 - Train acc: 1.0
20 - Test acc: 0.8477


In [37]:
%%time
# 5 features: refit the same tree object on the 5-component projection; after
# this cell mnist_tree holds the 5-component fit.
mnist_tree.fit(X_transformed_5, y)

CPU times: user 728 ms, sys: 1.98 ms, total: 730 ms
Wall time: 736 ms


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=666,
            splitter='best')