In [1]:
import numpy as np
from mnist import MNIST

In [2]:
mnist_loader = MNIST("../data/mnist/")
mnist_loader.gz = True

In [3]:
X, y = mnist_loader.load_training()
_X, _y = mnist_loader.load_testing()

In [4]:
X = np.array(X, dtype='float32')
y = np.array(y, dtype='int16')
_X = np.array(_X, dtype='float32')
_y = np.array(_y, dtype='int16')

### Standardize

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
scalar = StandardScaler()

In [7]:
X_scaled = scalar.fit_transform(X)
_X_scaled = scalar.transform(_X)

### Transform

In [8]:
sigma = np.cov(X_scaled.T)

In [9]:
sigma.shape

(784, 784)

In [10]:
eig_vals, eig_vectors = np.linalg.eig(sigma)

In [11]:
eig_vals_sorted = sorted(eig_vals, reverse=True)
top_5 = eig_vals_sorted[:5]
top_5_args = np.argsort(eig_vals)[::-1][:5]
top_20 = eig_vals_sorted[:20]
top_20_args = np.argsort(eig_vals)[::-1][:20]

In [12]:
X_transformed_5 = np.dot(X_scaled, eig_vectors[:, top_5_args])
_X_transformed_5 = np.dot(_X_scaled, eig_vectors[:, top_5_args])
X_transformed_20 = np.dot(X_scaled, eig_vectors[:, top_20_args])
_X_transformed_20 = np.dot(_X_scaled, eig_vectors[:, top_20_args])

In [13]:
X_scaled.shape

(60000, 784)

In [14]:
X_transformed_20.shape

(60000, 20)

### Predict

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

#### LR

In [16]:
mnist_lr = LogisticRegression(penalty='l2')

In [17]:
%%time
mnist_lr.fit(X_transformed_20, y)

CPU times: user 12 s, sys: 12.3 ms, total: 12 s
Wall time: 12.1 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
yhat = mnist_lr.predict(X_transformed_20)
print('20 - Train acc:', sum(yhat == y)/len(y))
_yhat = mnist_lr.predict(_X_transformed_20)
print('20 - Test acc:', sum(_yhat == _y)/len(_y))

20 - Train acc: 0.8496666666666667
20 - Test acc: 0.8582


In [19]:
%%time
mnist_lr.fit(X_transformed_5, y)

CPU times: user 2.9 s, sys: 1.86 ms, total: 2.9 s
Wall time: 2.93 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
yhat = mnist_lr.predict(X_transformed_5)
print('5 - Train acc:', sum(yhat == y)/len(y))
_yhat = mnist_lr.predict(_X_transformed_5)
print('5 - Test acc:', sum(_yhat == _y)/len(_y))

5 - Train acc: 0.6415666666666666
5 - Test acc: 0.6459


#### DT

In [21]:
mnist_tree = DecisionTreeClassifier(random_state=666)

In [22]:
%%time
# 20 features
mnist_tree.fit(X_transformed_20, y)

CPU times: user 3.24 s, sys: 1.02 ms, total: 3.24 s
Wall time: 3.25 s


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=666,
            splitter='best')

In [23]:
yhat = mnist_tree.predict(X_transformed_20)
print('20 - Train acc:', sum(yhat == y)/len(y))
_yhat = mnist_tree.predict(_X_transformed_20)
print('20 - Test acc:', sum(_yhat == _y)/len(_y))

20 - Train acc: 1.0
20 - Test acc: 0.8476


In [24]:
%%time
# 5 features
mnist_tree.fit(X_transformed_5, y)

CPU times: user 677 ms, sys: 975 µs, total: 678 ms
Wall time: 683 ms


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=666,
            splitter='best')

In [25]:
yhat = mnist_tree.predict(X_transformed_5)
print('5 - Train acc:', sum(yhat == y)/len(y))
_yhat = mnist_tree.predict(_X_transformed_5)
print('5 - Test acc:', sum(_yhat == _y)/len(_y))

5 - Train acc: 1.0
5 - Test acc: 0.669
