## Import data set and split data

In [2]:
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata("MNIST original")

In [3]:
mnist

{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.])}

In [4]:
X, y = mnist.data, mnist.target

In [5]:
X_train, y_train, X_test, y_test = X[:60000], y[:60000], X[60000:], y[60000:]

## Train ML model

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
rf_clf = RandomForestClassifier()

In [9]:
import time

rf_time = 0
for i in range(10):
    t1 = time.clock()
    rf_clf.fit(X_train, y_train)
    t2 = time.clock()
    rf_time += (t2-t1)/10
rf_time

4.371429299999999

## Now let's reduce the dimension and compare time

In [10]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)

In [11]:
pca_time = 0
rf_time = 0
pcaNrf_time = 0
for i in range(10):
    t1 = time.clock()
    X_train_pca = pca.fit_transform(X_train)
    t2 = time.clock()
    rf_clf.fit(X_train_pca, y_train)
    t3 = time.clock()
    rf_time += (t3-t2)/10
    pca_time += (t2-t1)/10
    pcaNrf_time += (t3-t1)/10

for i in ("rf_time", "pca_time", "pcaNrf_time"):
    print("{}: {}".format(i, eval(i)))

rf_time: 9.6189991
pca_time: 18.092441000000004
pcaNrf_time: 27.711440100000008


## Check the performance difference between those two models

In [16]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(rf_clf, X_train, y_train, scoring="accuracy", verbose=1))
print(cross_val_score(rf_clf, X_train_pca, y_train, scoring="accuracy", verbose=1))

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    8.6s finished


[ 0.93986203  0.93779689  0.94194129]
[ 0.88037393  0.87674384  0.88458269]


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.0s finished


In [19]:
from sklearn.metrics import accuracy_score

rf_clf.fit(X_train, y_train)
accuracy_score(y_test, rf_clf.predict(X_test))

0.94979999999999998

In [24]:
pca.fit(X_train)
rf_clf.fit(X_train_pca, y_train)
accuracy_score(y_test, rf_clf.predict(pca.transform(X_test)))

0.89359999999999995

Well fuck PCA, slower and less accurate, blyat!

## Let's now use logistic regression to try it.

In [28]:
from sklearn.linear_model import LogisticRegression

In [30]:
lr_clf = LogisticRegression(multi_class="multinomial", solver="lbfgs", random_state=42)

In [35]:
t0 = time.clock()
lr_clf.fit(X_train, y_train)
print("time of lr_clf is: {}".format(time.clock()-t0))
accuracy_score(y_test, lr_clf.predict(X_test))

time of lr_clf is: 65.89867200000003


0.92520000000000002

In [36]:
t0 = time.clock()
lr_clf.fit(X_train_pca, y_train)
print("time of lr_clf is: {}".format(time.clock()-t0))
accuracy_score(y_test, lr_clf.predict(pca.transform(X_test)))

time of lr_clf is: 13.96863099999996


0.91979999999999995

#### So over all the pca metho some times gives much better time and little drop in performance. While in some other cases, it can both drasticlly slowdown the training as well as drag down the accuracy