# Assignment 10: Dimensionality Reduction

Dataset(s) needed: MNIST ("Modified National Institute of Standards and Technology") dataset.

In [1]:
# Load the MNIST dataset.
# `fetch_mldata` was removed from scikit-learn (and the mldata.org service it
# relied on is offline), so the data is fetched from OpenML instead.
# `as_frame=False` keeps `mnist.data` / `mnist.target` as NumPy arrays.
# Note: the OpenML copy stores the labels as strings, so they must be cast to
# integers before use.
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

#### Q.1. Split the data into a training set and a test set (take the first 60,000 instances for training, and the remaining 10,000 for testing).

In [2]:
import pandas as pd

# Wrap the raw feature matrix and labels in pandas containers; labels are
# cast to integers.
X = pd.DataFrame(mnist.data)
y = pd.Series(mnist.target).astype(int)

# Rows before position `num` form the training set; the rest form the test set.
num = 60000

X_train, X_test = X.iloc[:num], X.iloc[num:]
y_train, y_test = y.iloc[:num], y.iloc[num:]

# Show the shape of each split as a quick sanity check.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60000, 784) (60000,) (10000, 784) (10000,)


#### Q.2. Train a Logistic Regression classifier on the dataset and see how long it takes.

In [3]:
from sklearn.linear_model import LogisticRegression
import time

# Multinomial logistic regression fitted with the L-BFGS solver on all features.
log_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial")

# Time only the call to fit(). perf_counter() is a monotonic, high-resolution
# clock intended for measuring intervals, so the result cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Train the classifier
lr_model = log_clf.fit(X_train, y_train)

end_time = time.perf_counter()

# Elapsed training time in seconds (reused later for the speed comparison).
duration_full = end_time - start_time

print("Training took {:.2f}s".format(duration_full))

Training took 17.48s


#### Q.3. Evaluate the resulting model on the test set.

In [4]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 

# Predict labels for the held-out test set with the full-feature model.
y_test_pred_lr = lr_model.predict(X_test)

# Compute the accuracy once; it is used both in the printout below and in the
# later full-vs-PCA comparison.
accuracy_full = accuracy_score(y_test, y_test_pred_lr)

print("Performance on the test set:")
print("\n Confusion matrix:\n", confusion_matrix(y_test, y_test_pred_lr))
print("\n Classification report:\n", classification_report(y_test, y_test_pred_lr))
print("\n Accuracy score = %.3f" % accuracy_full)

Performance on the test set:

 Confusion matrix:
 [[ 963    0    0    3    1    3    4    4    2    0]
 [   0 1112    4    2    0    1    3    2   11    0]
 [   3   10  926   15    6    4   15    8   42    3]
 [   4    1   21  916    1   26    3    9   22    7]
 [   1    1    7    3  910    0    9    7   10   34]
 [  11    2    1   33   11  776   11    6   35    6]
 [   9    3    7    3    7   16  910    2    1    0]
 [   1    6   24    5    7    1    0  951    3   30]
 [   8    7    6   23    6   26   10   10  869    9]
 [   9    7    0   11   25    6    0   22    7  922]]

 Classification report:
              precision    recall  f1-score   support

          0       0.95      0.98      0.97       980
          1       0.97      0.98      0.97      1135
          2       0.93      0.90      0.91      1032
          3       0.90      0.91      0.91      1010
          4       0.93      0.93      0.93       982
          5       0.90      0.87      0.89       892
          6       0.9

#### Q.4. Use PCA to reduce the dataset's dimensionality, with an explained variance ratio of 95%.

In [5]:
from sklearn.decomposition import PCA

# A fractional n_components (with the full SVD solver) asks PCA to keep as many
# components as are needed to reach that share of explained variance (95% here).
pca = PCA(svd_solver="full", n_components=0.95)

# Fit the projection on the training data, then apply that same fitted
# projection to the test data.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [6]:
# Report how many components PCA kept and the cumulative explained variance
# ratio they account for (last entry of the running sum).
print(
    "Number of components chosen = %d, Total explained variance ratio = %.3f"
    % (pca.n_components_, pca.explained_variance_ratio_.cumsum()[-1])
)

Number of components chosen = 154, Total explained variance ratio = 0.950


#### Q.5. Train a new Logistic Regression classifier on the reduced dataset and see how long it takes. Was training much faster? Explain your results.

In [7]:
# Same classifier configuration as the full-feature model, trained on the
# PCA-reduced features instead.
log_pca = LogisticRegression(solver="lbfgs", multi_class="multinomial")

# Time only the call to fit(). perf_counter() is a monotonic, high-resolution
# clock intended for measuring intervals, so the result cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Train the classifier
lr_pca_model = log_pca.fit(X_train_pca, y_train)

end_time = time.perf_counter()

# Elapsed training time in seconds (reused later for the speed comparison).
duration_pca = end_time - start_time

print("Training took {:.2f}s".format(duration_pca))

Training took 7.30s


#### Q.6. Evaluate the new classifier on the test set: how does it compare to the previous classifier? Discuss the speed / accuracy trade-off and in which cases you'd accept a very slight drop in model performance in exchange for an x-fold speedup in training.

In [8]:
# Predict labels for the PCA-projected test set with the reduced-feature model.
y_test_pca_pred_lr = lr_pca_model.predict(X_test_pca)

# Compute the accuracy once; it is used both in the printout below and in the
# later full-vs-PCA comparison.
accuracy_pca = accuracy_score(y_test, y_test_pca_pred_lr)

print("Performance on the test set:")
print("\n Confusion matrix:\n", confusion_matrix(y_test, y_test_pca_pred_lr))
print("\n Classification report:\n", classification_report(y_test, y_test_pca_pred_lr))
print("\n Accuracy score = %.3f" % accuracy_pca)

Performance on the test set:

 Confusion matrix:
 [[ 959    0    1    1    2    2    7    6    1    1]
 [   0 1108    2    2    1    2    4    2   14    0]
 [  10   11  933   11    8    4   12    8   27    8]
 [   4    1   24  910    3   25    2   12   18   11]
 [   2    3    4    2  916    0    9    2   10   34]
 [   9    4    4   32   11  765   15   10   34    8]
 [   9    6    8    0    9   12  909    3    2    0]
 [   3    9   19    5    8    1    0  951    0   32]
 [   8   14   10   26   11   24   14   15  843    9]
 [  12    6    1   11   36    5    1   24    6  907]]

 Classification report:
              precision    recall  f1-score   support

          0       0.94      0.98      0.96       980
          1       0.95      0.98      0.96      1135
          2       0.93      0.90      0.92      1032
          3       0.91      0.90      0.91      1010
          4       0.91      0.93      0.92       982
          5       0.91      0.86      0.88       892
          6       0.9

In [9]:
# Side-by-side comparison; each line reads "full value / PCA value = ratio".
print(
    "Training time: full logistic regression / PCA logistic regression = %.2fs / %.2fs = %.1f"
    % (duration_full, duration_pca, duration_full / duration_pca)
)
print(
    "Accuracy: full logistic regression / PCA logistic regression = %.4f / %.4f = %.3f"
    % (accuracy_full, accuracy_pca, accuracy_full / accuracy_pca)
)

Training time: full logistic regression / PCA logistic regression = 17.48s / 7.30s = 2.4
Accuracy: full logistic regression / PCA logistic regression = 0.9255 / 0.9201 = 1.006


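Using the full logistic regression model with all 784 features gives an accuracy of 0.9255, compared with 0.9201 for the PCA logistic regression model with 154 features. That is an increase of less than 1% (about half a percentage point), but the full model takes 2.4 times longer to train. Training is faster on the reduced dataset because the classifier has roughly five times fewer features to fit. Note that the timings cover only the logistic regression fit; the time spent fitting PCA itself is not included. That increase in accuracy is negligible compared to the increase in training time. The training time for the full model was not prohibitive in this case, but as datasets grow and models become more complex, a speedup of this size could save hours or days, which is when a very slight drop in performance is an acceptable price.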