# Working with the MNIST dataset
http://yann.lecun.com/exdb/mnist/

Load the dataset

In [8]:
from keras.datasets import mnist
import tensorflow
import numpy as np
from sklearn import preprocessing

(train_X, train_y), (test_X, test_y) = mnist.load_data()
print(f'train_X: {train_X.shape} → type: {type(train_X)}')
print(f'train_y: {train_y.shape} → type: {type(train_y)}')
print(f'test_X: {test_X.shape} → type: {type(test_X)}')
print(f'test_y: {test_y.shape} → type: {type(test_y)}')

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
train_X: (60000, 28, 28) → type: <class 'numpy.ndarray'>
train_y: (60000,) → type: <class 'numpy.ndarray'>
test_X: (10000, 28, 28) → type: <class 'numpy.ndarray'>
test_y: (10000,) → type: <class 'numpy.ndarray'>


Flatten each array to a 28x28 = 784 dimensional vector

In [9]:
train_X = train_X.reshape(train_X.shape[0], -1)
test_X = test_X.reshape(test_X.shape[0], -1)

print(f'train_x: {train_X.shape}')
print(f'test_x: {test_X.shape}')

train_x: (60000, 784)
test_x: (10000, 784)


Rescale images from the range of [0,255] to the range of [0.0,1.0] 

In [15]:
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

train_X_scale = minmax_scale.fit_transform(train_X)

test_x_scale = minmax_scale.fit_transform(test_X)

print(f'train_x_scale: {train_X_scale}')
print(f'test_x_scale: {test_x_scale}')

train_x_scale: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
test_x_scale: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


Plot first 9 entries in the dataset

In [35]:
import matplotlib.pyplot as plt




for i in range(3):
    for j in range(3):
        plt.axes[i, j].imshow(train_X.reshape(28, 28))
plt.show()

# plt.plot(test_x_scale[:10])
# plt.show()

TypeError: 'function' object is not subscriptable

Convert to Pandas dataframes

In [22]:
import pandas as pd

# data
df_train = pd.DataFrame(train_X_scale)
df_test = pd.DataFrame(test_x_scale)

y_train = pd.DataFrame(train_y)
y_test = pd.DataFrame(test_y)

## PCA

In [40]:
from sklearn.decomposition import PCA


pca = PCA(n_components=784)

pca_train = pca.fit_transform(train_X_scale)

pca.explained_variance_ratio_

array([9.70429203e-02, 7.09565194e-02, 6.16885230e-02, 5.38921329e-02,
       4.86861032e-02, 4.31206600e-02, 3.27180453e-02, 2.88378525e-02,
       2.76192360e-02, 2.35691020e-02, 2.10910875e-02, 2.02291363e-02,
       1.71575259e-02, 1.69204675e-02, 1.57858079e-02, 1.48289613e-02,
       1.32451022e-02, 1.27684890e-02, 1.18721751e-02, 1.15263966e-02,
       1.06612548e-02, 1.00667499e-02, 9.53537139e-03, 9.12508969e-03,
       8.83370961e-03, 8.39287108e-03, 8.12547818e-03, 7.86336079e-03,
       7.44704697e-03, 6.90832834e-03, 6.58068792e-03, 6.48123181e-03,
       6.02592027e-03, 5.86559959e-03, 5.69999705e-03, 5.43607094e-03,
       5.05767086e-03, 4.87840166e-03, 4.81411135e-03, 4.72248386e-03,
       4.56729640e-03, 4.44819125e-03, 4.18485442e-03, 3.98200212e-03,
       3.84959984e-03, 3.75089082e-03, 3.61995122e-03, 3.51578010e-03,
       3.40045393e-03, 3.21862442e-03, 3.19005004e-03, 3.12793074e-03,
       2.95971452e-03, 2.88943728e-03, 2.84119693e-03, 2.71425678e-03,
      