## Import and Load

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [4]:
# Load the MNIST digits from OpenML (fetch_openml is imported with the other
# dependencies in the notebook's import cell).
# The dataset version is pinned so a re-run always downloads the same data
# instead of whichever version OpenML currently marks as "active".
mnist = fetch_openml('mnist_784', version=1)
# view the shape of the dataset
mnist.data.shape

(70000, 784)

## Scale Data and PCA

In [5]:
# Pull the feature matrix (X) and the label vector (y) out of the fetched dataset
X, y = mnist.data, mnist.target


## Split Data

In [6]:
# Hold out a test split; the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [7]:
# Preprocessing for the PCA model: standardize the features, then apply PCA
# with n_components=0.95 (a fractional value asks PCA to keep enough
# components to reach that share of explained variance).
transformer = make_pipeline(
    StandardScaler(),
    PCA(n_components=0.95),
)

## KNN with PCA

In [8]:
# K-nearest-neighbours classifier with its default settings
knn1 = KNeighborsClassifier()

# Chain the preprocessing steps with the classifier and train in one go.
# fit() returns the pipeline itself, so knn_pipe is the fitted model and the
# cell still ends by displaying it.
knn_pipe = make_pipeline(transformer, knn1).fit(X_train, y_train)
knn_pipe

In [20]:
# Predict labels for the training split with the fitted PCA + KNN pipeline
# and keep them for later evaluation.
knn_train_preds = knn_pipe.predict(X_train)

In [17]:
%%time
# Predict labels for the test split with the PCA + KNN pipeline.
# The %%time magic reports how long this cell takes, which is the prediction
# speed compared against the no-PCA model further down.
knn_test_preds_pca = knn_pipe.predict(X_test)

CPU times: user 18.1 s, sys: 595 ms, total: 18.7 s
Wall time: 2.67 s


In [10]:
# Report accuracy from the predictions already saved in the cells above.
# knn_pipe.score() would compute the same mean accuracy, but only after
# re-running the expensive nearest-neighbour prediction on both splits.
print('Training accuracy:', accuracy_score(y_train, knn_train_preds))
print('Testing accuracy:', accuracy_score(y_test, knn_test_preds_pca))

Training accuracy: 0.9650285714285715
Testing accuracy: 0.9477714285714286


## KNN no PCA

In [11]:
# Preprocessing for the comparison model: standardization only, no PCA step.
transformer2 = make_pipeline(StandardScaler())

In [12]:
# Second default-configured KNN classifier, used for the model without PCA
knn2 = KNeighborsClassifier()

# Chain the scaling-only preprocessing with the classifier and train it.
# fit() returns the pipeline itself, so knn_pipe2 is the fitted model and the
# cell still ends by displaying it.
knn_pipe2 = make_pipeline(transformer2, knn2).fit(X_train, y_train)
knn_pipe2

In [18]:
# Predict labels for the training split with the fitted no-PCA pipeline
# and keep them for later evaluation.
knn_train_preds2 = knn_pipe2.predict(X_train)

In [19]:
%%time
# Predict labels for the test split with the no-PCA pipeline.
# The %%time magic reports how long this cell takes, for comparison with the
# timing of the PCA model's test predictions.
knn_test_preds2_no_pca = knn_pipe2.predict(X_test)

CPU times: user 39.4 s, sys: 571 ms, total: 39.9 s
Wall time: 5.62 s


In [15]:
# Report accuracy from the predictions already saved in the cells above.
# knn_pipe2.score() would compute the same mean accuracy, but only after
# re-running the expensive nearest-neighbour prediction on both splits.
print('Training accuracy:', accuracy_score(y_train, knn_train_preds2))
print('Testing accuracy:', accuracy_score(y_test, knn_test_preds2_no_pca))

Training accuracy: 0.9625904761904762
Testing accuracy: 0.9442285714285714


Questions

1. Which model performed best on the test set? The two were nearly equal, but the model with PCA performed slightly better on the test set (0.9478 accuracy vs. 0.9442 without PCA), a difference that shows up in the third decimal place.

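2. Which model was fastest at making predictions? The model with PCA: predicting the test set took 18.7 s of total CPU time (2.67 s wall time), compared to 39.9 s of total CPU time (5.62 s wall time) for the model without PCA.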
