<a href="https://colab.research.google.com/github/ColeBromfield01/bromfield-portfolio/blob/main/DATA604_Final_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Code**

*Loading data*

In [2]:
import tensorflow as tf
(images1, labels1), (images2, labels2) = tf.keras.datasets.fashion_mnist.load_data()

images = tf.concat([images1, images2], axis=0)
labels = tf.concat([labels1, labels2], axis=0)

*Defining kNN function*

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def knn(train, test, train_labels, test_labels, k):
  train_flat = train.reshape(train.shape[0], -1)
  test_flat = test.reshape(test.shape[0], -1)

  knn = KNeighborsClassifier(n_neighbors=k)

  # Fit classifier to training data
  knn.fit(train_flat, train_labels)

  # Use the model to predict labels for test data
  predicted_labels = knn.predict(test_flat)

  # Calculate the accuracy of the predicted labels for the test data
  accuracy = accuracy_score(test_labels, predicted_labels)

  # Generate a confusion matrix
  cm = confusion_matrix(test_labels, predicted_labels)

  return(accuracy, cm)

*Problem 1*

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np

# Creating 80-20 train-test split
test_prop = 0.2
train_images, test_images, train_labels, test_labels = (
    train_test_split(np.array(images), np.array(labels), test_size=test_prop))

# Running kNN on raw data with k = 20
neighbors = 20
accuracy, cm = knn(train_images, test_images, train_labels, test_labels,
                   neighbors)

print(f"Accuracy: {accuracy}")

Accuracy: 0.8484285714285714


In [5]:
from tabulate import tabulate

class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal',
               'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# Dictionary to story accuracy by class
class_accuracies = {}

for i in range(10):
  correct = cm[i, i]
  total = sum(cm[i])

  accuracy = 100 * correct / total

  # Storing class accuracy in dictionary
  class_accuracies[class_names[i]] = round(accuracy, 2)

# Turning the dictionary into a table
table = [[key, value] for key, value in class_accuracies.items()]

# Print the accuracies for each class in a Markdown formatted table
print(tabulate(table, headers=["Class", "Accuracy %"], tablefmt='pipe'))

| Class       |   Accuracy % |
|:------------|-------------:|
| T-shirt/top |        88.58 |
| Trouser     |        95.81 |
| Pullover    |        79.44 |
| Dress       |        87.39 |
| Coat        |        77.28 |
| Sandal      |        79.94 |
| Shirt       |        55.27 |
| Sneaker     |        94.53 |
| Bag         |        94.02 |
| Ankle boot  |        96.06 |


*Problem 2*

Defining PCA matrix

In [6]:
from sklearn.decomposition import PCA

train_images = tf.cast(train_images, tf.float32)

# Shaping data into vectors
train_images = tf.reshape(train_images, (-1, 28 * 28))
test_images = tf.reshape(test_images, (-1, 28 * 28))

# Fitting a PCA model to the training data
pca = PCA()
pca.fit(train_images)
P = pca.components_

Finding lowest dimension at which >84% accuracy is maintained

In [7]:
acc = 0
keep = -5
while acc < 0.84:
  keep += 10
  p_reduced = P[:keep]
  p_reduced_train_images = np.transpose(np.dot(p_reduced, np.transpose(train_images)))
  p_reduced_test_images = np.dot(p_reduced, np.transpose(test_images)).T
  acc, cm_r = knn(p_reduced_train_images, p_reduced_test_images, train_labels, test_labels, 20)

print(f"Dimensions kept: {keep}")
print(f"Accuracy: {acc}")

Dimensions kept: 25
Accuracy: 0.8547142857142858


In [8]:
# Repeating same class-by-class accuracy used with raw data
class_accuracies_pca = {}
for i in range(10):
  correct = cm_r[i, i]
  total = sum(cm_r[i])

  accuracy = 100 * correct / total

  class_accuracies_pca[class_names[i]] = round(accuracy, 2)

# Turning the dictionary into a table
table_pca = [[key, value] for key, value in class_accuracies_pca.items()]

# Print the accuracies for each class in a Markdown formatted table
print(tabulate(table_pca, headers=["Class", "Accuracy %"], tablefmt='pipe'))

| Class       |   Accuracy % |
|:------------|-------------:|
| T-shirt/top |        86.63 |
| Trouser     |        95.95 |
| Pullover    |        77.29 |
| Dress       |        88.8  |
| Coat        |        80.03 |
| Sandal      |        88.42 |
| Shirt       |        55.41 |
| Sneaker     |        92.54 |
| Bag         |        95.24 |
| Ankle boot  |        94.58 |


*Problem 3*

In [9]:
# Reshaping full dataset into vectors
images = tf.reshape(images, (-1, 28 * 28))

In [10]:
# Sampling the data to avoid RAM issues
data_sample = 0.2
step_size = int(1 / data_sample)
sampled_labels = []
sampled_images = []

for i in range(len(images)):
    if i % step_size == 0:
        sampled_images.append(images[i])
        sampled_labels.append(labels[i])

# 80-20 train-test split for sampled data
s_train_images, s_test_images, s_train_labels, s_test_labels = (
    train_test_split(np.array(sampled_images), np.array(sampled_labels),
                     test_size=test_prop))

In [11]:
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import MinMaxScaler

# Estimating gamma parameter for RBF, based on data variance
data_variance = np.var(s_train_images)
estimated_gamma = 1.0 / (2 * data_variance)

# Normalizing data, as accuracy was terrible without doing this
scaler = MinMaxScaler()
normalized_train_images = scaler.fit_transform(s_train_images)
normalized_test_images = scaler.transform(s_test_images)

# Fitting kernel PCA model to sampled, normalized data
kpca = KernelPCA(kernel='rbf', gamma=estimated_gamma)
kpca.fit(normalized_train_images)

# Transforming train and test data by kPCA model
train_kpca = kpca.transform(normalized_train_images)
test_kpca = kpca.transform(normalized_test_images)

In [33]:
acc_kpca = 0
keep = -5
diff = 1

# Quasi-Elbow method to determine when accuracy levels off
while np.abs(diff) > 0.005:
  keep += 10
  kp_reduced_train = train_kpca[:, :keep]
  kp_reduced_test = test_kpca[:, :keep]
  prev_acc = acc_kpca
  acc_kpca, cm_k = knn(kp_reduced_train, kp_reduced_test, s_train_labels,
                  s_test_labels, 20)
  diff = acc_kpca - prev_acc
  print(f"With {keep} dimensions, {round(100*acc_kpca, 2)}% accuracy achieved.")

print(f"\nDimensions kept: {keep - 10}")
print(f"Accuracy: {round(100*prev_acc, 2)}%")

With 5 dimensions, 72.25% accuracy achieved.
With 15 dimensions, 79.82% accuracy achieved.
With 25 dimensions, 81.5% accuracy achieved.
With 35 dimensions, 81.5% accuracy achieved.

Dimensions kept: 25
Accuracy: 81.5%


In [34]:
# Class-by class accuracy for kPCA
class_accuracies_kpca = {}
for i in range(10):
  correct = cm_k[i, i]
  total = sum(cm_k[i])

  accuracy = 100 * correct / total

  class_accuracies_kpca[class_names[i]] = round(accuracy, 2)

# Turning the dictionary into a table
table_kpca = [[key, value] for key, value in class_accuracies_kpca.items()]

# Print the accuracies for each class in a Markdown formatted table
print(tabulate(table_kpca, headers=["Class", "Accuracy %"], tablefmt='pipe'))

| Class       |   Accuracy % |
|:------------|-------------:|
| T-shirt/top |        86.01 |
| Trouser     |        91.55 |
| Pullover    |        69.96 |
| Dress       |        83.75 |
| Coat        |        76.43 |
| Sandal      |        81.57 |
| Shirt       |        52.4  |
| Sneaker     |        89.67 |
| Bag         |        91.79 |
| Ankle boot  |        93.01 |


*Problem 4*

In [14]:
from sklearn.manifold import Isomap

# Fitting the sampled data to an isomap model with neighborhood size 5
neighborhood_size = 5
isomap = Isomap(n_neighbors=neighborhood_size)
isomap.fit(s_train_images)

iso_train = isomap.transform(s_train_images)
iso_test = isomap.transform(s_test_images)

In [26]:
acc_iso = 0
keep = 0
diff = 1
# Showing that the accuracy is non-changing with 2+ dimensions
while keep < 10:
  keep += 1
  iso_reduced_train = iso_train[:, :keep]
  iso_reduced_test = iso_test[:, :keep]
  prev_acc = acc_iso
  acc_iso, cm_iso = knn(iso_reduced_train, iso_reduced_test, s_train_labels,
                  s_test_labels, 20)
  diff = acc_iso - prev_acc
  print(f"With {keep} dimensions, {round(100*acc_iso, 2)}% accuracy achieved.")

With 1 dimensions, 34.39% accuracy achieved.
With 2 dimensions, 60.71% accuracy achieved.
With 3 dimensions, 60.71% accuracy achieved.
With 4 dimensions, 60.71% accuracy achieved.
With 5 dimensions, 60.71% accuracy achieved.
With 6 dimensions, 60.71% accuracy achieved.
With 7 dimensions, 60.71% accuracy achieved.
With 8 dimensions, 60.71% accuracy achieved.
With 9 dimensions, 60.71% accuracy achieved.
With 10 dimensions, 60.71% accuracy achieved.


In [27]:
# Class accuracies with Isomap model
class_accuracies_iso = {}
for i in range(10):
  correct = cm_iso[i, i]
  total = sum(cm_iso[i])

  accuracy = 100 * correct / total

  class_accuracies_iso[class_names[i]] = round(accuracy, 2)

# Turning the dictionary into a table
table_iso = [[key, value] for key, value in class_accuracies_iso.items()]

# Print the accuracies for each class in a Markdown formatted table
print(tabulate(table_iso, headers=["Class", "Accuracy %"], tablefmt='pipe'))

| Class       |   Accuracy % |
|:------------|-------------:|
| T-shirt/top |        78.67 |
| Trouser     |        75.7  |
| Pullover    |        49.82 |
| Dress       |        57.6  |
| Coat        |        42.86 |
| Sandal      |        70.59 |
| Shirt       |        22.95 |
| Sneaker     |        66.79 |
| Bag         |        70.36 |
| Ankle boot  |        73.78 |
