# ENGR 891: Programming Assignment #1
# Part B: 
## Pre-Processing
### import packages

In [11]:
import warnings
import time
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from tensorflow import keras
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

### Load and Reshape Data

In [12]:
## Directly load this dataset using the Keras API:
(
    (X_train_cifar, y_train_cifar),
    (X_test_cifar, y_test_cifar),
) = keras.datasets.cifar10.load_data()

# Flatten every image into one row of 3072 pixel values so the samples form
# a 2D (n_samples, n_features) matrix, which is what scikit-learn expects.
X_train_cifar = X_train_cifar.reshape((len(X_train_cifar), 3072))
X_test_cifar = X_test_cifar.reshape((len(X_test_cifar), 3072))

# Report the shapes of the flattened samples and of the label arrays
print("\nShape of Training Samples: ", X_train_cifar.shape)
print("Shape of Training Labels: ", y_train_cifar.shape)

print("\nShape of Testing Samples: ", X_test_cifar.shape)
print("Shape of Testing Labels: ", y_test_cifar.shape)

# Report the element type of each array (train X, train y, test X, test y)
for _arr in (X_train_cifar, y_train_cifar, X_test_cifar, y_test_cifar):
    print(_arr.dtype)


Shape of Training Samples:  (50000, 3072)
Shape of Training Labels:  (50000, 1)

Shape of Testing Samples:  (10000, 3072)
Shape of Testing Labels:  (10000, 1)
uint8
uint8
uint8
uint8


### convert the label data into a 1D array

In [13]:
# Flatten the (n, 1) label columns into 1D vectors of length n, the layout
# that scikit-learn estimators and metrics expect for class labels.
y_train_cifar, y_test_cifar = y_train_cifar.ravel(), y_test_cifar.ravel()

# Confirm the new shapes (train first, then test)
for _labels in (y_train_cifar, y_test_cifar):
    print(_labels.shape)

(50000,)
(10000,)


### Normalize the data to the [0, 1] range (min-max scaling: divide by 255)

In [14]:
# Scale the pixel intensities from [0, 255] down to [0, 1] by dividing by 255.
# The division is guarded on the dtype: the raw arrays are integer-typed, while
# the scaled ones are floating point. Without the guard, re-running this cell
# would silently divide the already-scaled data by 255 a second time.
if np.issubdtype(X_train_cifar.dtype, np.integer):
    X_train_cifar = X_train_cifar / 255.0
if np.issubdtype(X_test_cifar.dtype, np.integer):
    X_test_cifar = X_test_cifar / 255.0
# Display the minimum and maximum values of the scaled data
X_train_cifar.min(), X_train_cifar.max()

(0.0, 1.0)

### Train KNN model and get train accuracy

In [15]:
%%time

# 5-nearest-neighbour classifier; p=1 selects the Manhattan (L1) distance for
# the default Minkowski metric, and n_jobs=-1 uses all available processors.
knn = KNeighborsClassifier(n_neighbors=5, p=1, n_jobs=-1)
knn.fit(X_train_cifar, y_train_cifar)

# Predict the labels of the training samples themselves and report the
# fraction that matches the true labels (training accuracy).
y_train_predicted = knn.predict(X_train_cifar)
train_accuracy_knn = (y_train_predicted == y_train_cifar).mean()

print("\nTraining Accuracy: ", train_accuracy_knn)


Training Accuracy:  0.53512
Wall time: 44min 23s


### training data confusion matrix

In [16]:
%%time
# Confusion matrix for the training data: rows are the true classes, columns
# are the predicted classes. Left as the last expression so it is displayed.
confusion_matrix(y_true=y_train_cifar, y_pred=y_train_predicted)

Wall time: 31.5 ms


array([[3682,   23,  238,   43,  190,   22,   61,   35,  680,   26],
       [ 599, 2386,  284,  142,  452,   91,  205,   41,  670,  130],
       [ 490,   19, 3332,  107,  652,  101,  122,   27,  137,   13],
       [ 394,   53,  839, 2311,  549,  321,  302,   44,  169,   18],
       [ 385,   16,  842,  146, 3227,   62,  106,   51,  154,   11],
       [ 365,   34,  862,  542,  656, 2069,  261,   60,  138,   13],
       [ 179,   26,  965,  251, 1044,  177, 2248,   13,   86,   11],
       [ 399,   38,  679,  200, 1089,  159,  232, 2022,  143,   39],
       [ 605,   71,  156,   77,  174,   68,   39,   15, 3763,   32],
       [ 653,  331,  387,  204,  420,   95,  183,  130,  881, 1716]],
      dtype=int64)

### test accuracy, and confusion matrix

In [17]:
%%time

# Predict the test labels ONCE and reuse them for every metric below.
# knn.score() runs its own predict() internally, so calling score() and then
# predict() repeats the same expensive neighbour search over the whole test set.
y_test_predicted = knn.predict(X_test_cifar)

# The accuracy of the model (same value knn.score() would return)
test_accuracy_knn = accuracy_score(y_test_cifar, y_test_predicted)
print("\nTest Accuracy: ", test_accuracy_knn)


# No. of Correct Predictions
print("\nNo. of correct predictions (Test): %d/%d" % (np.sum(y_test_predicted == y_test_cifar), len(y_test_cifar)))


# Confusion Matrix (rows: true classes, columns: predicted classes)
print("\nConfusion Matrix (Test Data):\n", confusion_matrix(y_test_cifar, y_test_predicted))


Test Accuracy:  0.377

No. of correct predictions (Test): 3770/10000

Confusion Matrix (Test Data):
 [[582   9 101  10  49   7  25   7 195  15]
 [139 288  89  50 130  40  44  17 168  35]
 [145   5 456  54 206  30  55  13  34   2]
 [ 82  11 215 246 162 109 101  14  52   8]
 [ 92   4 259  40 489  18  43  14  40   1]
 [ 72   4 214 151 166 266  64  14  43   6]
 [ 36   4 259  74 285  27 288   1  25   1]
 [116  10 155  50 259  58  38 267  37  10]
 [154  20  47  33  43  17  10   6 662   8]
 [166  90  71  40  91  30  46  27 213 226]]
Wall time: 18min 16s


### Classification Report

In [20]:
# Per-class precision, recall and F1 score on the test set, followed by
# the overall accuracy and the macro / weighted averages.
test_report = classification_report(y_test_cifar, y_test_predicted)
print(test_report)

# Trailing blank lines to separate the report from what follows
print("\n")

              precision    recall  f1-score   support

           0       0.37      0.58      0.45      1000
           1       0.65      0.29      0.40      1000
           2       0.24      0.46      0.32      1000
           3       0.33      0.25      0.28      1000
           4       0.26      0.49      0.34      1000
           5       0.44      0.27      0.33      1000
           6       0.40      0.29      0.34      1000
           7       0.70      0.27      0.39      1000
           8       0.45      0.66      0.54      1000
           9       0.72      0.23      0.34      1000

    accuracy                           0.38     10000
   macro avg       0.46      0.38      0.37     10000
weighted avg       0.46      0.38      0.37     10000





## Explain why your K-NN model was unable to obtain high test accuracy on the CIFAR-10 image classification problem.

For CIFAR-10 data, the inter-class distance (between images of different classes) is not significantly different from the intra-class distance (between images of the same class). Unlike MNIST, the backgrounds of CIFAR-10 images are not normalized: the background pixels vary considerably from image to image, even across images of the same object. In addition, the shape of the objects varies widely within a class, which further lowers the test accuracy. Thus, a similarity-based (i.e., analogy-based) approach applied pixel-wise cannot reliably differentiate images belonging to different classes in the CIFAR-10 dataset.

## Why does a K-NN model perform so well on the MNIST handwritten digits image classification problem?

The reason is that the background pixels in MNIST images follow a similar pattern in all images belonging to the same class. In the MNIST dataset, there is a global pattern in the pixel distribution of each digit that is shared across all images of its category. MNIST images are also normalized to the same size and centered, so there is less variation in the pixel distribution within a class.