In [None]:
import struct

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def _read_idx_images(path):
    """Read an IDX3 unsigned-byte image file into a (pixels, images) array.

    The file starts with four big-endian uint32 values (magic number, image
    count, rows, cols) followed by one byte per pixel. Every image is
    flattened and stored as one COLUMN of the returned array.

    Raises ValueError if the magic number is not the IDX3 ubyte marker, so a
    wrong or corrupted file fails loudly instead of being reshaped into noise.
    """
    with open(path, 'rb') as f:
        magic, size, nrows, ncols = struct.unpack(">IIII", f.read(16))
        if magic != 2051:  # 0x00000803: unsigned byte data, 3 dimensions
            raise ValueError(f"{path}: unexpected IDX image magic number {magic}")
        # Single bytes have no byte order, so plain uint8 is sufficient here.
        pixels = np.fromfile(f, dtype=np.uint8)
    return np.transpose(pixels.reshape((size, nrows * ncols)))


def _read_idx_labels(path):
    """Read an IDX1 unsigned-byte label file into a 1-D array of length `size`.

    The file starts with two big-endian uint32 values (magic number, label
    count) followed by one byte per label.

    Raises ValueError if the magic number is not the IDX1 ubyte marker.
    """
    with open(path, 'rb') as f:
        magic, size = struct.unpack(">II", f.read(8))
        if magic != 2049:  # 0x00000801: unsigned byte data, 1 dimension
            raise ValueError(f"{path}: unexpected IDX label magic number {magic}")
        labels = np.fromfile(f, dtype=np.uint8)
    return labels.reshape((size,))


Xtraindata = _read_idx_images('data/train-images.idx3-ubyte')
ytrainlabels = _read_idx_labels('data/train-labels.idx1-ubyte')
Xtestdata = _read_idx_images('data/t10k-images.idx3-ubyte')
ytestlabels = _read_idx_labels('data/t10k-labels.idx1-ubyte')

print(Xtraindata.shape)
print(ytrainlabels.shape)
print(Xtestdata.shape)
print(ytestlabels.shape)

In [None]:
def plot_digits(XX, N, title, text, shape=(28, 28)):
    """Draw the first N*N rows of XX as an N-by-N grid of grayscale images.

    Parameters
    ----------
    XX : array
        2-D array with one flattened image per row; must hold at least
        N*N rows, each with shape[0]*shape[1] entries.
    N : int
        Number of rows and columns in the grid.
    title : str
        Figure-level title.
    text : str
        Prefix of every panel title; it is followed by a space and the
        1-based panel number (row-major order).
    shape : tuple of int, optional
        (height, width) each row is reshaped to before display.
        Defaults to (28, 28).

    Returns
    -------
    None
        The figure is left as the current matplotlib figure.
    """
    # squeeze=False keeps `axes` two-dimensional even for N == 1; with the
    # default, a 1x1 grid comes back as a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(N, N, figsize=(8, 8), squeeze=False)

    # `.flat` walks the grid row by row, so panel `idx` shows row `idx` of XX.
    for idx, panel in enumerate(axes.flat):
        panel.imshow(XX[idx, :].reshape(shape), cmap="Greys")
        panel.axis("off")
        panel.set_title(f'{text} {idx + 1}')

    fig.suptitle(title, fontsize=24)
# Images are stored as columns, so transpose to hand plot_digits one image per row.
plot_digits(XX=Xtraindata.T, N=8, title="First 64 Training Images", text="")

In [None]:
# From here on rows are samples: transpose the column-per-image arrays.
X_train, X_test = Xtraindata.T, Xtestdata.T

# PCA with every component kept; fit() returns the fitted estimator itself.
pca = PCA().fit(X_train)

# Training images expressed in the principal-component basis.
train_img = pca.transform(X_train)
train_img.shape

In [None]:
# One principal direction per row, each with as many entries as an image has pixels.
principal_components = pca.components_
plot_digits(XX=principal_components, N=4, title="First 16 Modes Images", text="PC")

In [None]:
# Share of the total variance carried by each component, and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance_ratio.cumsum()

# argmax on a boolean array returns the first True position (0-based);
# adding one turns that index into a number of components.
reaches_85 = cumulative_explained_variance >= 0.85
k = reaches_85.argmax() + 1

print(f"Number of PC modes needed to approximate 85% of the energy: {k}")

In [None]:
# 0-based index of the first component at which the cumulative explained
# variance reaches 85%. Kept 0-based because the next cell uses `k_85 + 1`.
k_85 = np.argmax(cumulative_explained_variance >= 0.85)
n_components_85 = k_85 + 1

# Plot against 1..n so that the x position really is a "number of components";
# plotting against the raw index would shift the curve and marker left by one.
component_counts = np.arange(1, cumulative_explained_variance.size + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='-', color='b')

# Dashed guides: the 85% level and the component count that first reaches it.
ax.axhline(y=0.85, color='r', linestyle='--')
ax.axvline(x=n_components_85, color='r', linestyle='--')
ax.text(n_components_85, 0.5, f'  {n_components_85} Components', color='black')

ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Cumulative Explained Variance by Number of Principal Components')
ax.grid(True)

plt.show()


In [None]:
# Second PCA that keeps only the leading modes: k_85 is a 0-based index,
# so k_85 + 1 is the number of components to retain.
pca85 = PCA(n_components = k_85 + 1)
# Fitted on the full training set; later cells reuse it to project digit subsets.
pca85.fit(X_train)

In [None]:
def project_data(X, pca):
    """Project samples onto the components of an already-fitted PCA.

    Parameters
    ----------
    X : array
        Samples to project, in whatever layout `pca.transform` accepts.
    pca : object
        Fitted transformer exposing a `transform(X)` method.

    Returns
    -------
    The result of `pca.transform(X)`, returned as-is (no extra view or copy).
    """
    return pca.transform(X)

In [None]:
# Task 3

def select_digits(X_train, y_train, X_test, y_test, digits):
    """Restrict a train/test split to the samples labelled with given digits.

    Parameters
    ----------
    X_train, X_test : array
        Sample arrays indexed along their first axis.
    y_train, y_test : array
        Labels aligned with the first axis of the matching sample array.
    digits : sequence
        Label values to keep.

    Returns
    -------
    tuple
        (X_subtrain, y_subtrain, X_subtest, y_subtest), in that order.
    """
    def _keep(samples, labels):
        # Boolean mask: True wherever the label is one of the requested digits.
        wanted = np.isin(labels, digits)
        return samples[wanted], labels[wanted]

    return (*_keep(X_train, y_train), *_keep(X_test, y_test))



In [None]:
# Task 4
digits = [1, 8]
X_subtrain, y_subtrain, X_subtest, y_subtest = select_digits(Xtraindata.T, ytrainlabels, Xtestdata.T, ytestlabels, digits)

# Project both subsets onto the reduced PCA basis.
X_train_projected = project_data(X_subtrain, pca85)
X_test_projected = project_data(X_subtest, pca85)

# Train Ridge classifier with CV.
# RidgeClassifierCV is used instead of the RidgeCV regressor: a regressor's
# .score() is the R^2 of a fit to the raw label values, whereas a classifier's
# .score() is the mean accuracy that the message below reports.
classifier_18 = RidgeClassifierCV()
classifier_18.fit(X_train_projected, y_subtrain)
test_score_18 = classifier_18.score(X_test_projected, y_subtest)
print(f'Test accuracy of Ridge with CV between digit 1 and 8: {test_score_18}')

In [None]:
# Task 5
digits = [3,8]
X_subtrain, y_subtrain, X_subtest, y_subtest = select_digits(Xtraindata.T, ytrainlabels, Xtestdata.T, ytestlabels, digits)

# Project both subsets onto the reduced PCA basis.
X_train_projected = project_data(X_subtrain, pca85)
X_test_projected = project_data(X_subtest, pca85)

# Train Ridge classifier with CV.
# RidgeClassifierCV is used instead of the RidgeCV regressor: a regressor's
# .score() is the R^2 of a fit to the raw label values, whereas a classifier's
# .score() is the mean accuracy that the message below reports.
classifier_38 = RidgeClassifierCV()
classifier_38.fit(X_train_projected, y_subtrain)
test_score_38 = classifier_38.score(X_test_projected, y_subtest)
print(f'Test accuracy of Ridge with CV between digit 3 and 8: {test_score_38}')

In [None]:
digits = [2,7]
X_subtrain, y_subtrain, X_subtest, y_subtest = select_digits(Xtraindata.T, ytrainlabels, Xtestdata.T, ytestlabels, digits)

# Project both subsets onto the reduced PCA basis.
X_train_projected = project_data(X_subtrain, pca85)
X_test_projected = project_data(X_subtest, pca85)

# Train Ridge classifier with CV.
# RidgeClassifierCV is used instead of the RidgeCV regressor: a regressor's
# .score() is the R^2 of a fit to the raw label values, whereas a classifier's
# .score() is the mean accuracy that the message below reports.
classifier_27 = RidgeClassifierCV()
classifier_27.fit(X_train_projected, y_subtrain)
test_score_27 = classifier_27.score(X_test_projected, y_subtest)
print(f'Test accuracy of Ridge with CV between digit 2 and 7: {test_score_27}')

In [None]:
# Task 6

# All ten digits this time: project the complete train and test sets.
X_train_projected = project_data(X_train, pca85)
X_test_projected = project_data(X_test, pca85)

# Train Multi-class Ridge classifier
classifier_mul = RidgeClassifier()
classifier_mul.fit(X_train_projected, ytrainlabels)
test_score_mul = classifier_mul.score(X_test_projected, ytestlabels)
print(f'Test accuracy of Multi-class Ridge: {test_score_mul}')

# Train KNN classifier
# Named `n_neighbors` rather than `k` so the notebook-level `k` (the PCA mode
# count computed earlier) is not silently overwritten by this cell.
n_neighbors = 10
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train_projected, ytrainlabels)
test_score_knn = knn.score(X_test_projected, ytestlabels)
print(f'Test accuracy of KNN: {test_score_knn}')

# LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_projected, ytrainlabels)
test_score_lda = lda.score(X_test_projected, ytestlabels)
print(f'Test accuracy of LDA: {test_score_lda}')

In [None]:
# Task 7 Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# One-hot targets to pair with the 10-way softmax / categorical cross-entropy.
y_train_onehot = to_categorical(ytrainlabels, num_classes=10)
y_test_onehot = to_categorical(ytestlabels, num_classes=10)

# Take the input width from the projected data instead of hard-coding it, so
# the network still builds if the number of retained PCA modes changes.
n_features = X_train_projected.shape[1]

model = Sequential([
    Dense(128, activation='relu', input_shape=(n_features,)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train_projected, y_train_onehot, epochs=20, batch_size=128, validation_split=0.2)
NN_test_loss, NN_test_acc = model.evaluate(X_test_projected, y_test_onehot)
print(f'Test accuracy of Neural Network: {NN_test_acc}')