# This PCA implementation is based on the TA (PCA) session of Chrisanna Cornish on 17/11/2023

# Cornish, C. K. (2023). Principal component analysis. Exercise Session 21, IT University of Copenhagen, 17/11/2023

# Imports and plotting matter

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
# Reset every matplotlib setting to the library defaults before customizing
mpl.rcParams.update(mpl.rcParamsDefault)
plt.rcParams.update({'text.usetex': True}) # render plot text with LaTeX to make plots nicer
%config InlineBackend.figure_format = 'svg' # IPython magic: show inline figures as SVG to make plots nicer
# If the program reports a LaTeX error, comment out the 'text.usetex' line above

# Global font sizes, set in a single call: axes titles at 20; axis labels,
# x/y tick labels and legend text at 16
plt.rcParams.update({
    'axes.titlesize': 20,
    'axes.labelsize': 16,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'legend.fontsize': 16,
})

# Loading the dataset

In [None]:
# Load the training set from a NumPy .npy file (relative path, so it is
# resolved against the current working directory)
train = np.load('fashion_train.npy')

# Splitting the dataset into features and labels

In [None]:
# Split the training array column-wise: every column except the last one
# holds the observations, the last column holds the labels
X_train, Y_train = train[:, :-1], train[:, -1]

# Replace the integer labels with strings

In [None]:
# https://www.geeksforgeeks.org/replacing-strings-with-numbers-in-python-for-data-analysis/
# Lookup table from integer class label to clothing name
fashion_dict = {
    0: 'T-shirt-top',
    1: 'Trouser',
    2: 'Pullover',
    3: 'Dress',
    4: 'Shirt',
}
# Translate every label; Y_train becomes a plain Python list of strings
Y_train = [fashion_dict[code] for code in Y_train]

# Data preprocessing 

In [None]:
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler instance with default settings; it is fitted on the training observations in the next cell
stdscalar = StandardScaler()

In [None]:
# Fit the scaler on the training observations and standardize them in one step
X_train_std = stdscalar.fit_transform(X_train)

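Our goal is to maximize $\mathrm{Var}(a_1^T X) = a_1^T S a_1$ subject to $a_1^T a_1 = 1$. Since $S a_1 = \lambda_1 a_1$, this gives $a_1^T S a_1 = a_1^T \lambda_1 a_1 = \lambda_1$ \\ 
Where $S$ is the covariance matrix, $a_1$ is its unit-length eigenvector with the largest eigenvalue, and $\lambda_1$ is that eigenvalue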

# Calculating the covariance matrix 

In [None]:
# np.cov treats each row as one variable, so the standardized data is
# transposed first to put the features on the rows
cov_mat = np.cov(X_train_std.T)

# Eigenvalue decomposition

In [None]:
# Eigendecomposition of the covariance matrix.
# A covariance matrix is symmetric, so the dedicated symmetric solver
# np.linalg.eigh is used: it always returns real eigenvalues and orthonormal
# eigenvectors. The general-purpose np.linalg.eig can return a complex dtype
# (tiny spurious imaginary parts), which would break the sorting, the
# ':.2f' formatting and the plots further down.
# NOTE: the sign of an eigenvector is arbitrary, so individual components may
# come out mirrored compared to results obtained with np.linalg.eig.
eigen_vals, eigen_vecs = np.linalg.eigh(cov_mat)

# eigh returns the eigenvalues in ascending order; PCA wants the largest
# variance first, so build the index of a descending sort
order = np.argsort(eigen_vals)[::-1]

# apply the same permutation to the values and to the vectors
# (the eigenvectors are the columns of eigen_vecs)
sort_eigen_vals = eigen_vals[order]
sort_eigen_vecs = eigen_vecs[:, order]

In [None]:
# Keep only the first five eigenvalues of the array sorted in descending order,
# i.e. the five largest ones.
# NOTE(review): this overwrites sort_eigen_vals, so the full sorted spectrum
# is no longer available after this cell has run.
sort_eigen_vals = sort_eigen_vals[:5]
# bare expression: lets the notebook display the five values
sort_eigen_vals

In [None]:
# Project the standardized observations onto the sorted eigenvectors:
# column k of pcs holds the scores on the k-th principal component
pcs = np.matmul(X_train_std, sort_eigen_vecs)

In [None]:
# All principal component scores as a dataframe (default integer column names).
# NOTE(review): df is not read by any other cell visible in this notebook.
df = pd.DataFrame(pcs)

# scatter plot of the first two principal components

In [None]:
# Extract the first two principal components (first two columns of pcs)
# for the 2D scatter plot
PCA = pcs[:,:2]

In [None]:
# Store the first two principal components in a dataframe with the column
# names 'PC1' and 'PC2'; the default integer row index matches the row
# positions of the observations
principal_df = pd.DataFrame(PCA, columns=['PC1', 'PC2'])

In [None]:
# save principal_df as a csv file
#principal_df.to_csv('principal_df.csv')

In [None]:
# Scatter plot of the observations in the PC1/PC2 plane, coloured by label
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(
    data=principal_df,
    x='PC1',
    y='PC2',
    hue=Y_train,
    palette='nipy_spectral',
    alpha=0.5,
    s=50,
    ax=ax,
)
ax.legend(bbox_to_anchor=(0.70, 0.72))
ax.set_title('PC1 and PC2 of the Fashion MNIST dataset (from scratch)')
# write the figure to disk before it is shown
fig.savefig('PCA_from_scratch.png', dpi=300, bbox_inches='tight')
plt.show();

# Plotting the pair plot of the first five principal components

In [None]:
# Keep the first five principal components for the pair plot
PCA5 = pcs[:, :5]
# Wrap them in a dataframe with the columns PCA1..PCA5 and attach the label
# of every observation as an extra 'Label' column
PCA_5 = pd.DataFrame(
    PCA5,
    columns=[f'PCA{k}' for k in range(1, 6)],
).assign(Label=Y_train)

In [None]:
# Pair plot of the first five principal components, coloured by label.
# sns.pairplot creates its own figure, so no plt.figure() call precedes it:
# that call only left an extra, empty figure behind.
g = sns.pairplot(PCA_5, hue='Label', palette='nipy_spectral', plot_kws={"s": 20, "alpha": 0.5}, corner=True)
# title and save through the grid's own figure instead of relying on
# whichever figure pyplot currently considers active
g.figure.suptitle('Pairplot of the 5 PCA components of the MNIST-Fashion dataset', fontsize=20)
# https://stackoverflow.com/questions/71907322/seaborn-pariplot-how-to-move-legend-and-set-style
sns.move_legend(g, bbox_to_anchor=(0.85, 0.65), loc='center right')
g.figure.savefig('PCA_Pair.png', dpi=300, bbox_inches='tight')
plt.show();

# visualizing the first 5 eigenvectors and their corresponding eigenvalues 


In [None]:
# First five columns of the sorted eigenvector matrix, i.e. the eigenvectors
# belonging to the five largest eigenvalues
first_five_evec = sort_eigen_vecs[:,:5]

In [None]:
# Reshape each of the first five eigenvectors (the columns of
# first_five_evec) into a 28x28 image
e_vec_1, e_vec_2, e_vec_3, e_vec_4, e_vec_5 = (
    first_five_evec[:, col].reshape(28, 28) for col in range(5)
)

In [None]:
# https://github.com/MrDataScience/tutorials/blob/master/Data/MNIST/How%20To%20Plot%20MNIST%20Digits%20Using%20Matplotlib.ipynb
# https://complex-valued-neural-networks.readthedocs.io/en/latest/code_examples/fashion_mnist.html
# Show the first five eigenvectors as 28x28 images in one row, each titled
# with its eigenvalue, and save the figure as 'eigen_images.png'.
eigen_images = (e_vec_1, e_vec_2, e_vec_3, e_vec_4, e_vec_5)
fig, axs = plt.subplots(1, 5, figsize=(15, 5))
plt.suptitle("Visualization of the first 5 eigenvectors", fontsize=20, y=0.85)
for k, (ax, image) in enumerate(zip(axs, eigen_images)):
    ax.imshow(image, cmap='binary')
    # hide the pixel-coordinate ticks, they carry no information here
    ax.set_xticks([])
    ax.set_yticks([])
    # raw f-string: '\l' is not a valid escape sequence, so a non-raw string
    # triggers an invalid-escape warning; the resulting title text is unchanged
    ax.set_title(rf'$\lambda_{k + 1}$ = {sort_eigen_vals[k]:.2f}', fontsize=16)
plt.savefig('eigen_images.png', dpi=300, bbox_inches='tight')
plt.show();

In [None]:
# Row indices of selected observations that were found by conditional filtering with the pandas module (see the notebook 1.PCA_05.ipynb, the files have been separated for more clarity)
# NOTE(review): the name `list` shadows the Python builtin in every cell run after this one; it is kept because later cells read it - consider renaming it in all of them together.
list = [198, 35, 3607, 162, 30, 1625, 100, 585, 1230]

In [None]:
# https://github.com/MrDataScience/tutorials/blob/master/Data/MNIST/How%20To%20Plot%20MNIST%20Digits%20Using%20Matplotlib.ipynb
# https://complex-valued-neural-networks.readthedocs.io/en/latest/code_examples/fashion_mnist.html
# Show the nine selected observations as 28x28 images in a 3x3 grid (filled
# row by row), each titled with its running number and its PC1/PC2
# coordinates, then save the figure as JPG and PNG.
# NOTE: `list` is this notebook's variable holding the selected row indices
# (it shadows the builtin of the same name).
fig, axs = plt.subplots(3, 3, figsize=(10, 10))
for number, (ax, row) in enumerate(zip(axs.flat, list), start=1):
    ax.imshow(X_train[row].reshape(28, 28), cmap='binary')
    ax.set_xticks([])
    ax.set_yticks([])
    # look the coordinates up by column label: integer keys on a Series that
    # is indexed by 'PC1'/'PC2' are deprecated positional access in pandas
    pc1 = principal_df.loc[row, 'PC1']
    pc2 = principal_df.loc[row, 'PC2']
    ax.set_title(f"{number}) PCA1: {pc1:.2f}, PCA2: {pc2:.2f}", fontsize=16)

# set the vertical spacing before saving so that both files share one layout
fig.subplots_adjust(hspace=0.2)
plt.savefig("listof_dots.jpg", dpi=300, bbox_inches='tight')
plt.savefig('listof_dots.png', dpi=300, bbox_inches='tight')
plt.show();

In [None]:
# Collect the (PC1, PC2) coordinates of the selected observations as a list
# of tuples. `list` is this notebook's variable with the selected row indices
# (it shadows the builtin of the same name).
# The values are looked up by column label: integer keys on a Series that is
# indexed by 'PC1'/'PC2' are deprecated positional access in pandas.
coordinate_list = [
    (principal_df.loc[i, 'PC1'], principal_df.loc[i, 'PC2'])
    for i in list
]


In [None]:
# bare expression: lets the notebook display the collected coordinates
coordinate_list

In [None]:
# Turn the coordinate tuples into numbered strings (numbering starts at 1)
# that serve as legend entries
labels = [
    f"{number}) PCA1: {pc1:.2f}, PCA2: {pc2:.2f}"
    for number, (pc1, pc2) in enumerate(coordinate_list, start=1)
]
# bare expression: lets the notebook display the strings
labels

In [None]:
import matplotlib.lines as mlines
# https://matplotlib.org/stable/api/_as_gen/matplotlib.lines.Line2D.html
# https://stackoverflow.com/questions/47391702/how-to-make-a-colored-markers-legend-from-scratch
# https://matplotlib.org/stable/users/explain/axes/legend_guide.html
# One marker shape per selected observation, in the order of the selection
marker_shapes = ('s', 'D', 'o', '^', '*', 'P', 'X', 'p', '<')
# Data-less black Line2D objects without a connecting line: they only serve
# as legend handles
list_of_markers = [
    mlines.Line2D([], [], color='black', marker=shape, ls='')
    for shape in marker_shapes
]
# individual names for the nine handles
one, two, three, four, five, six, seven, eight, nine = list_of_markers


In [None]:
# https://matplotlib.org/stable/api/_as_gen/matplotlib.lines.Line2D.html
# https://stackoverflow.com/questions/47391702/how-to-make-a-colored-markers-legend-from-scratch
# https://matplotlib.org/stable/users/explain/axes/legend_guide.html
# plot markers of the selected clothing
fig, ax = plt.subplots(figsize=(8,8))
sns.scatterplot(data = principal_df, x ='PC1', y ='PC2', palette='nipy_spectral', hue=Y_train, alpha=0.5, s=50)
plt.title('PC1 and PC2 of the Fashion MNIST dataset')
plt.scatter(coordinate_list[0][0], coordinate_list[0][1], color='black', edgecolors='white', marker='s', s=150)
plt.scatter(coordinate_list[1][0], coordinate_list[1][1], color='black', edgecolors='white',marker='D', s=150)
plt.scatter(coordinate_list[2][0], coordinate_list[2][1], color='black', edgecolors='white',marker='o', s=150)
plt.scatter(coordinate_list[3][0], coordinate_list[3][1], color='black', edgecolors='white',marker='^', s=150)
plt.scatter(coordinate_list[4][0], coordinate_list[4][1], color='black', edgecolors='white',marker='*', s=150)
plt.scatter(coordinate_list[5][0], coordinate_list[5][1], color='black', edgecolors='white',marker='P', s=150)
plt.scatter(coordinate_list[6][0], coordinate_list[6][1], color='black', edgecolors='white',marker='X', s=150)
plt.scatter(coordinate_list[7][0], coordinate_list[7][1], color='black', edgecolors='white',marker='p', s=150)
plt.scatter(coordinate_list[8][0], coordinate_list[8][1], color='black', edgecolors='white',marker='<', s=150)
plt.legend(labels=labels, handles=list_of_markers, fontsize=14, bbox_to_anchor=(0.8, 1))
plt.savefig(f"PCA_dotted.jpg", dpi=300, bbox_inches='tight')
plt.show();