# Preparing data

1. Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

2. Load the dataset

In [8]:
# Read the array stored in fashion_train.npy and report its dimensions.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact directory layout exists — consider a configurable data dir.
train_file = 'C:/Users/annam/Desktop/ITU/3rd_sem/01_Machine_Learning/Project/ML_Project/datasets/fashion_train.npy'
dataset = np.load(train_file)
print("Dataset shape: ", dataset.shape)

Dataset shape:  (10000, 785)


3. Separate the feature columns and the label column

In [38]:
# Split the loaded array column-wise: every column except the last forms the
# feature matrix X, and the last column is the label vector y.
X, y = dataset[:, :-1], dataset[:, -1]

# Sanity check of the split (the recorded run shows (10000, 784) and (10000,)).
print("X shape: ", X.shape)
print("y shape: ", y.shape)

X shape:  (10000, 784)
y shape:  (10000,)


# Doing math for the LDA

Process:
1. Calculate the mean of each class: \
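    $m_i = \frac{\sum_t r_i^t x^t}{\sum_t r_i^t}$ - sum of the observations belonging to class $i$ / number of observations in that class (where $r_i^t = 1$ if observation $x^t$ belongs to class $i$, and $0$ otherwise)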
2. Calculate the within-class scatter of each class - a $k \times k$ matrix, where $k$ is the number of features \
     $S_i = \sum r^t_i(x^t - m_i)(x^t - m_i)^T$ \
     and sum it together: \
     $S_w = \sum_i S_i$
3. Calculate the overall mean \
    $M = \frac{1}{K}\sum_{i=1}^{K} m_i$, where $K$ is the number of classes
4. Calculate the between class scatter (how much the class means are scattered around the overall mean) - same as calculating the covariance matrix ($k \times k$ matrix) \
    $S_b = \sum N_i (m_i - M)(m_i - M)^T$ \
    where $N_i = \sum_t r_i^t$ - the number of observations in the $i^{th}$ class
5. Find the two eigenvectors of $S_w^{-1} S_b$ corresponding to the two largest eigenvalues
6. Project the data onto them
7. Plot



1. mean of each class

In [25]:
# Distinct class labels in ascending order. np.unique is used rather than
# set(y) because a set has no guaranteed iteration order, while the rows of
# class_means must follow a well-defined, reproducible label order.
classes = np.unique(y)

# One row per class, in the same order as `classes`: the mean feature vector
# of all samples carrying that label.
class_means = np.array([X[y == label].mean(axis=0) for label in classes])

2. within scatter

In [42]:
# Within-class scatter accumulator: a square matrix with one row/column per
# feature (X.shape[1]), started at zero so re-running the cell is idempotent.
scatter_within = np.zeros((X.shape[1], X.shape[1]))

# Add each class's scatter S_i = (X_i - m_i)^T (X_i - m_i).
# Labels are paired with their mean by position (classes and class_means share
# the same order), so this no longer assumes the labels are exactly 0..K-1.
for label, class_mean in zip(classes, class_means):
    centered = X[y == label] - class_mean  # computed once, used on both sides
    scatter_within += np.dot(centered.T, centered)

# NOTE(review): the markdown formula defines S_w as the plain sum of the S_i;
# this extra division by the total number of samples rescales S_w by 1/N.
# Confirm that downstream steps expect the normalised matrix.
scatter_within = scatter_within / X.shape[0]

3. Central point (mean of the means)

In [44]:
# Overall mean M: the unweighted average of the per-class mean vectors
# (averaged along axis 0 of class_means).
central_point = class_means.mean(axis=0)

4. between class scatter 