In [1]:
import numpy as np
import pandas as pd

## Loading the Data

In [2]:
# Read the wholesale-customers CSV (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('Wholesale customers data.csv')
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [3]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Region            440 non-null    int64
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB


In [4]:
# Convert every column of the frame to a NumPy array: rows are samples, columns are features.
data = df.to_numpy()

In [5]:
def shifted(data):
    """Mean-centre ``data`` column by column.

    The mean is taken along axis 0, so for a (samples, features) array each
    feature ends up with zero mean. No scaling is applied.

    Parameters
    ----------
    data : array-like
        Input values; the column means are computed with ``np.mean(axis=0)``.

    Returns
    -------
    numpy.ndarray
        ``data`` with the per-column mean subtracted from every row.
    """
    column_means = np.mean(data, axis=0)
    return data - column_means

## Covariance Matrix

In [6]:
def calculate_cov(data):
    """Return ``data @ data.T`` divided by the number of columns of ``data``.

    The function does not centre its input. When ``data`` is a mean-centred
    (features, samples) matrix the result is the population covariance
    matrix (normalised by N, not N - 1).

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; ``data.shape[1]`` is used as the sample count.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row/column per row of ``data``.
    """
    scatter = data @ data.T
    sample_count = data.shape[1]
    return scatter / sample_count

In [7]:
# NOTE: despite the name, this is only mean-centred by `shifted` (no division by the
# standard deviation). The transpose makes rows = features and columns = samples.
standardized_data = shifted(data).T
standardized_data

array([[ 6.77272727e-01,  6.77272727e-01,  6.77272727e-01, ...,
         6.77272727e-01, -3.22727273e-01, -3.22727273e-01],
       [ 4.56818182e-01,  4.56818182e-01,  4.56818182e-01, ...,
         4.56818182e-01,  4.56818182e-01,  4.56818182e-01],
       [ 6.68702273e+02, -4.94329773e+03, -5.64729773e+03, ...,
         2.53070227e+03, -1.71029773e+03, -9.21329773e+03],
       ...,
       [-2.85793182e+03, -1.30993182e+03, -6.66931818e+02, ...,
        -2.63493182e+03, -2.03393182e+03, -3.00693182e+03],
       [-2.07493182e+02,  4.11506818e+02,  6.34506818e+02, ...,
         1.19595068e+04, -2.71349318e+03, -2.40449318e+03],
       [-1.86870455e+02,  2.51129545e+02,  6.31912955e+03, ...,
         3.42129545e+02,  6.00129545e+02, -1.47287045e+03]])

In [8]:
# Covariance of the centred (features x samples) matrix, displayed for inspection.
cov = calculate_cov(standardized_data)
cov

array([[ 2.18574380e-01,  2.24276860e-02, -9.99155176e+02,
         1.58789373e+03,  2.70173552e+03, -4.58052996e+02,
         1.41613175e+03,  7.37645351e+01],
       [ 2.24276860e-02,  5.98135331e-01,  5.40165553e+02,
         1.84085108e+02,  5.64971178e+01, -7.89220558e+01,
        -5.46106921e+00,  9.84976395e+01],
       [-9.99155176e+02,  5.40165553e+02,  1.59591393e+08,
         9.36046630e+06, -1.42147481e+06,  2.11883895e+07,
        -6.13385338e+06,  8.70747517e+06],
       [ 1.58789373e+03,  1.84085108e+02,  9.36046630e+06,
         5.43461719e+07,  5.09670882e+07,  4.43251525e+06,
         2.32354154e+07,  8.43870224e+06],
       [ 2.70173552e+03,  5.64971178e+01, -1.42147481e+06,
         5.09670882e+07,  9.01048535e+07, -1.85006764e+06,
         4.17999733e+07,  5.49477470e+06],
       [-4.58052996e+02, -7.89220558e+01,  2.11883895e+07,
         4.43251525e+06, -1.85006764e+06,  2.35142899e+07,
        -3.03740599e+06,  5.34017735e+06],
       [ 1.41613175e+03, -5.461069

## Calculate the Eigenvalues & Eigenvectors

In [9]:
def eigen(cov):
    """Eigen-decompose a symmetric matrix, largest eigenvalue first.

    Uses ``np.linalg.eigh`` rather than ``np.linalg.eig``: a covariance matrix
    is symmetric, and ``eigh`` guarantees real eigenvalues and an orthonormal
    set of eigenvectors (also for repeated eigenvalues), which the projection
    and reconstruction steps of PCA rely on.

    Parameters
    ----------
    cov : numpy.ndarray
        Square symmetric (or Hermitian) matrix. ``eigh`` reads only one
        triangle of the matrix (the lower one by default), so an asymmetric
        input is not detected.

    Returns
    -------
    sorted_eigenvalues : numpy.ndarray
        Eigenvalues in descending order.
    sorted_eigenvectors : numpy.ndarray
        Matrix whose column ``i`` is the unit eigenvector belonging to
        ``sorted_eigenvalues[i]``.
    """
    eigen_values, eigen_vectors = np.linalg.eigh(cov)
    # sort the eigenvectors and eigenvalues descending in terms of the eigenvalues
    sorted_index = np.argsort(eigen_values)[::-1]
    sorted_eigenvalues = eigen_values[sorted_index]
    sorted_eigenvectors = eigen_vectors[:, sorted_index]

    return sorted_eigenvalues, sorted_eigenvectors

## PCA

In [10]:
def pca(data,k,eigenvectors):
    """Project ``data`` onto the first ``k`` columns of ``eigenvectors``.

    Parameters
    ----------
    data : numpy.ndarray
        Matrix to project; its first dimension must match the row count of
        ``eigenvectors``.
    k : int
        Number of leading eigenvector columns to keep.
    eigenvectors : numpy.ndarray
        Matrix with one eigenvector per column.

    Returns
    -------
    F : numpy.ndarray
        The projection, i.e. the transposed selected columns times ``data``.
    top_eigenvectors : numpy.ndarray
        The ``k`` selected eigenvector columns.
    """
    basis = eigenvectors[:, :k]
    projection = basis.T.dot(data)
    return projection, basis

In [11]:
def calculate_error(F,top_eigenvectors,data):
    """Reconstruct the data from its projection and measure the mean squared error.

    The reconstruction is ``top_eigenvectors @ F`` plus the per-column mean of
    ``data`` (added back as a column vector), and is compared with ``data.T``.

    Parameters
    ----------
    F : numpy.ndarray
        Projected data, as returned by ``pca``.
    top_eigenvectors : numpy.ndarray
        The eigenvector columns used for the projection.
    data : numpy.ndarray
        The original, un-centred data; it is transposed before the comparison,
        and its axis-0 mean is what gets added back.

    Returns
    -------
    error : float
        Mean of the squared element-wise differences.
    reconstruction : numpy.ndarray
        The reconstructed data, transposed back to the layout of ``data``.
    """
    feature_means = np.mean(data, axis=0).reshape(-1, 1)
    reconstruction = np.dot(top_eigenvectors, F) + feature_means

    residual = data.T - reconstruction
    error = np.mean(residual ** 2)

    return error, reconstruction.T

In [12]:
# Decompose the covariance matrix; `eigen` returns the pairs sorted by descending eigenvalue.
eigenvalues, eigenvectors = eigen(cov)
print(f"eigenvalues: {eigenvalues}")
print(f"eigenvectors:\n{eigenvectors}")

eigenvalues: [1.64620913e+08 1.45121525e+08 2.50828422e+07 1.57679825e+07
 5.38050736e+06 2.19863238e+06 5.94275972e-01 1.21777800e-01]
eigenvectors:
[[-4.20223371e-06  2.42933998e-05 -1.40636434e-05 -3.48624788e-06
   6.86267242e-08  3.10033196e-05  4.54785678e-02  9.98965314e-01]
 [ 3.32885015e-06  5.75410266e-07 -1.81488052e-06 -1.10408563e-05
   1.33539849e-05 -2.41744091e-07  9.98965314e-01 -4.54785679e-02]
 [ 9.76536846e-01 -1.10613856e-01 -1.78557260e-01 -4.18764803e-02
  -1.59859967e-02  1.57631603e-02 -3.58651763e-06  3.81310708e-06]
 [ 1.21184070e-01  5.15802159e-01  5.09886754e-01 -6.45640468e-01
  -2.03235658e-01 -3.34918735e-02 -4.46809064e-06 -5.85188736e-06]
 [ 6.15403925e-02  7.64606385e-01 -2.75780885e-01  3.75460488e-01
   1.60291504e-01 -4.10938945e-01  3.88476930e-07 -8.18247967e-06]
 [ 1.52364619e-01 -1.87234545e-02  7.14200374e-01  6.46292316e-01
  -2.20186117e-01  1.32889796e-02  1.14587246e-05  1.24874250e-05]
 [-7.05417374e-03  3.65350762e-01 -2.04409871e-01  1

In [13]:
# Sweep k from "keep every component" down to a single component and report the
# reconstruction MSE against the original data for each k.
num_features = data.shape[1]  # derived from the data instead of hard-coding 8
for k in range(num_features, 0, -1):
    data_reduced, top_eigenvectors = pca(standardized_data, k, eigenvectors)
    error, inverse = calculate_error(data_reduced, top_eigenvectors, data)
    print(f"Error for {k}: {error}")

Error for 8: 8.164908929736995e-23
Error for 7: 0.015222225238855192
Error for 6: 0.08950672197130735
Error for 5: 274829.1367156728
Error for 4: 947392.5572440795
Error for 3: 2918390.375563545
Error for 2: 6053745.648714563
Error for 1: 24193936.25458759


- Since the data has 8 features, there are __8 Q matrices__ — one for each number of retained components, k = 1, …, 8.
- Using __MSE__ to compare the original data with the reconstruction (inverse transform) of the reduced data for each Q matrix:
    - The best trade-off — reducing the dimensionality without changing the original feature vectors too much — is __k = 6__: the error stays below 0.1 down to k = 6 and then jumps to about 2.7 × 10^5 at k = 5.