# Example of PCA using the correlation matrix

Author: Edrei Santos

In [138]:
# Importing data and libraries
import numpy as np
import polars as pl
import scipy as sp

# Read the raw spreadsheet, discard its first column, and keep the remaining
# columns as a plain NumPy matrix (one row per observation).
# NOTE(review): the first column is presumably a row label/identifier — confirm.
raw_frame = pl.read_excel("dados_brutos_cap3_mingoti.xlsx")
first_column = raw_frame.columns[0]
data = raw_frame.drop(first_column).to_numpy()

print(data)

[[ 9893   564 17689]
 [ 8776   389 17359]
 [13572  1103 18597]
 [ 6455   743  8745]
 [ 5129   203 14397]
 [ 5432   215  3467]
 [ 3807   385  4679]
 [ 3423   187  6754]
 [ 3708   127  2275]
 [ 3294   297  6754]
 [ 5433   432  5589]
 [ 6287   451  8972]]


I will now check whether each pairwise correlation in the correlation matrix of the data passes a t-test at the 5% significance level (α = 0.05).

In [139]:
# Pearson correlation matrix; rowvar=False treats each column as a variable.
corr_matrix = np.corrcoef(data, rowvar=False)
print("Correlation Matrix \n\n", corr_matrix)

# p-value of every pairwise Pearson correlation.
# The matrix is sized from the data instead of a hard-coded (3, 3), so the
# cell keeps working if the number of variables changes.
n_vars = data.shape[1]
p_values = np.zeros((n_vars, n_vars))
for j in range(n_vars):
    # The test is symmetric in its two arguments, so each pair is evaluated
    # once and mirrored. The diagonal (a variable against itself, r = 1)
    # is left at 0.
    for i in range(j + 1, n_vars):
        pair_p = sp.stats.pearsonr(data[:, j], data[:, i])[1]
        p_values[j, i] = pair_p
        p_values[i, j] = pair_p
print("\n P Values Matrix \n\n", p_values)

Correlation Matrix 

 [[1.         0.82734786 0.82625598]
 [0.82734786 1.         0.57650285]
 [0.82625598 0.57650285 1.        ]]

 P Values Matrix 

 [[0.         0.00089716 0.00092409]
 [0.00089716 0.         0.04974535]
 [0.00092409 0.04974535 0.        ]]


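As we can see, every pairwise correlation is significant at α = 0.05, although the weakest one (between the second and third variables) passes only narrowly, with p ≈ 0.0497. The next step is to compute the eigenvalues and eigenvectors of the correlation matrix.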

In [140]:
# Eigendecomposition of the correlation matrix. Column k of `eigvec` is the
# eigenvector paired with eigval[k]; np.linalg.eig does not sort its results.
eig_result = np.linalg.eig(corr_matrix)
eigval, eigvec = eig_result[0], eig_result[1]
print("Eigenvalues: ", eigval)
print("Eigenvectors: \n", eigvec)

Eigenvalues:  [2.49253211 0.08396976 0.42349813]
Eigenvectors: 
 [[ 0.61670267  0.78719515 -0.00126721]
 [ 0.55679445 -0.43733949 -0.70619694]
 [ 0.556469   -0.43480796  0.70801432]]


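Components are selected with the Kaiser criterion: keep only the components whose eigenvalue is greater than 1. Only one component qualifies, the one with the highest eigenvalue (≈ 2.49, about 83% of the total variance, since the eigenvalues of a 3 × 3 correlation matrix sum to 3), so it is the only one we will use.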

Before we apply this eigenvector to the dataset, we need to standardize each column of the data matrix (subtract the column mean and divide by the column standard deviation).

In [141]:
# Z-score each column: subtract the column mean and divide by the column
# standard deviation (NumPy's default, i.e. ddof=0).
data_means = data.mean(axis=0)
data_std = data.std(axis=0)
centered = data - data_means
data = centered / data_std
print(data)

[[ 1.2253401   0.52695535  1.43917645]
 [ 0.84782709 -0.13489048  1.38041699]
 [ 2.46873346  2.5654405   1.60085399]
 [ 0.06339763  1.20392908 -0.15338296]
 [-0.38475122 -0.83833805  0.85300633]
 [-0.28234616 -0.79295434 -1.0931782 ]
 [-0.83154818 -0.15001839 -0.87737073]
 [-0.96132884 -0.89884967 -0.50789837]
 [-0.86500725 -1.12576824 -1.30542449]
 [-1.00492703 -0.48283229 -0.50789837]
 [-0.28200819  0.02773449 -0.71533707]
 [ 0.00661859  0.09959204 -0.11296358]]


Now that the data is standardized, I can project it onto the eigenvector of the selected component to obtain one score per company.

In [142]:
# np.linalg.eig returns eigenvalues in no particular order, so the leading
# component is selected by its eigenvalue rather than assumed to sit in
# column 0.
leading = np.argmax(eigval)

# Project the standardized observations onto the leading eigenvector:
# one score per row.
data = data @ eigvec[:, leading]
print(data)

[ 1.84993341e+00  1.21591023e+00  3.84172319e+00  6.24085655e-01
 -2.29387498e-01 -1.22395599e+00 -1.08457700e+00 -1.37595827e+00
 -1.88670206e+00 -1.17120922e+00 -5.56535700e-01 -3.32673327e-03]


In [143]:
# Pair every score with a 1-based company number (row position + 1) and show
# the companies ordered from the highest score to the lowest.
df = {"Company": range(data.shape[0]), "Scores": data}
data = pl.DataFrame(df).with_columns(pl.col("Company") + 1)
data.sort("Scores", descending=True)

Company,Scores
i64,f64
3,3.841723
1,1.849933
2,1.21591
4,0.624086
12,-0.003327
…,…
7,-1.084577
10,-1.171209
6,-1.223956
8,-1.375958


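By this method, company 3 is the best (highest score) and company 9 is the worst (lowest score). Because all three loadings of the selected component are positive, a higher score corresponds to higher values across all three original variables.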