In [27]:
import numpy as np
import pandas as pd

In [28]:
from sklearn.datasets import make_blobs

#Synthetic dataset: 100 samples with 5 features scattered around 5 cluster centers
X, _ = make_blobs(
    n_samples=100,
    centers=5,
    n_features=5,
    cluster_std=2.5,
    random_state=42,
)
#Wrap the array in a DataFrame whose columns are named col_0 ... col_<n-1>
X = pd.DataFrame(X, columns=[f'col_{idx}' for idx in range(X.shape[1])])

In [263]:
#PRINCIPAL COMPONENT ANALYSIS
#Class parameters: n_components – number of principal components to keep
#PCA is done by computing the eigenvectors and eigenvalues of the covariance matrix 
#to identify the principal components.

class MyPCA():
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Parameters
    ----------
    n_components : int, default 3
        Number of leading principal components to keep. If it exceeds the
        number of features, all components are returned (plain slicing).

    Attributes
    ----------
    components_ : numpy.ndarray
        Set by ``fit_transform``. Columns are the kept eigenvectors of the
        covariance matrix, ordered by decreasing eigenvalue.
    explained_variance_ : numpy.ndarray
        Set by ``fit_transform``. Eigenvalues matching ``components_``.
    """

    def __init__(self, n_components=3):
        self.n_components = n_components

    def __repr__(self):
        return f'MyPCA class: n_components={self.n_components}'

    def fit_transform(self, X):
        """Project ``X`` onto its first ``n_components`` principal components.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like
            Numeric feature matrix, one row per sample. It is never modified.

        Returns
        -------
        pandas.DataFrame
            The centered data multiplied by the kept eigenvectors, with a
            default integer index and integer column labels.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        #Convert to a float array; works for DataFrames and plain array-likes,
        #and lets integer data be centered without a casting error
        X_numpy = np.asarray(X, dtype=float)
        if X_numpy.ndim != 2:
            raise ValueError(
                f'X must be 2-dimensional (samples x features), got {X_numpy.ndim} dimension(s)'
            )

        #Center the data (returns a new array, the caller's data stays intact)
        X_norm = self.normalize(X_numpy)

        #Calculate covariance matrix
        cov_mat = self.covariance_matrix(X_norm)

        #Compute eigenvalues and eigenvectors of the covariance matrix:
        #W is an eigenvector and L its eigenvalue if cov_mat*W = L*W.
        #eigh is used because the covariance matrix is symmetric.
        L, W = np.linalg.eigh(cov_mat)

        #Sort eigenvalues with corresponding eigenvectors in descending order
        #(eigenvectors are the COLUMNS of W, hence the column indexing)
        idx_sorted = np.argsort(L)[::-1]
        L_sort, W_sort = L[idx_sorted], W[:, idx_sorted]

        #First N = n_components eigenvectors are the principal components
        Wpca = W_sort[:, :self.n_components]
        self.components_ = Wpca
        self.explained_variance_ = L_sort[:self.n_components]

        #Reducing the data dimensions: X_reduced = X_norm*Wpca
        #(the centered array is used explicitly, not the raw input)
        X_reduced = pd.DataFrame(np.dot(X_norm, Wpca))

        return X_reduced

    def normalize(self, X):
        """Return a mean-centered copy of ``X``.

        Only the per-column mean is subtracted; columns are not rescaled.
        The input array is left untouched.
        """
        X = np.asarray(X, dtype=float)
        return X - X.mean(axis=0)

    def covariance_matrix(self, X):
        """Return the sample covariance matrix of the columns of ``X``.

        Columns are centered internally and the sums are divided by
        ``n_samples - 1``, so the result is a square matrix with one
        row/column per feature.
        """
        X = np.asarray(X, dtype=float)
        centered = X - X.mean(axis=0)
        #A single matrix product replaces the element-by-element double loop
        return np.dot(centered.T, centered) / (X.shape[0] - 1)

In [265]:
#Reduce the synthetic dataset to three principal components (the class default)
test = MyPCA(n_components=3)
test.fit_transform(X)

Unnamed: 0,0,1,2
0,1.815826,2.771472,-0.063183
1,4.254430,3.480621,4.506728
2,-1.120616,3.527066,-7.895402
3,1.792970,2.693932,-5.732699
4,3.500805,2.632063,-3.226070
...,...,...,...
95,3.494835,5.227439,1.026015
96,5.339353,0.807903,6.341257
97,4.088706,-0.614011,2.438080
98,-3.076733,7.780669,-0.590571
