In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [71]:
class PCA():
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Parameters
    ----------
    n_components : int
        Number of leading principal axes to keep.

    Attributes (populated by ``fit``)
    ---------------------------------
    components : ndarray of shape (n_components, n_features)
        Principal axes, one per ROW, ordered by decreasing eigenvalue.
    mean : ndarray of shape (n_features,)
        Per-feature mean of the data passed to ``fit``.
    eigenValues : ndarray of shape (n_features,)
        All eigenvalues of the covariance matrix, sorted in descending order so
        that ``eigenValues[i]`` belongs to ``components[i]``.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.eigenValues = None

    def fit(self, X):
        """Estimate the mean, the eigenvalues and the leading principal axes of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric data; rows are samples, columns are features.
        """
        X = np.asarray(X, dtype=float)
        self.mean = np.mean(X, axis=0)
        X = X - self.mean

        # Feature-by-feature covariance (columns are the variables).
        # atleast_2d keeps the single-feature case a 1x1 matrix.
        cov = np.atleast_2d(np.cov(X, rowvar=False))

        # The covariance matrix is symmetric, so use eigh: it guarantees real
        # eigenvalues and orthonormal eigenvectors (eig may return complex
        # values through round-off).
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # Sort by decreasing eigenvalue. The eigenvectors are the COLUMNS of
        # the returned matrix, so reorder columns and then transpose so that
        # each row is one eigenvector.
        order = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order].T

        self.eigenValues = eigenvalues

        # Keep the first n eigenvectors (rows).
        self.components = eigenvectors[:self.n_components]

    def transform(self, X):
        """Project X onto the principal axes found by ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_components)
            Coordinates of the centered samples along each kept axis.
        """
        X = X - self.mean
        return np.dot(X, self.components.T)


# Dataset Implementation

In [72]:
# Load the PC3 dataset from a CSV file (path is relative to the working directory)
pc3 = pd.read_csv("Dataset/pc3.csv")

In [73]:
# (rows, columns) of the loaded frame
pc3.shape

(1563, 38)

In [74]:
# Per-column summary: name, non-null count and dtype
pc3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1563 entries, 0 to 1562
Data columns (total 38 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   LOC_BLANK                        1563 non-null   int64  
 1   BRANCH_COUNT                     1563 non-null   int64  
 2   CALL_PAIRS                       1563 non-null   int64  
 3   LOC_CODE_AND_COMMENT             1563 non-null   int64  
 4   LOC_COMMENTS                     1563 non-null   int64  
 5   CONDITION_COUNT                  1563 non-null   int64  
 6   CYCLOMATIC_COMPLEXITY            1563 non-null   int64  
 7   CYCLOMATIC_DENSITY               1563 non-null   float64
 8   DECISION_COUNT                   1563 non-null   int64  
 9   DECISION_DENSITY                 1563 non-null   float64
 10  DESIGN_COMPLEXITY                1563 non-null   int64  
 11  DESIGN_DENSITY                   1563 non-null   float64
 12  EDGE_COUNT          

In [75]:
# Every column except the last one -- presumably the feature columns (TODO confirm)
X = pc3.iloc[:,:-1]

In [76]:
# Last column only -- presumably the class label (TODO confirm)
Y = pc3.iloc[:,-1]

In [77]:
# Label encoding: map the distinct values of Y to integer codes
from sklearn.preprocessing import LabelEncoder
labelencoder_X_1 = LabelEncoder()
# NOTE: the encoded result is lowercase `y`; uppercase `Y` stays the raw column
y = labelencoder_X_1.fit_transform(Y)

In [78]:
# Check the shape of the encoded labels (expected: 1-D, one entry per row of pc3)
y.shape

(1563,)

In [79]:
# Standardize every column of X (zero mean, unit variance) before running PCA
from sklearn.preprocessing import StandardScaler
x_std = StandardScaler().fit_transform(X)

In [80]:
# Create an instance of the PCA class defined above, keeping 6 components
pca = PCA(6)

In [81]:
# Fit on the standardized data, then project that same data onto the kept components
pca.fit(x_std)
x_tranformData = pca.transform(x_std)

In [82]:
# Check the number of eigenvalues: the covariance matrix is 37x37 (one
# row/column per column of X), so there are 37 eigenvalues
pca.eigenValues.size

37

In [84]:
# Express each eigenvalue as a percentage of the sum of all eigenvalues,
# i.e. the share of total variance along that direction
ab =  (pca.eigenValues / pca.eigenValues.sum()) * 100

In [88]:
# Show the percentages in descending order: sort the negated values ascending,
# then take the absolute value to undo the negation.
# NOTE: np.absolute also turns any slightly negative entries (round-off around
# zero) positive after the sort, so the tail of the result need not be monotonic.
np.absolute(np.sort(-ab))

array([4.97302118e+01, 1.06470139e+01, 8.32875252e+00, 5.07270543e+00,
       4.56665098e+00, 4.07323744e+00, 3.41866161e+00, 2.25926502e+00,
       2.10203000e+00, 1.80567008e+00, 1.54888988e+00, 1.34408586e+00,
       1.13255287e+00, 8.34253093e-01, 6.46610201e-01, 5.22135292e-01,
       4.94543202e-01, 4.14539851e-01, 3.38659209e-01, 2.28329102e-01,
       1.52102833e-01, 1.50361201e-01, 6.72959596e-02, 4.22137163e-02,
       2.71461433e-02, 1.53591646e-02, 1.19692663e-02, 9.76999727e-03,
       8.29902601e-03, 5.94315877e-03, 7.32067946e-04, 1.01197425e-05,
       3.25074712e-14, 1.45990476e-15, 5.42057596e-17, 5.20943279e-16,
       8.83438558e-16])