In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [28]:
class PCA():
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Parameters
    ----------
    n_components : int
        Number of leading principal components kept by ``fit``.

    Attributes
    ----------
    components : ndarray of shape (n_components, n_features), or None before ``fit``
        Unit-length principal directions, one per row, ordered by decreasing
        eigenvalue.
    mean : per-feature mean of the data passed to ``fit``, or None before ``fit``
    eigenValues : ndarray of shape (n_features,), or None before ``fit``
        All eigenvalues of the covariance matrix, sorted in descending order
        (the same ordering used to pick ``components``).
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.eigenValues = None

    def fit(self, X):
        """Estimate the mean, the eigenvalues and the leading components from X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; rows are samples, columns are features.
        """
        self.mean = np.mean(X, axis=0)
        centered = X - self.mean

        # np.cov treats each row as a variable, hence the transpose.
        # atleast_2d keeps the single-feature case valid, where np.cov
        # returns a 0-d array that the eigen solver would reject.
        cov = np.atleast_2d(np.cov(centered.T))

        # A covariance matrix is symmetric, so use the symmetric solver:
        # eigh always returns real eigenvalues/eigenvectors, whereas the
        # general-purpose eig can return complex values caused by round-off
        # on rank-deficient data. Eigenvectors are the COLUMNS of the result.
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # Rank by decreasing eigenvalue and keep eigenvalues and eigenvectors
        # aligned, so eigenValues[i] belongs to the i-th ranked direction.
        order = np.argsort(eigenvalues)[::-1]
        self.eigenValues = eigenvalues[order]

        # Store the first n_components eigenvectors, one per row.
        self.components = eigenvectors[:, order].T[:self.n_components]

    def transform(self, X):
        """Project X onto the components learned by ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_components)
            Coordinates of the centered samples along each retained component.
        """
        centered = X - self.mean
        return np.dot(centered, self.components.T)


# Dataset Implementation

In [29]:
# Load the PC4 dataset from the Dataset/ directory (path is relative to the
# notebook's working directory)
pc4 = pd.read_csv("Dataset/pc4.csv")

In [30]:
# (rows, columns) of the loaded frame
pc4.shape

(1458, 38)

In [31]:
# Preview the first rows to check the column names and value ranges
pc4.head()

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,c
0,17,11,5,2,8,20,6,0.25,10,2,...,25,0.11,53,49,23,12,57,31.25,24,False
1,2,9,3,0,1,16,5,0.56,6,2,...,14,0.36,13,24,7,14,14,10.0,9,False
2,2,5,1,1,1,6,3,0.17,2,3,...,7,0.13,16,28,9,14,23,10.53,18,False
3,4,5,1,0,0,8,3,0.3,4,2,...,10,0.19,13,16,10,9,16,0.0,10,False
4,7,5,1,3,0,0,3,0.15,0,0,...,10,0.11,26,46,7,7,28,15.0,20,False


In [8]:
# Features: every column except the last one
X = pc4.iloc[:,:-1]

In [13]:
# Target: the last column of the frame (the "c" column in the preview)
y = pc4.iloc[:,-1]

In [32]:
# Label encoding: replace the target values with integer class codes
from sklearn.preprocessing import LabelEncoder
# NOTE(review): despite the "_X_" in its name, this encoder is fitted on y
labelencoder_X_1 = LabelEncoder()
y = labelencoder_X_1.fit_transform(y)

In [33]:
# Check that the encoded target is one-dimensional, one entry per row
y.shape

(1458,)

In [34]:
# Create an instance of the PCA class defined in this notebook, keeping the
# first 6 principal components
pca = PCA(6)

In [35]:
# Fit on the feature frame, then project the same rows onto the retained
# components
pca.fit(X)
x_tranformData = pca.transform(X)

In [36]:
# Count the eigenvalues: the covariance matrix is square with one row/column
# per feature in X, so there is one eigenvalue per feature (37 here, not 21)
pca.eigenValues.size

37

In [37]:
# Share of the total variance carried by each eigenvalue:
# every eigenvalue divided by the sum of all of them
ab = pca.eigenValues / np.sum(pca.eigenValues)

In [23]:
# Print each eigenvalue's share of the total, rounded to 5 decimals.
# Nearly all of the variance falls on the first two components.
# NOTE(review): the features are not rescaled before PCA in this notebook, so
# the dominant component may mostly reflect the largest-magnitude feature — confirm.
print(ab.round(5))

[ 9.9987e-01  1.3000e-04  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00 -0.0000e+00
 -0.0000e+00]


In [39]:
# Wrap the encoded target in a single-column frame
label = pd.DataFrame(data=y, columns=["target"])
# Name the component columns x1..xk from the width of the projection instead of
# hard-coding six names, so changing the number of components passed to PCA(...)
# does not raise a shape-mismatch error here. For 6 components the names are
# still x1..x6.
df = pd.DataFrame(
    data=x_tranformData,
    columns=["x{}".format(i) for i in range(1, x_tranformData.shape[1] + 1)],
)

In [40]:
# Put the component columns and the target side by side (column-wise concat)
result = pd.concat(objs=[df, label], axis="columns", sort=False)

In [41]:
# Persist the components plus target. to_csv keeps its default index=True here,
# so the row index is also written, as an extra unnamed leading column.
result.to_csv('Pc4_components.csv')

In [43]:
# Read the saved file back as a sanity check. The "Unnamed: 0" column in the
# preview is the row index that to_csv wrote, not a feature.
dataframe = pd.read_csv("Pc4_components.csv")
dataframe.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,target
0,-12289.60328,-173.416956,26.819229,-18.972967,-7.558592,-2.600292,0
1,-17423.62225,116.710175,-15.565308,5.437656,-13.168089,6.341275,0
2,-17058.31323,84.755161,-12.9107,-1.177297,-0.321608,-3.553457,0
3,-18818.27333,134.42462,-25.44354,-2.736261,-8.966695,6.205931,0
4,-15968.73056,22.286403,1.256544,9.071922,6.486355,-6.806553,0
