In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [17]:
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Parameters
    ----------
    n_components : int
        Number of leading principal axes to keep.

    Attributes (populated by ``fit``)
    ---------------------------------
    components : ndarray, shape (n_components, n_features)
        Principal axes, one per row, ordered by decreasing eigenvalue.
    mean : ndarray, shape (n_features,)
        Per-feature mean of the data passed to ``fit``.
    eigenValues : ndarray, shape (n_features,)
        All eigenvalues of the covariance matrix, sorted in descending order.
    cov_variance : ndarray, shape (n_features, n_features)
        Sample covariance matrix of the centered training data.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.eigenValues = None
        self.cov_variance = None

    def fit(self, X):
        """Estimate the principal axes of ``X``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data; rows are samples, columns are features.
        """
        X = np.asarray(X, dtype=float)
        self.mean = np.mean(X, axis=0)
        centered = X - self.mean

        # np.cov expects variables in rows, hence the transpose. atleast_2d
        # keeps the single-feature case (where np.cov returns a 0-d array)
        # usable by the eigen-solver.
        cov = np.atleast_2d(np.cov(centered.T))
        self.cov_variance = cov

        # The covariance matrix is symmetric, so eigh is the appropriate
        # solver: it guarantees real eigenvalues and orthonormal eigenvectors.
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # Order by decreasing eigenvalue. Eigenvectors are the COLUMNS of the
        # returned matrix, so the permutation must be applied to the columns.
        order = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order]

        self.eigenValues = eigenvalues

        # Keep the first n_components eigenvectors, stored one per row.
        self.components = eigenvectors[:, :self.n_components].T

    def transform(self, X):
        """Project ``X`` onto the principal axes found by ``fit``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        Returns
        -------
        ndarray, shape (n_samples, n_components)
            Coordinates of the centered samples along each kept component.
        """
        X = np.asarray(X, dtype=float) - self.mean
        return np.dot(X, self.components.T)


# Dataset Implementation

In [18]:
# Load the JM1 dataset from Dataset/jm1.csv into a DataFrame.
# NOTE(review): the original comment said "KC1", but the file read here is jm1.csv.
jm1 = pd.read_csv("Dataset/jm1.csv")

In [19]:
# Show column names, non-null counts and dtypes of the freshly loaded frame.
jm1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10885 entries, 0 to 10884
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   loc                10885 non-null  float64
 1   v(g)               10885 non-null  float64
 2   ev(g)              10885 non-null  float64
 3   iv(g)              10885 non-null  float64
 4   n                  10885 non-null  float64
 5   v                  10885 non-null  float64
 6   l                  10885 non-null  float64
 7   d                  10885 non-null  float64
 8   i                  10885 non-null  float64
 9   e                  10885 non-null  float64
 10  b                  10885 non-null  float64
 11  t                  10885 non-null  float64
 12  lOCode             10885 non-null  int64  
 13  lOComment          10885 non-null  int64  
 14  lOBlank            10885 non-null  int64  
 15  locCodeAndComment  10885 non-null  int64  
 16  uniq_Op            108

In [20]:
# Collect the index labels of rows whose 'uniq_Opnd' value is the "?"
# placeholder, then remove those rows from jm1 in place.
indexNames = jm1.loc[jm1['uniq_Opnd'] == "?"].index
jm1.drop(index=indexNames, inplace=True)

In [21]:
# Collect the index labels of rows whose 'total_Op' value is the "?"
# placeholder, then remove those rows from jm1 in place.
indexNames = jm1.loc[jm1['total_Op'] == "?"].index
jm1.drop(index=indexNames, inplace=True)

In [22]:
# Collect the index labels of rows whose 'total_Opnd' value is the "?"
# placeholder, then remove those rows from jm1 in place.
indexNames = jm1.loc[jm1['total_Opnd'] == "?"].index
jm1.drop(index=indexNames, inplace=True)

In [23]:
# Collect the index labels of rows whose 'branchCount' value is the "?"
# placeholder, then remove those rows from jm1 in place.
indexNames = jm1.loc[jm1['branchCount'] == "?"].index
jm1.drop(index=indexNames, inplace=True)

In [24]:
# Confirm how many rows remain after dropping the "?" rows.
jm1.shape

(10880, 22)

In [25]:
# Convert branchCount to float, going through str first.
jm1["branchCount"] = jm1["branchCount"].astype(str).astype(float)

In [26]:
# Convert the operator/operand count columns to float, one column at a time,
# going through str first (same conversion as used for branchCount).
for col in ("uniq_Op", "uniq_Opnd", "total_Op", "total_Opnd"):
    jm1[col] = jm1[col].astype(str).astype(float)

In [27]:
# Re-inspect dtypes and non-null counts after the row drops and float casts.
jm1.info()    

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10880 entries, 0 to 10884
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   loc                10880 non-null  float64
 1   v(g)               10880 non-null  float64
 2   ev(g)              10880 non-null  float64
 3   iv(g)              10880 non-null  float64
 4   n                  10880 non-null  float64
 5   v                  10880 non-null  float64
 6   l                  10880 non-null  float64
 7   d                  10880 non-null  float64
 8   i                  10880 non-null  float64
 9   e                  10880 non-null  float64
 10  b                  10880 non-null  float64
 11  t                  10880 non-null  float64
 12  lOCode             10880 non-null  int64  
 13  lOComment          10880 non-null  int64  
 14  lOBlank            10880 non-null  int64  
 15  locCodeAndComment  10880 non-null  int64  
 16  uniq_Op            108

In [28]:
# Feature matrix: every column of jm1 except the last one.
X = jm1.iloc[:,:-1]

In [29]:
# Target: the last column of jm1.
Y = jm1.iloc[:,-1]

In [30]:
# Label encoding: fit a LabelEncoder on the target column Y and store the
# encoded labels in y.
from sklearn.preprocessing import LabelEncoder
labelencoder_X_1 = LabelEncoder()
y = labelencoder_X_1.fit_transform(Y)

In [31]:
# Check the shape of the encoded target array.
y.shape

(10880,)

In [39]:
# Standardize the feature matrix with StandardScaler before running PCA.
from sklearn.preprocessing import StandardScaler
x_std = StandardScaler().fit_transform(X)

In [41]:
# Create an instance of the PCA class defined above, keeping 8 components.
pca = PCA(8)

In [44]:
# Fit the PCA on the standardized features, then project the same data onto
# the stored components.
pca.fit(x_std)
x_tranformData = pca.transform(x_std)

In [45]:
# Check the number of eigenvalues: the covariance matrix is 21 x 21,
# so 21 eigenvalues are expected.
pca.eigenValues.size

21

In [46]:
# Share of the total variance attributed to each eigenvalue
# (each eigenvalue divided by the sum of all eigenvalues).
ab =  pca.eigenValues / pca.eigenValues.sum()

In [47]:
# Print the variance shares rounded to 5 decimals; the first two components
# are expected to capture the largest share of the variance.
print(ab.round(5))

[6.4659e-01 7.7950e-02 6.2360e-02 4.5860e-02 3.9520e-02 3.5870e-02
 2.7970e-02 1.9630e-02 1.3030e-02 1.0070e-02 7.6600e-03 5.7800e-03
 2.8900e-03 1.7000e-03 1.7600e-03 7.2000e-04 4.1000e-04 1.9000e-04
 1.0000e-05 1.0000e-05 0.0000e+00]


In [52]:
# Grab the covariance matrix stored by pca.fit for inspection.
a = pca.cov_variance

In [53]:
# Total number of entries in the covariance matrix.
np.size(a)

441