In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [57]:
class PCA():
    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.eigenValues = None
    
    
    def fit(self, X):
        self.mean = np.mean(X, axis = 0)
        X = X - self.mean
        
        # co-varinace
        
        cov = np.cov(X.T)
        
        # eigen valuee, eigenvectors
        eigenvalues, eigenvectors = np.linalg.eig(cov)
        self.eigenValues = eigenvalues
        eigenvectors = eigenvectors.T
        
        idxs = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idxs]
        
        eigenvectors = eigenvectors[idxs]
        
        
        
        # store first n eigen vector
        self.components = eigenvectors[0:self.n_components]
    
    def transform(self, X):
        X = X - self.mean
        return np.dot(X, self.components.T)


# Dataset Implementation

In [60]:
# upload KC1 Dataset
kc1 = pd.read_csv("Dataset/kc1.csv")

In [61]:
kc1.shape

(2109, 22)

In [62]:
X = kc1.iloc[:,:-1]

In [63]:
Y = kc1.iloc[:,-1]

In [64]:
# Label encoding
from sklearn.preprocessing import LabelEncoder
labelencoder_X_1 = LabelEncoder()
y = labelencoder_X_1.fit_transform(Y)

In [65]:
#checking shape
Y.shape

(2109,)

In [72]:
# create instance of PCA class
pca = PCA(6)

In [73]:
pca.fit(X)
x_tranformData = pca.transform(X)

In [74]:
# check eigen values.. CO-Varaince (21/21) ...So 21 eigen values
pca.eigenValues.size

21

In [75]:
# Check the Eigen value variance
ab =  pca.eigenValues / pca.eigenValues.sum()

In [76]:
# First two components capture the maximum variance
print(ab.round(5))

[9.9988e-01 1.2000e-04 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00]


In [77]:
#x1 = x_tranformData[:, 0]
#x2 = x_tranformData[:, 1]
#plt.scatter(x1,x2,
#            c = y, edgecolor = 'none', alpha = 0.8,
#            cmap = plt.cm.get_cmap('viridis', 3))
#plt.xlabel("Principal component 1",)
#plt.ylabel("Principal component 2",)
#plt.show()

In [79]:
label = pd.DataFrame(data=y, columns=["target"])
df = pd.DataFrame(data=x_tranformData, columns=["x1", "x2","x3", "x4","x5", "x6"])

In [81]:
result = pd.concat([df, label], axis=1, sort=False)

In [84]:
result.to_csv('kc1_components.csv')

In [86]:
dataframe = pd.read_csv("kc1_components.csv")
dataframe.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,target
0,-5254.504658,-118.378089,1.831236,9.168051,-0.030717,0.510175,0
1,-5254.833514,-118.881833,2.754197,9.591833,-0.14812,0.680676,1
2,16174.14574,235.323286,-34.579359,2.084614,-2.428164,-0.081671,1
3,6216.253323,350.829399,8.319157,0.330956,-8.042878,0.434046,1
4,-2863.73595,80.624963,-7.631032,-6.89725,-4.618032,5.478207,1
