In [1]:
# NumPy is the only library needed for this implementation
import numpy as np

In [2]:
# Generate a dummy dataset: 20 samples x 5 integer features drawn from
# [10, 50). A seeded Generator is used so that re-running the notebook
# from a fresh kernel reproduces exactly the same data.
SEED = 42
rng = np.random.default_rng(SEED)
X = rng.integers(10, 50, size=(20, 5))
# Mean-center the data: subtract each column's (feature's) mean so every
# feature has zero mean before the covariance is computed.
X_meaned = X - np.mean(X, axis=0)

In [3]:
# Covariance matrix of the mean-centered data.
# np.cov treats each ROW as a variable by default, so the data is passed
# transposed: every column (feature) of X_meaned becomes one variable.
cov_mat = np.cov(X_meaned.T)

In [4]:
# Eigen-decomposition of cov_mat. eigh is the solver for symmetric
# matrices; UPLO="L" (its default) reads only the lower triangle.
eigen_val, eigen_vec = np.linalg.eigh(cov_mat, UPLO="L")

In [5]:
# Order the eigenpairs from largest to smallest eigenvalue.
# argsort yields the indices of an ascending sort; reversing them gives
# the descending order.
sorted_index = eigen_val.argsort()[::-1]
sorted_eigenval = eigen_val[sorted_index]
# Eigenvectors are the COLUMNS of eigen_vec, so reorder the columns to
# keep each vector paired with its eigenvalue.
sorted_eigenvec = eigen_vec[:, sorted_index]

In [6]:
# Select the first n eigenvectors, where n is the desired dimension of
# the reduced data. Any n up to the number of features can be chosen.

n_components = 2  # 2 is used here as an example
eigenvec_subset = sorted_eigenvec[:, :n_components]

In [7]:
# Show both shapes, one per line: the projection matrix first, then the data.
print(eigenvec_subset.shape, X_meaned.shape, sep="\n")

(5, 2)
(20, 5)


In [8]:
# Project the mean-centered data onto the selected eigenvectors.
# The shapes are already aligned: X_meaned is (20, 5) and eigenvec_subset
# is (5, 2), so a single matrix product yields the (20, 2) result, i.e.
# 20 examples described by 2 principal components. No transposes are
# needed; (V.T @ X.T).T is mathematically the same as X @ V.
X_reduced = X_meaned @ eigenvec_subset
print(X_reduced.shape)


(20, 2)


## PCA function definition (6 steps)
Now it's time to define a function, using only the NumPy library, that can be reused in real-life applications of dimensionality reduction.

The function takes two arguments as inputs, X and num_components. X is the data matrix and num_components is the number of principal components we want to retain for use in our machine learning algorithm.


In [9]:
# import numpy as np
 
def my_PCA(X , num_components):
    """Reduce the dimensionality of X with principal component analysis.

    The data is mean-centered, the covariance matrix of its columns is
    eigen-decomposed, and the centered data is projected onto the
    eigenvectors that belong to the largest eigenvalues.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data matrix; each row is a sample and each column a feature.
    num_components : int
        Number of principal components to keep. Values larger than
        n_features are clamped by the slice, so all components are returned.

    Returns
    -------
    numpy.ndarray of shape (n_samples, min(num_components, n_features))
        The centered data expressed in the retained principal components,
        ordered by decreasing eigenvalue. The sign of each column follows
        the sign of the eigenvector returned by the solver.

    Raises
    ------
    ValueError
        If X is not 2-D, has fewer than two samples, or num_components is
        negative.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    if X.shape[0] < 2:
        raise ValueError("X must contain at least two samples")
    # A negative count would be read as a "drop the last k" slice and
    # silently return the wrong number of components.
    if num_components < 0:
        raise ValueError("num_components must be non-negative")

    # Step 1: center every feature on zero.
    X_meaned = X - np.mean(X, axis=0)

    # Step 2: covariance between features (columns are the variables).
    # np.cov collapses the single-feature case to a 0-d array, which eigh
    # rejects, so the result is promoted back to a matrix.
    cov_mat = np.atleast_2d(np.cov(X_meaned, rowvar=False))

    # Step 3: eigen-decomposition of the symmetric covariance matrix.
    eigen_val, eigen_vec = np.linalg.eigh(cov_mat)

    # Step 4: reorder the eigenvectors (columns) by decreasing eigenvalue.
    sorted_index = np.argsort(eigen_val)[::-1]
    sorted_eigenvec = eigen_vec[:, sorted_index]

    # Step 5: keep the leading num_components eigenvectors.
    eigenvec_subset = sorted_eigenvec[:, :num_components]

    # Step 6: project; (n_samples, n_features) @ (n_features, k).
    return X_meaned @ eigenvec_subset