# In [2]:
# Covariance matrix from scratch

def covariance(x, y):
  """Return the sample covariance between two 1-D features.

  The result uses the unbiased estimator, i.e. the sum of the products of
  the deviations from each mean divided by ``n - 1``.

  Args:
    x: 1-D array-like holding the observations of the first feature.
    y: 1-D array-like holding the observations of the second feature;
      must have the same shape as ``x``.

  Returns:
    The sample covariance as a float. ``nan`` is returned when fewer than
    two observations are supplied, because the ``n - 1`` denominator is
    not positive in that case.

  Raises:
    ValueError: if ``x`` and ``y`` do not have the same shape.
  """
  x = np.asarray(x)
  y = np.asarray(y)
  if x.shape != y.shape:
    # A longer y used to be truncated silently while its mean was still
    # taken over every element, which produced a wrong number.
    raise ValueError(
        "x and y must have the same shape, got %s and %s" % (x.shape, y.shape))

  n_samples = len(x)
  if n_samples < 2:
    return float("nan")

  # Dot product of the centred vectors == sum of element-wise products.
  return np.dot(x - np.mean(x), y - np.mean(y)) / (n_samples - 1)

def cov_mat(X):
  """Return the sample covariance matrix of the columns of ``X``.

  Entry ``[i, j]`` is the unbiased (``n_samples - 1`` denominator) sample
  covariance between column ``i`` and column ``j``. The columns are
  centred once and the whole matrix is obtained from a single matrix
  product instead of one Python-level loop per pair of columns.

  Args:
    X: 2-D array-like of shape ``(n_samples, n_features)``; each row is
      an observation and each column a feature.

  Returns:
    A float ``numpy.ndarray`` of shape ``(n_features, n_features)``. It is
    filled with ``nan`` when fewer than two rows are supplied, because the
    ``n_samples - 1`` denominator is not positive in that case.
  """
  X = np.asarray(X, dtype=float)
  n_samples, n_features = X.shape[0], X.shape[1]
  if n_samples < 2:
    return np.full((n_features, n_features), np.nan)

  centered = X - X.mean(axis=0)
  # (centered.T @ centered)[i, j] == sum_k centered[k, i] * centered[k, j]
  return centered.T @ centered / (n_samples - 1)

# In [5]:
from scipy.linalg import eig # to calculate eigen values and eigen vectors

# In [6]:
class PCA:
  """Principal component analysis via eigen-decomposition of the covariance matrix.

  Attributes set by ``fit`` (all ``None`` until then):
    covariance_matrix: covariance matrix returned by ``cov_mat`` for the
      fitted data.
    principal_components: array whose rows are the ``n_dimensions``
      eigenvectors with the largest eigenvalues, in descending eigenvalue
      order.
    explained_variance: for each kept component, its eigenvalue divided by
      the sum of all eigenvalues (i.e. the explained variance *ratio*).
  """

  def __init__(self, n_dimensions):
    """Store the number of principal components to keep.

    Args:
      n_dimensions: how many leading components ``fit`` should retain.
    """
    self.n_dimensions = n_dimensions
    self.covariance_matrix = None
    self.principal_components = None
    self.explained_variance = None

  def fit(self, x):
    """Compute the principal components of ``x``.

    Args:
      x: data passed straight to ``cov_mat``; one column per feature.

    Raises:
      ValueError: if ``n_dimensions`` is not between 1 and the number of
        features (the size of the covariance matrix).
    """
    cov_matrix = cov_mat(x)
    self.covariance_matrix = cov_matrix

    n_features = cov_matrix.shape[0]
    if not 1 <= self.n_dimensions <= n_features:
      raise ValueError(
          "n_dimensions must be between 1 and %d, got %r"
          % (n_features, self.n_dimensions))

    # A covariance matrix is symmetric, so use the symmetric solver: it
    # returns real eigenvalues and orthonormal eigenvectors, whereas a
    # general solver yields complex-typed output for the same matrix.
    # NOTE: the sign of each eigenvector is arbitrary and may differ
    # between solvers.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Sort by descending eigenvalue; eigenvectors come back column-wise,
    # so reorder the columns and transpose to get one component per row.
    order = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order].T

    total_variance = np.sum(eigenvalues)
    self.principal_components = eigenvectors[:self.n_dimensions]
    self.explained_variance = eigenvalues[:self.n_dimensions] / total_variance

  def transform(self, x):
    """Project ``x`` onto the fitted principal components.

    ``x`` is projected as given: no mean is subtracted here, so callers
    that want centred scores must centre ``x`` themselves.

    Args:
      x: array-like whose last axis has one entry per feature.

    Returns:
      ``np.dot(x, principal_components.T)``, i.e. one column per kept
      component.

    Raises:
      RuntimeError: if called before ``fit``.
    """
    if self.principal_components is None:
      raise RuntimeError("PCA.transform() called before PCA.fit()")
    return np.dot(x, self.principal_components.T)