# PCA project main hub.
## Will contain both the first and second task plus a little insight into the data.

* Since the data is given as a txt file with it just being numbers there isn't much insight that there can be gotten

Moving straight to task 1:
>We are required to perform PCA on a given dataset.
>
> Given the variable **X**, which is a shape of (n, d)
>>   n -> number of data points
>>
>> d -> number of dimensions in each point.


> The other variable is **var**
>>  the fraction of variance the PCA transformation should maintain

* Returns the matrix **W**, that maintains the **var** fraction

W is a numpy.ndarray of shape (d, nd) where:

    ->nd is the new dimensionality of the transformed X

In [46]:
# Task 0 function
#!/usr/bin/env python3
"""
Performs PCA on a given dataset
"""


import numpy as np


def pca(X, var=0.95):
    """
    Performs PCA on a given dataset
    """
    # computing the covariance matrix of X,
    # rowvar=False ensures that columns represent features (dimensions).
    covariance_matrix = np.cov(X, rowvar=False)
    
    # Performing eigenvalue decomposition,
    # np.linalg.eigh is used for eigenvalue decomposition of symmetric matrices like the covariance matrix. 
    eigen_value, eigen_vector = np.linalg.eigh(covariance_matrix)
    
    # Sort eigenvalues in desc order
    sorted_indices = np.argsort(eigen_value)[: : -1]
    sorted_eigen_value = eigen_value[sorted_indices]
    sorted_eigen_vector = eigen_vector[:, sorted_indices]
    
    # compute the cumulative value of the sorted eigenvalues
    cumulative_variance = np.cumsum(sorted_eigen_value) / np.sum(sorted_eigen_value)
    
    # select minimum number of components to maintain the desired variance
    n_componets = np.argmax(cumulative_variance >= var) + 2
    n_componets = abs(n_componets)
    
    # Make the w matrix with the correct values
    W = sorted_eigen_vector[:, :n_componets]
    
    return W
    

In [47]:
# main func for task 1

np.random.seed(0)
a = np.random.normal(size=50)
b = np.random.normal(size=50)
c = np.random.normal(size=50)
d = 2 * a
e = -5 * b
f = 10 * c

X = np.array([a, b, c, d, e, f]).T
m = X.shape[0]
X_m = X - np.mean(X, axis=0)
W = pca(X_m)
T = np.matmul(X_m, W)
print(T)
X_t = np.matmul(T, W.T)
print(np.sum(np.square(X_m - X_t)) / m)

[[-16.71379391   3.25277063   3.21956297]
 [ 16.22654311  -0.7283969    0.88325252]
 [ 15.05945199   3.81948929   1.97153621]
 [ -7.69814111   5.49561088   4.34581561]
 [ 14.25075197   1.37060228   4.04817187]
 [-16.66888233  -3.77067823  -2.6264981 ]
 [  6.71765183   0.18115089   1.91719288]
 [ 10.20004065  -0.84380128  -0.44754302]
 [-16.93427229   1.72241573  -0.9006236 ]
 [-12.4100987    0.75431367   0.36518129]
 [-16.40464248   1.98431953  -0.34907508]
 [ -6.69439671   1.30624703   2.77438892]
 [ 10.84363895   4.99826372   1.36502623]
 [-17.2656016    7.29822621  -0.63226953]
 [  5.32413372  -0.54822516   0.79075935]
 [ -5.63240657   1.50278876   0.27590797]
 [ -7.63440366   7.72788006   2.58344477]
 [  4.3348786   -2.14969035  -0.61262033]
 [ -3.95417052   4.22254889   0.14601319]
 [ -6.59947069  -1.00867621  -2.29551761]
 [ -0.78942283  -4.15454151  -5.87117533]
 [ 13.62292856   0.40038586   1.36043631]
 [  0.03536684  -5.85950737   1.86196569]
 [-11.1841298    5.20313078  -2.37