# Principle Component Analysis

In [37]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Step 1: Create some sample data (replace this with your dataset)
data = pd.read_csv('ex1data1.txt') # 100 samples with 3 features
np.set_printoptions(precision=4, suppress=True)
print("Formatted Array:")
print(data)

Formatted Array:
     6.1101    17.592
0    5.5277   9.13020
1    8.5186  13.66200
2    7.0032  11.85400
3    5.8598   6.82330
4    8.3829  11.88600
..      ...       ...
91   5.8707   7.20290
92   5.3054   1.98690
93   8.2934   0.14454
94  13.3940   9.05510
95   5.4369   0.61705

[96 rows x 2 columns]


In [41]:
# Step 2: Standardize the data
mean = np.mean(data, axis=0)

#std_dev = np.std(data, axis=0)
standardized_data = (data - mean)
print (mean)
print(standardized_data)

6.1101    8.181151
17.592    5.716709
dtype: float64
      6.1101    17.592
0  -2.653451  3.413491
1   0.337449  7.945291
2  -1.177951  6.137291
3  -2.321351  1.106591
4   0.201749  6.169291
..       ...       ...
91 -2.310451  1.486191
92 -2.875751 -3.729809
93  0.112249 -5.572169
94  5.212849  3.338391
95 -2.744251 -5.099659

[96 rows x 2 columns]


In [43]:
# Step 3: Compute the covariance matrix
covariance_matrix = np.cov(standardized_data,rowvar=False)
size_cc = covariance_matrix.size
shape_cc = covariance_matrix.shape
print(size_cc, shape_cc)
print(covariance_matrix)

4 (2, 2)
[[15.089  18.3112]
 [18.3112 29.2135]]


In [45]:
# Step 4: Compute the eigenvalues and eigenvectors of the covariance matrix
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
print(eigenvalues)
print(eigenvectors)

[ 2.5254 41.7771]
[[-0.8246  0.5658]
 [ 0.5658  0.8246]]


In [47]:
# Step 5: Sort eigenvalues and corresponding eigenvectors in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:,sorted_indices]
print(eigenvalues)
print(eigenvectors)

[41.7771  2.5254]
[[ 0.5658 -0.8246]
 [ 0.8246  0.5658]]


In [49]:
# Step 6: Choose the number of components (or a threshold for explained variance)
n_components = 1 # Choose the number of principal components

# Step 7: Select the top 'n_components' eigenvectors
selected_eigenvectors = eigenvectors[:, :n_components]

# Step 8: Project the data onto the selected eigenvectors to obtain the principalcomponents
final_result = np.dot(standardized_data, selected_eigenvectors)

# Step 9: Print the final result
print("Final Result afterPCA:")
print(final_result)

Final Result afterPCA:
[[ 1.3135]
 [ 6.7424]
 [ 4.3942]
 [-0.4008]
 [ 5.2012]
 [-1.5271]
 [ 5.4056]
 [-0.2317]
 [-3.3356]
 [-3.4298]
 [11.456 ]
 [-3.4967]
 [ 1.3729]
 [-5.5606]
 [-3.4023]
 [-1.3669]
 [-5.9776]
 [-2.6936]
 [-0.8982]
 [-3.2529]
 [20.074 ]
 [-2.7212]
 [-1.4859]
 [-3.6523]
 [20.0425]
 [ 9.0477]
 [ 2.6671]
 [10.2266]
 [23.1301]
 [-7.3768]
 [-0.6697]
 [ 5.8952]
 [-4.484 ]
 [ 0.698 ]
 [-1.0921]
 [-1.3678]
 [-3.373 ]
 [ 6.2619]
 [-1.2149]
 [-5.8245]
 [-2.2232]
 [ 1.7221]
 [-4.0634]
 [ 0.6354]
 [-4.4667]
 [-2.2408]
 [-4.5397]
 [ 3.8771]
 [-5.3679]
 [ 0.4897]
 [-4.8226]
 [-1.6061]
 [-0.9196]
 [-4.5859]
 [-6.9166]
 [-3.7514]
 [-2.3552]
 [-0.8078]
 [ 0.4704]
 [-0.0868]
 [-7.0246]
 [17.4801]
 [ 9.0313]
 [15.446 ]
 [-1.2304]
 [ 0.0871]
 [ 2.8601]
 [-5.3922]
 [19.4751]
 [ 1.9002]
 [-1.8732]
 [-4.8902]
 [-2.4993]
 [-8.7088]
 [-5.3932]
 [-1.8743]
 [-1.7917]
 [ 2.0382]
 [-4.756 ]
 [-5.7056]
 [-6.2386]
 [-5.187 ]
 [ 2.4045]
 [-1.2498]
 [-1.0263]
 [ 1.4569]
 [-5.1824]
 [-6.0938]
 [-4.1535