<a href="https://colab.research.google.com/github/EdWu-datascience/natural-language-proprecessing/blob/main/inside_PCA_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np

In [6]:
def compute_pca(X, n_components=2, *, verbose=True):
    """Project the rows of X onto their leading principal components.

    Input:
        X: array of dimension (m, n) where each row is one observation
           (e.g. a word vector) and each column is one feature.
        n_components: number of principal components to keep. It is used
           as a slice bound, so a value larger than n keeps all n components.
        verbose: when True (the default, which keeps the historical
           behaviour) print the covariance matrix and its shape.
    Output:
        X_reduced: array of dimension (m, n_components) holding the
           mean-centered data expressed in the basis of the eigenvectors
           with the largest eigenvalues, ordered from largest to smallest.
           The sign of each column is arbitrary, as eigenvectors are only
           defined up to sign.
    """
    # Mean-center every feature (column).
    X_demeaned = X - np.mean(X, axis=0)

    # rowvar=False: each column is a variable and each row an observation
    # (the NumPy default is the opposite).
    # np.cov squeezes its result, so a single-feature input yields a 0-d
    # array that np.linalg.eigh rejects; atleast_2d restores the (1, 1) shape.
    covariance_matrix = np.atleast_2d(np.cov(X_demeaned, rowvar=False))
    if verbose:
        print('covariance_matrix shape is:', covariance_matrix.shape)
        print(covariance_matrix)

    # The covariance matrix is symmetric, so eigh is the appropriate solver.
    eigen_vals, eigen_vecs = np.linalg.eigh(covariance_matrix)

    # Indices of the eigenvalues from largest to smallest.
    idx_sorted_decreasing = np.argsort(eigen_vals)[::-1]

    # Eigenvectors are the COLUMNS of eigen_vecs: reorder them to match the
    # sorted eigenvalues, then keep the first n_components of them.
    eigen_vecs_sorted = eigen_vecs[:, idx_sorted_decreasing]
    eigen_vecs_subset = eigen_vecs_sorted[:, 0:n_components]

    # (m, n) @ (n, n_components) -> (m, n_components)
    X_reduced = np.dot(X_demeaned, eigen_vecs_subset)

    return X_reduced

In [7]:
# Fancy-indexing demo: indexing the column axis with a 2-D integer array
# reorders the columns and the result picks up the index array's own shape.
x = np.array([[0, 3],
              [2, 2]])
print('x is ', x)
y = np.array([[1, 0]])  # column order to select, as a (1, 2) array
print('y is :', y)
print('sorted is :', x[:, y])

x is  [[0 3]
 [2 2]]
y is : [[1 0]]
sorted is : [[[3 0]]

 [[2 2]]]


In [8]:
# np.cov demo: after the transpose x has 2 rows and 3 columns. With
# rowvar=False every column is treated as a variable, so the result is 3x3.
x = np.transpose(np.array([[0, 2], [1, 1], [2, 0]]))
print(x)
print(np.cov(x, rowvar=False))

[[0 1 2]
 [2 1 0]]
[[ 2.  0. -2.]
 [ 0.  0.  0.]
 [-2.  0.  2.]]


In [9]:
# np.mean demo: axis=0 averages down the rows, giving one mean per column.
x = np.array([[0, 4],
              [0, 2]])
print(x)
print(x.mean(axis=0))

[[0 4]
 [0 2]]
[0. 3.]


In [10]:
# Try compute_pca on a small reproducible random matrix:
# 3 observations with 10 features each, reduced to 2 components.
np.random.seed(1)  # fixed seed so the run is repeatable
X = np.random.rand(3, 10)
X_reduced = compute_pca(X, n_components=2)
print(f"Your original matrix was {X.shape} and it became:")
print(X_reduced)
print(X_reduced.shape)

covariance_matrix shape is: (10, 10)
[[ 4.88046950e-02  3.38429177e-02  2.70410354e-02  1.33348089e-02
   1.00609001e-01  6.57707762e-02 -2.75184938e-02 -5.25695002e-02
  -1.27339493e-02  6.48227388e-02]
 [ 3.38429177e-02  2.38029956e-02  1.68919133e-02  3.98205302e-03
   7.08994547e-02  4.03429340e-02 -2.12082910e-02 -3.84257778e-02
  -6.48868839e-03  4.80954113e-02]
 [ 2.70410354e-02  1.68919133e-02  2.52986469e-02  3.65993232e-02
   4.94545211e-02  6.56528268e-02 -3.45131361e-03 -1.81844337e-02
  -2.00468908e-02  1.84664070e-02]
 [ 1.33348089e-02  3.98205302e-03  3.65993232e-02  8.63566931e-02
   9.67985927e-03  1.00685091e-01  2.58818405e-02  1.66212907e-02
  -4.02656116e-02 -3.16988513e-02]
 [ 1.00609001e-01  7.08994547e-02  4.94545211e-02  9.67985927e-03
   2.11236189e-01  1.17774282e-01 -6.39199532e-02 -1.15041459e-01
  -1.83299257e-02  1.44268308e-01]
 [ 6.57707762e-02  4.03429340e-02  6.56528268e-02  1.00685091e-01
   1.17774282e-01  1.71350909e-01 -3.68356893e-03 -3.98590657e