In [95]:
import numpy as np
from scipy import linalg as LA

def project_dataset(X, k):
    """Project a dataset onto its first ``k`` principal directions, printing every step.

    The principal directions are taken as the left singular vectors of the
    sample covariance matrix of ``X`` (computed with ``scipy.linalg.svd``).
    Each intermediate quantity is printed so the procedure can be followed
    in the notebook output.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n)
        Data matrix with one sample per row and one feature per column.
    k : int
        Number of leading principal directions to keep.

    Returns
    -------
    U_redux : numpy.ndarray
        The first ``k`` columns of ``U``.
    Z : numpy.ndarray
        ``U_redux.T @ X.T``: one column per sample, one row per kept
        direction. It is printed transposed (one row per sample).
    """
    print('**************')

    # Center the data: subtract the per-feature (column) mean.
    mu = np.mean(X, axis = 0)
    print('mu: ' + str(mu))
    centered_X = X - mu
    print('centered data:\n ' + str(centered_X))

    # np.cov squeezes its result, so a single-feature dataset would yield a
    # 0-d array that svd rejects; atleast_2d restores the (n, n) matrix shape.
    Sigma = np.atleast_2d(np.cov(centered_X, rowvar = False))

    print('***** SVD decomposition ****')

    # scipy returns the right singular vectors already transposed, so the
    # array printed as "V" below holds them as rows.
    U, S, V = LA.svd(Sigma)
    print('U:\n' + str(U))
    print('S:\n' + str(S))
    print('V:\n' + str(V))

    # Verify that the eigenvectors are indeed unit vectors. One norm is
    # printed per column of U, whatever the number of features is.
    print('Norms of eigenvectors (columns of U):')
    print(*(np.linalg.norm(U[:, j]) for j in range(U.shape[1])))

    print('Percentages of variation:\n', str(S/np.sum(S) * 100))

    # To reduce dimensionality to k, keep only the first k columns of U.
    U_redux = U[:, 0:k]
    print('U_redux:\n', str(U_redux))

    # Z is the matrix of projected points in k-dimensional space.
    # NOTE(review): the projection uses the original X rather than centered_X,
    # so Z is not mean-centered. This is kept as-is because reconstruct_data
    # multiplies U_redux back in without adding mu.
    Z = np.matmul(U_redux.T,X.T)
    print('Projected dataset (k=' + str(k) + '):\n' + str(Z.T))

    return U_redux, Z

def reconstruct_data(U_redux, Z):
    """Print the dataset rebuilt from its projection.

    Multiplies ``U_redux`` by ``Z`` and prints the transposed product, so
    each printed row corresponds to one column of ``Z``. Nothing is returned.
    """
    rebuilt = np.matmul(U_redux, Z)
    rows = rebuilt.T
    print(f'Reconstructed dataset:\n{rows}')

In [78]:
# Four samples (rows) by three features (columns). The second column is on a
# far larger scale than the others, so it dominates the covariance matrix.
X = np.array([
    [0.387,  4878.0, 5.42],
    [0.723, 12104.0, 5.25],
    [1.000, 12756.0, 5.52],
    [1.524,  6787.0, 3.94],
])

# Keep the two leading directions; the returned pair is the cell's output.
project_dataset(X, k=2)

**************
mu: [9.08500e-01 9.13125e+03 5.03250e+00]
centered data:
 [[-5.21500e-01 -4.25325e+03  3.87500e-01]
 [-1.85500e-01  2.97275e+03  2.17500e-01]
 [ 9.15000e-02  3.62475e+03  4.87500e-01]
 [ 6.15500e-01 -2.34425e+03 -1.09250e+00]]
***** SVD decomposition ****
U:
[[-1.21901390e-05  5.66460727e-01  8.24088736e-01]
 [-9.99999997e-01  5.32639789e-05 -5.14047691e-05]
 [-7.30130279e-05 -8.24088734e-01  5.66460725e-01]]
S:
[1.51872330e+07 6.70619604e-01 2.02485956e-02]
V:
[[-1.21901390e-05 -9.99999997e-01 -7.30130279e-05]
 [ 5.66460727e-01  5.32639789e-05 -8.24088734e-01]
 [ 8.24088736e-01 -5.14047691e-05  5.66460725e-01]]
Norms of eigenvectors (columns of U):
0.9999999999999998 1.0 0.9999999999999997
Percentages of variation:
 [9.99999955e+01 4.41567976e-06 1.33326424e-07]
U_redux:
 [[-1.21901390e-05  5.66460727e-01]
 [-9.99999997e-01  5.32639789e-05]
 [-7.30130279e-05 -8.24088734e-01]]
Projected dataset:
[[-4.87800039e+03 -3.98751895e+00]
 [-1.21040004e+04 -3.27220755e+00]
 [-1.2

(array([[-1.21901390e-05,  5.66460727e-01],
        [-9.99999997e-01,  5.32639789e-05],
        [-7.30130279e-05, -8.24088734e-01]]),
 array([[-4.87800039e+03, -1.21040004e+04, -1.27560004e+04,
         -6.78700029e+03],
        [-3.98751895e+00, -3.27220755e+00, -3.30307377e+00,
         -2.02212084e+00]]))

In [94]:
# Four samples (rows) by three features (columns).
X = np.array([
    [1.0,  -2.0, -0.05],
    [2.0,   4.0,  0.05],
    [4.0,  -8.0, -0.10],
    [8.0,  16.0,  0.10],
])

# Project onto the two leading directions, then print the reconstruction.
U_redux, Z = project_dataset(X, 2)
reconstruct_data(U_redux, Z)

**************
mu: [3.75 2.5  0.  ]
centered data:
 [[ -2.75  -4.5   -0.05]
 [ -1.75   1.5    0.05]
 [  0.25 -10.5   -0.1 ]
 [  4.25  13.5    0.1 ]]
***** SVD decomposition ****
U:
[[-0.21011096  0.97765515  0.00661783]
 [-0.97764219 -0.21004099 -0.00992594]
 [-0.00831413 -0.00855542  0.99992884]]
S:
[1.09628348e+02 4.96292646e+00 3.92101053e-04]
V:
[[-0.21011096 -0.97764219 -0.00831413]
 [ 0.97765515 -0.21004099 -0.00855542]
 [ 0.00661783 -0.00992594  0.99992884]]
Norms of eigenvectors (columns of U):
1.0000000000000002 1.0 0.9999999999999999
Percentages of variation:
 [9.56686915e+01 4.33096630e+00 3.42172398e-04]
U_redux:
 [[-0.21011096  0.97765515]
 [-0.97764219 -0.21004099]
 [-0.00831413 -0.00855542]]
Projected dataset (k=2):
[[  1.74558913   1.3981649 ]
 [ -4.33120641   1.11471857]
 [  6.98152512   5.59180404]
 [-17.32399421   4.45972983]]
Reconstructed dataset
: [[ 1.0001557  -2.00023352 -0.02647493]
 [ 1.99984429  4.00023354  0.02647331]
 [ 3.99996105 -7.99994158 -0.1058855 ]
 

What do you expect the projected dataset to be for the data matrix `X1` defined below? Explain the results for k=3 and k=1, and compare them with the results obtained in the cell above.

In [85]:
# Second column is twice the first and the third column is constant.
X1 = np.array([
    [1.0,  2.0, -0.05],
    [2.0,  4.0, -0.05],
    [4.0,  8.0, -0.05],
    [8.0, 16.0, -0.05],
])

# Full-dimensional projection first, then the one-dimensional one; the last
# call's return value is what the cell displays.
project_dataset(X1, 3)
project_dataset(X1, 1)

**************
mu: [ 3.75  7.5  -0.05]
centered data:
 [[-2.75 -5.5   0.  ]
 [-1.75 -3.5   0.  ]
 [ 0.25  0.5   0.  ]
 [ 4.25  8.5   0.  ]]
***** SVD decomposition ****
U:
[[-0.4472136  -0.89442719  0.        ]
 [-0.89442719  0.4472136   0.        ]
 [ 0.          0.          1.        ]]
S:
[4.79166667e+01 6.35528743e-15 0.00000000e+00]
V:
[[-0.4472136  -0.89442719 -0.        ]
 [-0.89442719  0.4472136   0.        ]
 [ 0.          0.          1.        ]]
Norms of eigenvectors (columns of U):
0.9999999999999998 0.9999999999999999 1.0
Percentages of variation:
 [1.00000000e+02 1.32632086e-14 0.00000000e+00]
U_redux:
 [[-0.4472136  -0.89442719  0.        ]
 [-0.89442719  0.4472136   0.        ]
 [ 0.          0.          1.        ]]
Projected dataset (k=3):
[[-2.23606798e+00 -1.11022302e-16 -5.00000000e-02]
 [-4.47213595e+00 -2.22044605e-16 -5.00000000e-02]
 [-8.94427191e+00 -4.44089210e-16 -5.00000000e-02]
 [-1.78885438e+01 -8.88178420e-16 -5.00000000e-02]]
**************
mu: [ 3.75  

(array([[-0.4472136 ],
        [-0.89442719],
        [ 0.        ]]),
 array([[ -2.23606798,  -4.47213595,  -8.94427191, -17.88854382]]))