In [8]:
import numpy as np

# PCA walk-through on synthetic data: centre the data, eigendecompose the
# sample covariance matrix, project onto the leading M principal directions,
# report the share of variance retained, and sample one new point from the
# reduced space.
#
# NOTE: no random seed is set in this cell, so X, alpha and every printed
# value differ from run to run.

# Generate some sample data (for illustration purposes)
# N = number of data points, D = dimensionality of each data point
N, D = 10, 5
X = np.random.randn(N, D)

# Step 1: Compute sample mean (one value per feature/column)
mean_x = np.mean(X, axis=0)

# Step 2: Compute sample covariance matrix (unbiased, divides by N - 1)
X_centered = X - mean_x
cov_matrix = np.dot(X_centered.T, X_centered) / (N - 1)

# Step 3: Compute eigenvalues and eigenvectors of the covariance matrix.
# The covariance matrix is real and symmetric, so use the symmetric solver:
# eigh always returns real eigenvalues and orthonormal eigenvectors, whereas
# the general-purpose eig gives no such guarantee.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Step 4: Reorder eigenvalues in decreasing order and sort the eigenvectors correspondingly
# (eigenvectors are stored as columns, hence the column indexing below)
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues_sorted = eigenvalues[sorted_indices]
eigenvectors_sorted = eigenvectors[:, sorted_indices]

# Step 5: Choose the number of components M < D (here M = 3)
M = 3
U = eigenvectors_sorted[:, :M]  # D x M matrix of the leading principal directions

# Step 6: Feature extraction (dimensionality reduction): N x M scores
Z = np.dot(X_centered, U)

# Step 7: Calculate the percentage of variance preserved
total_variance = np.sum(eigenvalues_sorted)   # Total variance in the original data
explained_variance = np.sum(eigenvalues_sorted[:M])  # Variance preserved by the first M components
explained_variance_ratio = explained_variance / total_variance * 100  # Percentage of variance preserved

# Step 8: Generate new data point from reduced space.
# Each coefficient is drawn uniformly within +/- 3 standard deviations of its
# component. Eigenvalues are clamped at zero first so that a tiny negative
# value caused by floating-point round-off cannot turn the bounds into NaN.
component_std = np.sqrt(np.clip(eigenvalues_sorted[:M], 0.0, None))
alpha = np.random.uniform(-3 * component_std, 3 * component_std, M)
new_data_point = mean_x + np.dot(alpha, U.T)

# Output results
print("Original Data (X):\n", X, "\n")
print("Sample Mean (mean_x):\n", mean_x, "\n")
print("Covariance Matrix (S):\n", cov_matrix, "\n")
print("Eigenvalues:\n", eigenvalues_sorted, "\n")
print("Eigenvectors:\n", eigenvectors_sorted, "\n")
print("Reduced Data (Z):\n", Z, "\n")
print("New Data Point (from reduced space):\n", new_data_point, "\n")
print(f"Variance preserved with {M} components: {explained_variance_ratio:.2f}%", "\n")


Original Data (X):
 [[ 1.44241303  0.01605636  0.7400918   1.26538527  2.6509331 ]
 [-0.84358992  0.16499439  1.70805476 -1.71301736  0.60731668]
 [ 0.5556354   0.65534744 -0.51510721 -0.28400893  0.70313397]
 [ 0.53348719  0.54123288  1.32823097 -0.34199885  0.36246119]
 [ 1.53303417 -0.16290554  0.66797168  0.05886438  1.17643078]
 [ 1.83148071 -1.65678309 -0.12492376  0.91810538 -1.59653488]
 [-0.04543081 -0.47274553 -1.5629633   2.56039238 -0.98091259]
 [-0.51122805 -1.26628722 -0.57555929  0.61416038  0.20320122]
 [-0.27804339 -2.11919249  0.69985153  1.11595715 -0.05237134]
 [-0.50162227 -0.00532582  0.60000483  0.23795441 -2.80148357]] 

Sample Mean (mean_x):
 [ 0.37161361 -0.43056086  0.2965652   0.44317942  0.02721745] 

Covariance Matrix (S):
 [[ 0.92238203  0.01261131 -0.04312913  0.24387876  0.44826003]
 [ 0.01261131  0.88727921  0.22571682 -0.51226587  0.4114122 ]
 [-0.04312913  0.22571682  0.96676928 -0.79095344  0.41351967]
 [ 0.24387876 -0.51226587 -0.79095344  1.316704