In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import os
import time

In [8]:
# Load bilteral cortical thickness values
# For the participants used in trajectory modeling

X = [] # cortical thickness

for filename in os.listdir("ADNI1_tm/"):
    ct = np.loadtxt("ADNI1_tm/"+filename, delimiter='\n')
    X.append(ct)

In [30]:
# Test sklearn PCA implementation

n_features = 10
n_examples = 5
dummy_X = np.random.randn(n_examples, n_features)
pca = PCA(n_components=0.95, svd_solver='full')
pca.fit(dummy_X)
print(pca.components_.shape)
print(pca.explained_variance_)
#print(pca.explained_variance_ratio_)
#print(pca.singular_values_)
#print(pca.mean_)
#print(pca.n_components_)
print(pca.noise_variance_)

pc = pca.components_
dummy_X_reduced = dummy_X @ pc.T

print(dummy_X.shape)
print(dummy_X_reduced.shape)

(4, 10)
[7.27057816 2.83333354 2.28478342 0.75158022]
2.5768942655901333e-32
(5, 10)
(5, 4)


In [38]:
n_components = [50, 100, 200, 500, 1000]
X = np.array(X)
X_reduced = {}
tstart = time.time()
X_test = X[:, :1000]
for n in n_components:
    pca = PCA(n_components=n)
    pca.fit(X_test)
    pc = pca.components_
    X_reduced[n] = X_test @ pc.T
    print("Time required with {} features, {} examples and {} components: {}".format(
        X_test.shape[1], X_test.shape[0], n, time.time() - tstart))
    tstart = time.time()

Time required with 1000 features, 114 examples and 50 components: 0.02219676971435547
Time required with 1000 features, 114 examples and 100 components: 0.016184568405151367
Time required with 1000 features, 114 examples and 200 components: 0.016007423400878906
Time required with 1000 features, 114 examples and 500 components: 0.016399383544921875
Time required with 1000 features, 114 examples and 1000 components: 0.016114234924316406


In [41]:
X_test = X[:, :10000]
tstart = time.time()
for n in n_components:
    pca = PCA(n_components=n)
    pca.fit(X_test)
    pc = pca.components_
    X_reduced[n] = X_test @ pc.T
    print("Time required with {} features, {} examples and {} components: {}".format(
        X_test.shape[1], X_test.shape[0], n, time.time() - tstart))
    tstart = time.time()

Time required with 10000 features, 114 examples and 50 components: 0.19234442710876465
Time required with 10000 features, 114 examples and 100 components: 0.14414405822753906
Time required with 10000 features, 114 examples and 200 components: 0.14352917671203613
Time required with 10000 features, 114 examples and 500 components: 0.12734150886535645
Time required with 10000 features, 114 examples and 1000 components: 0.14240145683288574


In [42]:
tstart = time.time()
for n in n_components:
    pca = PCA(n_components=n)
    pca.fit(X)
    pc = pca.components_
    X_reduced[n] = X @ pc.T
    print("Time required with {} features, {} examples and {} components: {}".format(
        X.shape[1], X.shape[0], n, time.time() - tstart))
    tstart = time.time()

Time required with 81924 features, 114 examples and 50 components: 1.554361343383789
Time required with 81924 features, 114 examples and 100 components: 2.107666492462158
Time required with 81924 features, 114 examples and 200 components: 2.1994049549102783
Time required with 81924 features, 114 examples and 500 components: 2.172401189804077
Time required with 81924 features, 114 examples and 1000 components: 2.319671392440796


In [46]:
frac_variance = [0.8, 0.9, 0.95, 0.99]
tstart = time.time()
X_test = X[:, :1000]
for f in frac_variance:
    pca = PCA(n_components=f, svd_solver='full')
    pca.fit(X_test)
    pc = pca.components_
    X_reduced[n] = X_test @ pc.T
    print("Time required with {} features, {} examples, and {:.0f}% of the variance: {}".format(
        X_test.shape[1], X_test.shape[0], f*100, time.time() - tstart))
    print("Number of components with {} features and {:.0f}% of the variance: {}".format(
        X_test.shape[1], f*100, pca.n_components_))
    tstart = time.time()

Time required with 1000 features, 114 examples, and 80% of the variance: 0.019865036010742188
Number of components with 1000 features and 80% of the variance: 15
Time required with 1000 features, 114 examples, and 90% of the variance: 0.01799750328063965
Number of components with 1000 features and 90% of the variance: 30
Time required with 1000 features, 114 examples, and 95% of the variance: 0.018140554428100586
Number of components with 1000 features and 95% of the variance: 45
Time required with 1000 features, 114 examples, and 99% of the variance: 0.016649723052978516
Number of components with 1000 features and 99% of the variance: 78


In [47]:
tstart = time.time()
X_test = X[:, :10000]
for f in frac_variance:
    pca = PCA(n_components=f, svd_solver='full')
    pca.fit(X_test)
    pc = pca.components_
    X_reduced[n] = X_test @ pc.T
    print("Time required with {} features, {} examples, and {:.0f}% of the variance: {}".format(
        X_test.shape[1], X_test.shape[0], f*100, time.time() - tstart))
    print("Number of components with {} features and {:.0f}% of the variance: {}".format(
        X_test.shape[1], f*100, pca.n_components_))
    tstart = time.time()

Time required with 10000 features, 114 examples, and 80% of the variance: 0.14493727684020996
Number of components with 10000 features and 80% of the variance: 20
Time required with 10000 features, 114 examples, and 90% of the variance: 0.13264703750610352
Number of components with 10000 features and 90% of the variance: 36
Time required with 10000 features, 114 examples, and 95% of the variance: 0.13455867767333984
Number of components with 10000 features and 95% of the variance: 51
Time required with 10000 features, 114 examples, and 99% of the variance: 0.14271950721740723
Number of components with 10000 features and 99% of the variance: 83


In [54]:
# Note: the number of examples limits the number of components in PCA
# This means that you can only keep a very small number of features if you use the
# trajectory modeling subjects as a proxy for feature selection

frac_variance = [.99, .999, .9999, .99999999999]
tstart = time.time()
for f in frac_variance:
    pca = PCA(n_components=f, svd_solver='full')
    pca.fit(X)
    pc = pca.components_
    #X_reduced[n] = X @ pc.T
    print("Time required with {} features, {} examples, and {:.0f}% of the variance: {}".format(
        X.shape[1], X.shape[0], f*100, time.time() - tstart))
    print("Number of components with {} features and {:.0f}% of the variance: {}".format(
        X.shape[1], f*100, pca.n_components_))
    tstart = time.time()

Time required with 81924 features, 114 examples, and 99% of the variance: 2.120217800140381
Number of components with 81924 features and 99% of the variance: 92
Time required with 81924 features, 114 examples, and 100% of the variance: 2.044703960418701
Number of components with 81924 features and 100% of the variance: 109
Time required with 81924 features, 114 examples, and 100% of the variance: 2.0641725063323975
Number of components with 81924 features and 100% of the variance: 112
Time required with 81924 features, 114 examples, and 100% of the variance: 2.0781493186950684
Number of components with 81924 features and 100% of the variance: 112


In [55]:
# Save the components
# To transform new examples into the reduced feature space: X @ pc.T

pca = PCA(n_components=100)
pca.fit(X)
pc = pca.components_
np.save("pcs_from_trajectory_modeling_subjects.npy", pc)

In [6]:
data = [20, 36, 51, 83]
df = pd.DataFrame(data, index=[80, 90, 95, 99])
plt.figure(); df.plot()
plt.title("X = NxM \n N = 114 subjects \n M = 81924 features");
plt.xlabel('Variance explained (%)'); plt.ylabel('Number of components'); 
plt.savefig('var_n_components.png')