In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import pickle
import time
import os

In [7]:
# Generate new data, if needed

X_baseline = [] # baseline time point
for filename in sorted(os.listdir("ADNIMERGE/baseline/")):
    ct = np.loadtxt("ADNIMERGE/baseline/"+filename, delimiter='\n')
    X_baseline.append(ct)
    
X_followup = [] # other longitudinal time point
for filename in sorted(os.listdir("ADNIMERGE/followup/")):
    ct = np.loadtxt("ADNIMERGE/followup/"+filename, delimiter='\n')
    X_followup.append(ct)

print(len(X_baseline), len(X_followup))
np.save("X_baseline.npy", np.array(X_baseline))
np.save("X_followup.npy", np.array(X_followup))

1060 1060


In [9]:
# Load the data

df = pd.read_csv("Exp_502_602_combined.csv")
sub_list = df[df["STATUS"].isin(["OK", "PrevTP", "New"])]
X_baseline = np.load("X_baseline.npy")
X_followup = np.load("X_followup.npy")
X_diff = X_followup - X_baseline # vertex-wise CT change
n_subjects = sub_list.shape[0]
print(n_subjects)
print(X_baseline.shape)
print(X_followup.shape)

1060
(1060, 81924)
(1060, 81924)


In [14]:
# Generate the train-test splits (old script)

frac_train = 0.8
ncv = 10
splits = {"train": [], "test": []}

for i in range(ncv):
    indices = list(range(n_subjects))
    random.shuffle(indices)
    train_split = indices[:int(0.8*len(indices))]
    valid_split = indices[int(0.8*len(indices)):]
    splits["train"].append(train_split)
    splits["test"].append(valid_split)
    
splits = pd.DataFrame.from_dict(splits)
splits.to_pickle("train_test_splits.pkl")

In [10]:
# Generate the train-test splits (modified)
# Splits balanced with respect to trajectory classes and subgroups

ncv = 10
frac_test = 1/ncv
splits = {'MMSE': {'train': [], 'test': []},
          'ADAS13': {'train': [], 'test': []}}
indices = {'MMSE': {'T1': {'BE': [], 'FE': [], 'CC': []},
                    'T2': {'BE': [], 'FE': [], 'CC': []}},
           'ADAS13': {'T1': {'BE': [], 'FE': [], 'CC': []},
                      'T2': {'BE': [], 'FE': [], 'CC': []},
                      'T3': {'BE': [], 'FE': [], 'CC': []}}}

sub_list = sub_list.reset_index(drop=True)

# find the trajectory and subgroup of each index
for index, row in sub_list.iterrows():
    mmse_traj = row['MMSE_2c_traj']
    mmse_group = row['MMSE_gr']
    indices['MMSE']['T{}'.format(mmse_traj+1)][mmse_group].append(index)
    
    adas_traj = row['ADAS_3c_traj']
    adas_group = row['ADAS13_gr']
    indices['ADAS13']['T{}'.format(adas_traj+1)][adas_group].append(index)

splits_mmse = {'train': [], 'test': []}
splits_adas = {'train': [], 'test': []}
    
# partition into balanced train and test splits
for i in indices.keys():
    train_split = [[] for cv_iter in range(ncv)]
    test_split = [[] for cv_iter in range(ncv)]
    
    for j in indices[i].keys():
        for k in indices[i][j].keys():
            random.shuffle(indices[i][j][k])
            length = len(indices[i][j][k])
            
            for cv_iter in range(ncv):
                start = int(length*cv_iter*frac_test)
                end = int(length*(cv_iter+1)*frac_test)
                test_split[cv_iter].extend(indices[i][j][k][start:end])
                before = indices[i][j][k][:start]
                after = indices[i][j][k][end:]
                train_split[cv_iter].extend(before + after)
    
    for cv_iter in range(ncv):
        random.shuffle(train_split[cv_iter])
        random.shuffle(test_split[cv_iter])
    
    splits[i]['train'] = train_split
    splits[i]['test'] = test_split
        
splits = pd.DataFrame.from_dict(splits)
splits.to_pickle("train_test_splits.pkl")

In [12]:
# Reduce the data with PCA
from sklearn.decomposition import PCA

n_components = 78 # same number of components as AAL
splits = pd.read_pickle("train_test_splits.pkl")

for t in ['MMSE', 'ADAS13']:
    for i in range(ncv):
        print("Reducing data, {} trajectory, fold {}".format(t, i))
        train_split = splits[t]['train'][i]
        test_split = splits[t]['test'][i]
        X_train_baseline = X_baseline[train_split]
        X_test_baseline = X_baseline[test_split]
        X_train_followup = X_followup[train_split]
        X_test_followup = X_followup[test_split]
        X_train_diff = X_diff[train_split]

        tstart = time.time()
        pca = PCA(n_components=n_components)
        pca.fit(X_train_diff)
        X_bl_train_reduced = pca.transform(X_train_baseline)
        X_bl_test_reduced = pca.transform(X_test_baseline)
        X_vartp_train_reduced = pca.transform(X_train_followup)
        X_vartp_test_reduced = pca.transform(X_test_followup)
        print("Time required : {}".format(time.time() - tstart))
        np.save("data/PCA_bl_train_{}_cv{}.npy".format(t,i), X_bl_train_reduced)
        np.save("data/PCA_bl_test_{}_cv{}.npy".format(t,i), X_bl_test_reduced)
        np.save("data/PCA_vartp_train_{}_cv{}.npy".format(t,i), X_vartp_train_reduced)
        np.save("data/PCA_vartp_test_{}_cv{}.npy".format(t,i), X_vartp_test_reduced)

Reducing data, MMSE trajectory, fold 0
Time required : 13.250328302383423
Reducing data, MMSE trajectory, fold 1
Time required : 13.206482648849487
Reducing data, MMSE trajectory, fold 2
Time required : 13.719436645507812
Reducing data, MMSE trajectory, fold 3
Time required : 13.531603574752808
Reducing data, MMSE trajectory, fold 4
Time required : 13.197346448898315
Reducing data, MMSE trajectory, fold 5
Time required : 12.353111982345581
Reducing data, MMSE trajectory, fold 6
Time required : 12.6961350440979
Reducing data, MMSE trajectory, fold 7
Time required : 12.847567081451416
Reducing data, MMSE trajectory, fold 8
Time required : 12.497691869735718
Reducing data, MMSE trajectory, fold 9
Time required : 13.247795343399048
Reducing data, ADAS13 trajectory, fold 0
Time required : 13.132242441177368
Reducing data, ADAS13 trajectory, fold 1
Time required : 12.262178421020508
Reducing data, ADAS13 trajectory, fold 2
Time required : 13.012330293655396
Reducing data, ADAS13 trajectory, 

In [None]:
# Reduce the data with RFE
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

n_features = 78 # same number of components as AAL
splits = pd.read_pickle("train_test_splits.pkl")

for t in ['MMSE', 'ADAS13']:
    for i in range(ncv):
        print("Reducing data, {} trajectory, fold {}".format(t, i))
        train_split = splits[t]['train'][i]
        test_split = splits[t]['test'][i]
        X_train_baseline = X_baseline[train_split]
        X_test_baseline = X_baseline[test_split]
        X_train_followup = X_followup[train_split]
        X_test_followup = X_followup[test_split]
        X_train_diff = X_diff[train_split]

        df_train = sub_list.iloc[train_split]
        if t == 'MMSE':
            y_train = df_train["MMSE_2c_traj"].values
        elif t == 'ADAS13':
            y_train = df_train["ADAS_3c_traj"].values

        tstart = time.time()
        estimator = SVC(kernel='linear')
        rfe = RFE(estimator, n_features, step=0.5)
        rfe.fit(X_train_diff, y_train)
        X_bl_train = rfe.transform(X_train_baseline)
        X_bl_test = rfe.transform(X_test_baseline)
        X_vartp_train = rfe.transform(X_train_followup)
        X_vartp_test = rfe.transform(X_test_followup)
        print("Time required : {}".format(time.time() - tstart))
        np.save("data/RFE_bl_train_{}_cv{}.npy".format(t,i), X_bl_train)
        np.save("data/RFE_bl_test_{}_cv{}.npy".format(t,i), X_bl_test)
        np.save("data/RFE_vartp_train_{}_cv{}.npy".format(t,i), X_vartp_train)
        np.save("data/RFE_vartp_test_{}_cv{}.npy".format(t,i), X_vartp_test)

Reducing data, MMSE trajectory, fold 0
Time required : 158.32730746269226
Reducing data, MMSE trajectory, fold 1
Time required : 158.46583104133606
Reducing data, MMSE trajectory, fold 2
Time required : 151.17745995521545
Reducing data, MMSE trajectory, fold 3
Time required : 155.35781359672546
Reducing data, MMSE trajectory, fold 4
Time required : 167.2562394142151
Reducing data, MMSE trajectory, fold 5
Time required : 149.3921365737915
Reducing data, MMSE trajectory, fold 6
Time required : 153.25627446174622
Reducing data, MMSE trajectory, fold 7
Time required : 158.71924304962158
Reducing data, MMSE trajectory, fold 8
Time required : 171.8065972328186
Reducing data, MMSE trajectory, fold 9
Time required : 149.35567235946655
Reducing data, ADAS13 trajectory, fold 0
Time required : 214.89996123313904
Reducing data, ADAS13 trajectory, fold 1
Time required : 211.3731334209442
Reducing data, ADAS13 trajectory, fold 2
Time required : 222.22451496124268
Reducing data, ADAS13 trajectory, fo

In [8]:
# Reduce the data with RLR
# Idea by Moradi et al., 2015
from sklearn.linear_model import SGDClassifier

n_features = 78 # same number of components as AAL
splits = pd.read_pickle("train_test_splits.pkl")

n_repeats = 5 # 10 by Moradi et al.
range_size = 10 # 100 by Moradi et al.
alpha_range = np.power(10, np.linspace(-5, -2, range_size))
print("Range of alphas: {}".format(alpha_range))

for t in ['MMSE', 'ADAS13']:
    for i in range(ncv):
        train_split = splits[t]["train"][i]
        test_split = splits[t]["test"][i]
        X_train_baseline = X_baseline[train_split]
        X_test_baseline = X_baseline[test_split]
        X_train_followup = X_followup[train_split]
        X_test_followup = X_followup[test_split]
        X_train_diff = X_diff[train_split]

        df_train = sub_list.iloc[train_split]
        if t == 'MMSE':
            y_train = df_train["MMSE_2c_traj"]
        elif t == 'ADAS13':
            y_train = df_train["ADAS_3c_traj"]

        print("Begin {}, CV fold number {}".format(t,i))
        print("Step one: choose the optimal alpha")
        tstart = time.time()
        alpha_star = []

        for j in range(n_repeats):
            print("Beginning repeat {}...".format(j))
            scores = np.zeros(range_size)

            indices = list(range(len(train_split)))
            random.shuffle(indices)
            inner_train = indices[:int(0.9*len(indices))]
            inner_valid = indices[int(0.9*len(indices)):]
            X_diff_inner_train = X_diff[inner_train]
            X_diff_inner_valid = X_diff[inner_valid]
            df_inner_train = df_train.iloc[inner_train]
            df_inner_valid = df_train.iloc[inner_valid]
            if t == 'MMSE':
                y_inner_train = df_inner_train["MMSE_2c_traj"]
                y_inner_valid = df_inner_valid["MMSE_2c_traj"]
            elif t == 'ADAS13':
                y_inner_train = df_inner_train["ADAS_3c_traj"]
                y_inner_valid = df_inner_valid["ADAS_3c_traj"]

            for k in range(range_size):
                clf = SGDClassifier(loss="log", penalty="elasticnet", l1_ratio=0.5, alpha=alpha_range[k])
                clf.fit(X_diff_inner_train, y_inner_train)
                scores[k] = clf.score(X_diff_inner_valid, y_inner_valid)

            alpha_star.append(alpha_range[np.argmax(scores)])

        print("Optimal alpha: {}".format(np.median(alpha_star)))
        print("Step two: find the most significant features")
        k = np.where(alpha_range <= np.median(alpha_star))[0][-1]
        nonzero_features = np.ones(X_baseline.shape[1])

        for alpha in alpha_range[k:]:
            clf = SGDClassifier(loss="log", penalty="elasticnet",
                                l1_ratio=0.5, alpha=alpha)
            clf.fit(X_train_diff, y_train)
            nonzero_features = np.logical_and(nonzero_features, (clf.coef_ > 0))

        nonzero_indices = np.where(nonzero_features)[1]
        X_train_diff_remain = X_train_diff[:, nonzero_indices]
        print("Number of nonzero features: {}".format(nonzero_indices.shape[0]))
        coef_sums = np.zeros(X_train_diff_remain.shape[1])

        for j in range(n_repeats):
            clf = SGDClassifier(loss="log", penalty="l2", alpha=alpha_range[k])
            clf.fit(X_train_diff_remain, y_train)
            coef_sums = coef_sums + clf.coef_

        most_significant = np.argsort(coef_sums)[0, :78]
        features_selected = nonzero_indices[most_significant]

        X_bl_train = X_train_baseline[:, features_selected]
        X_bl_test = X_test_baseline[:, features_selected]
        X_vartp_train = X_train_followup[:, features_selected]
        X_vartp_test = X_test_followup[:, features_selected]
        print("Time required : {}".format(time.time() - tstart))
        np.save("data/RLR_bl_train_{}_cv{}.npy".format(t,i), X_bl_train)
        np.save("data/RLR_bl_test_{}_cv{}.npy".format(t,i), X_bl_test)
        np.save("data/RLR_vartp_train_{}_cv{}.npy".format(t,i), X_vartp_train)
        np.save("data/RLR_vartp_test_{}_cv{}.npy".format(t,i), X_vartp_test)

Range of alphas: [1.00000000e-05 2.15443469e-05 4.64158883e-05 1.00000000e-04
 2.15443469e-04 4.64158883e-04 1.00000000e-03 2.15443469e-03
 4.64158883e-03 1.00000000e-02]
Begin MMSE, CV fold number 0
Step one: choose the optimal alpha
Beginning repeat 0...




Beginning repeat 1...
Beginning repeat 2...
Beginning repeat 3...
Beginning repeat 4...
Optimal alpha: 0.001
Step two: find the most significant features
Number of nonzero features: 698
Time required : 331.4957060813904
Begin MMSE, CV fold number 1
Step one: choose the optimal alpha
Beginning repeat 0...
Beginning repeat 1...
Beginning repeat 2...
Beginning repeat 3...
Beginning repeat 4...
Optimal alpha: 0.0001
Step two: find the most significant features
Number of nonzero features: 808
Time required : 338.7958824634552
Begin MMSE, CV fold number 2
Step one: choose the optimal alpha
Beginning repeat 0...
Beginning repeat 1...
Beginning repeat 2...
Beginning repeat 3...
Beginning repeat 4...
Optimal alpha: 0.001
Step two: find the most significant features
Number of nonzero features: 751
Time required : 254.63292813301086
Begin MMSE, CV fold number 3
Step one: choose the optimal alpha
Beginning repeat 0...
Beginning repeat 1...
Beginning repeat 2...
Beginning repeat 3...
Beginning repe

In [10]:
# Reduce the data with HCA
from sklearn.cluster import AgglomerativeClustering

n_features = 78 # same number of components as AAL
n_partitions = 20 # 2-step HCA process
partition_size = 40962/n_partitions
n_clusters = int(partition_size/n_partitions)
splits = pd.read_pickle("train_test_splits.pkl")

for t in ['MMSE', 'ADAS13']:
    for i in range(ncv):
        train_split = splits[t]["train"][i]
        test_split = splits[t]["test"][i]
        X_train_baseline = X_baseline[train_split]
        X_test_baseline = X_baseline[test_split]
        X_train_followup = X_followup[train_split]
        X_test_followup = X_followup[test_split]
        X_train_diff = X_diff[train_split]

        tstart = time.time()
        print("Beginning HCA, {} trajectory, CV fold number {}".format(t,i))
        print("Clustering left vertices...")

        left_vertices = X_train_diff.T[:40962, :]
        left_vertices_merged = []
        left_cluster_list = []

        for p in range(n_partitions):
            vp = left_vertices[int(p*partition_size):int((p+1)*partition_size)]
            hca = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
            clustering = hca.fit(vp)

            for cluster in np.unique(clustering.labels_):
                ind = np.where(clustering.labels_ == cluster)[0]
                left_cluster_list.append(ind + int(p*partition_size))
                left_vertices_merged.append(np.mean(vp[ind, :], axis=0))

        hca = AgglomerativeClustering(n_clusters=int(np.floor(n_features/2)), linkage="ward")
        left_clustering = hca.fit(np.array(left_vertices_merged))

        print("Clustering right vertices...")

        right_vertices = X_train_diff.T[-40962:, :]
        right_vertices_merged = []
        right_cluster_list = []

        for p in range(n_partitions):
            vp = right_vertices[int(p*partition_size):int((p+1)*partition_size)]
            hca = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
            clustering = hca.fit(vp)

            for cluster in np.unique(clustering.labels_):
                ind = np.where(clustering.labels_ == cluster)[0]
                right_cluster_list.append(ind + int(p*partition_size))
                right_vertices_merged.append(np.mean(vp[ind, :], axis=0))

        hca = AgglomerativeClustering(n_clusters=int(np.ceil(n_features/2)), linkage="ward")
        right_clustering = hca.fit(np.array(right_vertices_merged))

        print("Reducing the data...")

        X_bl_train_reduced = np.zeros((X_train_baseline.shape[0], n_features))
        X_bl_test_reduced = np.zeros((X_test_baseline.shape[0], n_features))
        X_vartp_train_reduced = np.zeros((X_train_followup.shape[0], n_features))
        X_vartp_test_reduced = np.zeros((X_test_followup.shape[0], n_features))

        for cluster in np.unique(left_clustering.labels_):
            subclusters = np.where(left_clustering.labels_ == cluster)[0]
            ind = np.concatenate([left_cluster_list[sc] for sc in subclusters])
            X_bl_train_reduced[:, cluster] = np.mean(X_train_baseline[:, ind], axis=1)
            X_bl_test_reduced[:, cluster] = np.mean(X_test_baseline[:, ind], axis=1)
            X_vartp_train_reduced[:, cluster] = np.mean(X_train_followup[:, ind], axis=1)
            X_vartp_test_reduced[:, cluster] = np.mean(X_test_followup[:, ind], axis=1)

        shift = int(np.floor(n_features/2))
        for cluster in np.unique(right_clustering.labels_):
            subclusters = np.where(right_clustering.labels_ == cluster)[0]
            ind = np.concatenate([right_cluster_list[sc] for sc in subclusters])
            X_bl_train_reduced[:, cluster + shift] = np.mean(X_train_baseline[:, ind], axis=1)
            X_bl_test_reduced[:, cluster + shift] = np.mean(X_test_baseline[:, ind], axis=1)
            X_vartp_train_reduced[:, cluster + shift] = np.mean(X_train_followup[:, ind], axis=1)
            X_vartp_test_reduced[:, cluster + shift] = np.mean(X_test_followup[:, ind], axis=1)

        print("Time required : {}".format(time.time() - tstart))
        np.save("data/HCA_bl_train_{}_cv{}.npy".format(t,i), X_bl_train_reduced)
        np.save("data/HCA_bl_test_{}_cv{}.npy".format(t,i), X_bl_test_reduced)
        np.save("data/HCA_vartp_train_{}_cv{}.npy".format(t,i), X_vartp_train_reduced)
        np.save("data/HCA_vartp_test_{}_cv{}.npy".format(t,i), X_vartp_test_reduced)

Beginning HCA, MMSE trajectory, CV fold number 0
Clustering left vertices...
Clustering right vertices...
Reducing the data...
Time required : 77.7331600189209
Beginning HCA, MMSE trajectory, CV fold number 1
Clustering left vertices...
Clustering right vertices...
Reducing the data...
Time required : 88.6251437664032
Beginning HCA, MMSE trajectory, CV fold number 2
Clustering left vertices...
Clustering right vertices...
Reducing the data...
Time required : 96.65112638473511
Beginning HCA, MMSE trajectory, CV fold number 3
Clustering left vertices...
Clustering right vertices...
Reducing the data...
Time required : 91.12027621269226
Beginning HCA, MMSE trajectory, CV fold number 4
Clustering left vertices...
Clustering right vertices...
Reducing the data...
Time required : 91.31439709663391
Beginning HCA, MMSE trajectory, CV fold number 5
Clustering left vertices...
Clustering right vertices...
Reducing the data...
Time required : 93.2380793094635
Beginning HCA, MMSE trajectory, CV fo