Generating summary statistics for ADNI and NACC, including Table 1 in the manuscript.

In [None]:
import pandas as pd
import numpy as np
import os
from scipy.stats import chi2_contingency, mannwhitneyu
import statsmodels.stats.api as sms

## ADNI

In [None]:
# Load the ADNI data table and the label table (presumably the cluster
# assignments -- confirm against the input files). Both are indexed by their
# first CSV column; the file paths are left blank in this copy.
adni_features = pd.read_csv("", index_col=0)
a_labels = pd.read_csv("", index_col=0)

# Left-join on the index: one row per labelled subject, labels first.
adni = a_labels.join(adni_features)

In [None]:
# Columns summarised as mean ± SD and compared with a Mann-Whitney U test.
num_feats = [
    "MMSE",
    "FAQ",
    "CDRSB",
    "AGE",
    "PTEDUCAT",
]
# Categorical columns of the cohort tables (handled one by one further down).
cat_feats = [
    "APOE4",
    "PTGENDER_Male",
]

In [None]:
# ADNI summary table: per-cluster sample size, mean ± SD of the continuous
# features, and the percentage of APOE4 carriers / females together with the
# Fast - Slow difference in percentage points and its 95% CI.
sumstats = pd.DataFrame(index=["Slow", "Fast"], columns=["N"] + num_feats)
fast = adni[adni["Cluster"] == 1].copy()  # cluster 1 is the "Fast" group
slow = adni[adni["Cluster"] == 2].copy()  # cluster 2 is the "Slow" group

# fill table
for label, group in (("Fast", fast), ("Slow", slow)):
    sumstats.loc[label, "N"] = group.shape[0]
    sumstats.loc[label, num_feats] = (
        group[num_feats].mean().round(2).map(str)
        + " ± "
        + group[num_feats].std().round(2).map(str)
    )

# cats
# One (slow, fast) pair of boolean membership masks per output column.
# "Female" is derived from PTGENDER_Male == 0 so that the reported difference
# and CI describe the same quantity as the Slow/Fast percentages in that column.
# NOTE: a missing APOE4 value compares as != 0 and is counted as a carrier,
# exactly as before.
cat_masks = {
    "APOE4": (slow["APOE4"] != 0, fast["APOE4"] != 0),
    "Female": (slow["PTGENDER_Male"] == 0, fast["PTGENDER_Male"] == 0),
}
for col, (slow_mask, fast_mask) in cat_masks.items():
    n_slow = int(slow_mask.sum())
    n_fast = int(fast_mask.sum())
    propS = n_slow / len(slow)
    propF = n_fast / len(fast)

    sumstats.loc["Slow", col] = np.round(propS * 100, 2)
    sumstats.loc["Fast", col] = np.round(propF * 100, 2)

    # confint_proportions_2indep returns the interval for p1 - p2, so the Fast
    # group must be passed first to match the propF - propS point estimate.
    ci = sms.confint_proportions_2indep(n_fast, len(fast), n_slow, len(slow))
    sumstats.loc["Difference [95% CI]", col] = (
        f"{np.round((propF - propS) * 100, 2)} "
        + str(np.round(np.multiply(ci, 100), 2))
    )

In [None]:
# p-values for the ADNI table, rounded to two decimals.
# Continuous features: Mann-Whitney U test between the slow and fast groups,
# with NaNs omitted.
for feat in num_feats:
    u_test = mannwhitneyu(slow[feat], fast[feat], nan_policy='omit')
    sumstats.loc["p-value", feat] = np.round(u_test.pvalue, 2)

# Categorical features: chi-square test on the cluster-by-category counts
# (element [1] of the result is the p-value).
sex_counts = pd.crosstab(adni["Cluster"], adni["PTGENDER_Male"])
apoe_counts = pd.crosstab(adni["Cluster"], adni["APOE4"] != 0)
sumstats.loc["p-value", "Female"] = np.round(chi2_contingency(sex_counts)[1], 2)
sumstats.loc["p-value", "APOE4"] = np.round(chi2_contingency(apoe_counts)[1], 2)

## NACC

In [None]:
# Load the NACC label table and name its "0" column "Cluster", then attach the
# NACC data table by index. File paths are left blank in this copy.
n_labels = pd.read_csv("", index_col=0)
n_labels = n_labels.rename(columns={"0": "Cluster"})
nacc_features = pd.read_csv("", index_col=0)
nacc = n_labels.join(nacc_features)

In [None]:
# NACC summary table: per-cluster sample size, mean ± SD of the continuous
# features, and the percentage of APOE4 carriers / females together with the
# Fast - Slow difference in percentage points and its 95% CI.
nacc_stats = pd.DataFrame(index=["Slow", "Fast"], columns=["N"] + num_feats)
fast = nacc[nacc["Cluster"] == 1].copy()  # cluster 1 is the "Fast" group
slow = nacc[nacc["Cluster"] == 2].copy()  # cluster 2 is the "Slow" group

# fill table
for label, group in (("Fast", fast), ("Slow", slow)):
    nacc_stats.loc[label, "N"] = group.shape[0]
    nacc_stats.loc[label, num_feats] = (
        group[num_feats].mean().round(2).map(str)
        + " ± "
        + group[num_feats].std().round(2).map(str)
    )

# cats
# One (slow, fast) pair of boolean membership masks per output column.
# "Female" is derived from PTGENDER_Male == 0 so that the reported difference
# and CI describe the same quantity as the Slow/Fast percentages in that column.
# NOTE: a missing APOE4 value compares as != 0 and is counted as a carrier,
# exactly as before.
cat_masks = {
    "APOE4": (slow["APOE4"] != 0, fast["APOE4"] != 0),
    "Female": (slow["PTGENDER_Male"] == 0, fast["PTGENDER_Male"] == 0),
}
for col, (slow_mask, fast_mask) in cat_masks.items():
    n_slow = int(slow_mask.sum())
    n_fast = int(fast_mask.sum())
    propS = n_slow / len(slow)
    propF = n_fast / len(fast)

    nacc_stats.loc["Slow", col] = np.round(propS * 100, 2)
    nacc_stats.loc["Fast", col] = np.round(propF * 100, 2)

    # confint_proportions_2indep returns the interval for p1 - p2, so the Fast
    # group must be passed first to match the propF - propS point estimate.
    ci = sms.confint_proportions_2indep(n_fast, len(fast), n_slow, len(slow))
    nacc_stats.loc["Difference [95% CI]", col] = (
        f"{np.round((propF - propS) * 100, 2)} "
        + str(np.round(np.multiply(ci, 100), 2))
    )

In [None]:
# p-values for the NACC table, rounded to two decimals.
# Continuous features: Mann-Whitney U test between the slow and fast groups,
# with NaNs omitted.
for feat in num_feats:
    u_test = mannwhitneyu(slow[feat], fast[feat], nan_policy='omit')
    nacc_stats.loc["p-value", feat] = np.round(u_test.pvalue, 2)

# Categorical features: chi-square test on the cluster-by-category counts
# (element [1] of the result is the p-value).
sex_counts = pd.crosstab(nacc["Cluster"], nacc["PTGENDER_Male"])
apoe_counts = pd.crosstab(nacc["Cluster"], nacc["APOE4"] != 0)
nacc_stats.loc["p-value", "Female"] = np.round(chi2_contingency(sex_counts)[1], 2)
nacc_stats.loc["p-value", "APOE4"] = np.round(chi2_contingency(apoe_counts)[1], 2)

### Difference in means CIs

In [None]:
# Difference in means (Fast - Slow) with a t-based confidence interval for every
# continuous feature, written into the "Difference [95% CI]" row of each table.
for cohort, table in zip([adni, nacc], [sumstats, nacc_stats]):
    for feat in num_feats:
        slow_stats = sms.DescrStatsW(cohort.loc[cohort["Cluster"] == 2, feat].dropna())
        fast_stats = sms.DescrStatsW(cohort.loc[cohort["Cluster"] == 1, feat].dropna())

        # CompareMeans(d1, d2).tconfint_diff() is the interval for d1 - d2, so
        # the point estimate must use the same Fast - Slow direction (this is
        # also the propF - propS convention of the categorical columns).
        diff = fast_stats.mean - slow_stats.mean
        lower, upper = sms.CompareMeans(fast_stats, slow_stats).tconfint_diff()

        # Format the bounds explicitly: relying on NumPy's array printing only
        # produced a comma when the bounds happened to be padded with two spaces.
        table.loc["Difference [95% CI]", feat] = (
            f"{np.round(diff, 2)} [{np.round(lower, 2)}, {np.round(upper, 2)}]"
        )

In [None]:
# Stack the ADNI and NACC tables, save them to CSV, and show the result as the
# cell output.
combined_stats = pd.concat([sumstats, nacc_stats])
combined_stats.to_csv("summary_stats.csv")
combined_stats

### Longitudinal Follow-up

In [None]:
# ADNI longitudinal score tables, one CSV per score, indexed by the first column.
adni_file = ""  # directory containing the CSVs (left blank in this copy)
# Visit labels (presumably months -- confirm); not referenced in the code shown here.
adni_time = ["0", "12", "24", "36"]
adni_files = ['cdrsb_norm.csv', 'mmse_norm.csv', 'faq_norm.csv']
adni_paths = [os.path.join(adni_file, fname) for fname in adni_files]
adni_data = [pd.read_csv(path, index_col=0) for path in adni_paths]

In [None]:
# NACC longitudinal score tables, one CSV per score, indexed by the first column.
nacc_file = ""  # directory containing the CSVs (left blank in this copy)
# Visit labels (presumably years -- confirm); not referenced in the code shown here.
nacc_time = ["0", "1", "2", "3"]
nacc_files = ['cdr_ADNI_norm.csv', 'mmse_ADNI_norm.csv', 'faq_ADNI_norm.csv']
nacc_paths = [os.path.join(nacc_file, fname) for fname in nacc_files]
nacc_data = [pd.read_csv(path, index_col=0) for path in nacc_paths]

In [None]:
# Follow-up per ADNI subject: the largest visit label (column name read as an
# int) that has a non-missing value in any of the score tables.
# Subjects are seeded from the index of the first table with a follow-up of 0.
f_up = dict.fromkeys(adni_data[0].index, 0)
for table in adni_data:
    for pat, visits in table.iterrows():
        last_visit = visits.dropna().index.map(int).max()
        if last_visit > f_up[pat]:
            f_up[pat] = last_visit

In [None]:
# Median over subjects of the last observed ADNI visit.
adni_follow_up = list(f_up.values())
print("ADNI median follow-up:", np.median(adni_follow_up))

In [None]:
# Follow-up per NACC subject: the largest visit label (column name read as an
# int) that has a non-missing value in any of the score tables.
# Subjects are seeded from the index of the first table with a follow-up of 0.
f_up = dict.fromkeys(nacc_data[0].index, 0)
for table in nacc_data:
    for pat, visits in table.iterrows():
        last_visit = visits.dropna().index.map(int).max()
        if last_visit > f_up[pat]:
            f_up[pat] = last_visit

In [None]:
# Median over subjects of the last observed NACC visit.
nacc_follow_up = list(f_up.values())
print("NACC median follow-up:", np.median(nacc_follow_up))