In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from statsmodels.stats.multitest import multipletests

%load_ext rpy2.ipython

## Load R Packages

In [2]:
%%R 

suppressPackageStartupMessages({
    library(broom)
    library(rlist)
    library(rstatix)
    library(tidyverse)
})

## Load data needed for this analysis

In [3]:
# Load information about the time-series features
univariate_TS_feature_info = pd.read_csv("data/feature_info/univariate_feature_info.csv")
pairwise_TS_feature_info = pd.read_csv("data/feature_info/pairwise_feature_info.csv").sort_values(by=['Directed'])

# Define SVM kernel type
SVM_kernel = "Linear"

# Name the feature sets
univariate_feature_set = "catch25"
pairwise_feature_set = "pyspi14"

# Load participants included
UCLA_CNP_subjects_to_keep = pd.read_feather("data/time_series_features/UCLA_CNP_filtered_sample_info_catch25_pyspi14.feather")
ABIDE_subjects_to_keep = pd.read_feather("data/time_series_features/ABIDE_filtered_sample_info_catch25_pyspi14.feather")

# Load metadata
UCLA_CNP_metadata = (pd.read_feather("data/input_data/UCLA_CNP_sample_metadata.feather")
                        .assign(Study = "UCLA_CNP")
                        .query("Sample_ID in @UCLA_CNP_subjects_to_keep.Sample_ID"))
ABIDE_metadata = (pd.read_feather("data/input_data/ABIDE_sample_metadata.feather")
                        .assign(Study = "ABIDE")
                        .query("Sample_ID in @ABIDE_subjects_to_keep.Sample_ID"))

# Load head movement 
UCLA_CNP_head_mvmt = (pd.read_table('data/movement_data/UCLA_CNP_Mean_FD_Power.txt', sep=',')
                      .assign(Study = "UCLA_CNP")
                        .query("Sample_ID in @UCLA_CNP_subjects_to_keep.Sample_ID"))
ABIDE_head_mvmt = (pd.read_table('data/movement_data/ABIDE_Mean_FD_Power.txt', sep=',', dtype={'Sample_ID': str,
                                                                                              'Mean_FD_Power': float})
                   .assign(Study = "ABIDE")
                        .query("Sample_ID in @ABIDE_subjects_to_keep.Sample_ID"))

# Merge metadata + head movement
merged_metadata = pd.concat([UCLA_CNP_metadata, ABIDE_metadata], axis=0).merge(pd.concat([UCLA_CNP_head_mvmt, ABIDE_head_mvmt], axis=0))

# Load all results
univariate_balanced_accuracy_results = pd.read_feather('data/classification_results/univariate_balanced_accuracy_results.feather')
univariate_balanced_accuracy_results_all_folds = pd.read_feather('data/classification_results/univariate_balanced_accuracy_results_all_folds.feather')
univariate_p_values = pd.read_feather('data/classification_results/univariate_p_values.feather')

pairwise_balanced_accuracy_results = pd.read_feather('data/classification_results/pairwise_balanced_accuracy_results.feather')
pairwise_balanced_accuracy_results_all_folds = pd.read_feather('data/classification_results/pairwise_balanced_accuracy_results_all_folds.feather')
pairwise_p_values = pd.read_feather('data/classification_results/pairwise_p_values.feather')

combined_univariate_pairwise_balanced_accuracy_results = pd.read_feather('data/classification_results/combined_univariate_pairwise_balanced_accuracy_results.feather')
combined_univariate_pairwise_balanced_accuracy_results_all_folds = pd.read_feather('data/classification_results/combined_univariate_pairwise_balanced_accuracy_results_all_folds.feather')
combined_univariate_pairwise_p_values = pd.read_feather('data/classification_results/combined_univariate_pairwise_p_values.feather')

# Combine balanced accuracy results 
all_balanced_accuracy_results = pd.concat([univariate_balanced_accuracy_results, pairwise_balanced_accuracy_results, combined_univariate_pairwise_balanced_accuracy_results], axis=0)

# Load null distributions
univariate_null_distribution = pd.read_feather('data/classification_results/univariate_null_balanced_accuracy_distribution.feather')
pairwise_null_distribution = pd.read_feather('data/classification_results/pairwise_null_balanced_accuracy_distribution.feather')
combined_univariate_pairwise_null_distribution = pd.read_feather('data/classification_results/combined_univariate_pairwise_null_balanced_accuracy_distribution.feather')

# Load confound balanced accuracy 
confounds_balanced_accuracy_results = pd.read_feather('data/classification_results/confound_analysis/confounds_predicting_dx_balanced_accuracy_results.feather')
confounds_balanced_accuracy_results_all_folds = pd.read_feather('data/classification_results/confound_analysis/confounds_predicting_dx_balanced_accuracy_results_all_folds.feather')
time_series_features_predicting_confounds_score_results = pd.read_feather('data/classification_results/confound_analysis/time_series_features_predicting_confounds_score_results.feather')

# Load training balanced accuracy results 
training_balanced_accuracy_results = pd.read_feather("data/classification_results/robustness_analysis/all_training_balacc_results.feather")

# Load first 25 PCs for univariate combo data
univariate_combo_first_25_PCs = pd.read_feather("data/time_series_features/univariate_combo_first25_PCs.feather")
univariate_combo_first_25_PCs_classification_res = pd.read_feather("data/classification_results/robustness_analysis/All_univariate_combo_first_25_PCs_classification_res.feather")
univariate_combo_L1_regularized_classification_res = pd.read_feather("data/classification_results/robustness_analysis/All_univariate_combo_L1_regularized_linear_SVM_classification_res.feather")

# ABIDE ASD site-specific results for sites 20 and 5
ABIDE_site_specific_results = pd.read_feather("data/classification_results/robustness_analysis/ABIDE_site_specific_balanced_accuracy.feather")

# Age + sex classification results 
age_sex_feature_model_results = pd.read_feather("data/classification_results/confound_analysis/feature_based_models_with_age_and_sex_main_classification_res.feather")

# Read in results for (1) classifier choice and (2) nested cross-validation comparison 
classifier_type_classification_res = pd.read_feather("data/classification_results/robustness_analysis/all_classifier_types_results.feather")
nested_CV_classification_res = pd.read_feather("data/classification_results/robustness_analysis/all_nested_CV_results.feather")

# Load in the region-wise volumes data
UCLA_CNP_region_wise_volumes = pd.read_feather("data/input_data/UCLA_CNP_region_wise_volumes.feather")

# Study/disorder lookup table
study_disorder_lookup = {'SCZ': 'UCLA_CNP', 
                          'BP': 'UCLA_CNP', 
                          'ADHD': 'UCLA_CNP', 
                          'ASD': 'ABIDE'}

ABIDE_brain_region_info = pd.read_excel("data/input_data/ABIDE_Brain_Region_info.xlsx")

In [4]:
%%R -o study_group_df,UCLA_CNP_brain_region_info,aparc_aseg_LUT

# Load brain region info
UCLA_CNP_brain_region_info <- read.csv("data/input_data/UCLA_CNP_Brain_Region_info.csv")  %>%
  rowwise() %>%
    mutate(Hemisphere = ifelse(str_detect(Brain_Region, "ctx-lh|Left-"), "L", "R"),
            Region_Only = gsub("ctx-lh-|ctx-rh-|Left-|Right-", "", Brain_Region)) %>%
    mutate(Figure_Name = paste0(Hemisphere, " ", Region_Only), .keep="unused")

aparc_aseg_LUT <- read.table("data/input_data/FreeSurferLUT.txt",
                             header=T) %>%
  dplyr::rename("ROI_Index" = "Value")

# Compile a study lookup table
study_group_df <- data.frame(Study = c(rep("UCLA_CNP", 3), "ABIDE"),
                              Disorder = c("SCZ", "BP", "ADHD", "ASD"))

## Table S1: Brain region classification results

In [8]:
%%R -i univariate_p_values
# Save the resulting significant region performance to a CSV file
univariate_p_values %>%
  filter(Analysis_Type=="Brain_Region") %>%
  dplyr::select(Disorder, group_var, p_value, p_value_BenjaminiHochberg, Balanced_Accuracy, Balanced_Accuracy_SD) %>%
  dplyr::rename("Brain Region" = "group_var") %>%
  mutate(Balanced_Accuracy = round(100*Balanced_Accuracy,1),
         Balanced_Accuracy_SD = round(100*Balanced_Accuracy_SD,1),
         p_value = format(p_value, digits=1),
         p_value_BenjaminiHochberg = format(p_value_BenjaminiHochberg, digits=1)) %>%
  dplyr::rename("Balanced Accuracy" = "Balanced_Accuracy",
                "SD" = "Balanced_Accuracy_SD",
                "p-value" = "p_value",
                "Benjamini--Hochberg p-value" = "p_value_BenjaminiHochberg") %>%
  mutate(Disorder = factor(Disorder, levels = c("SCZ", "BP", "ADHD", "ASD"))) %>%
  ungroup() %>%
  arrange(Disorder, desc(`Balanced Accuracy`)) %>%
  dplyr::select(Disorder, `Brain Region`, `Balanced Accuracy`, `SD`, `p-value`, `Benjamini--Hochberg p-value`) %>%
  write.csv(., "tables/Table_S1_univariate_brain_region_wise_results.csv", row.names = F)

## Table S2: ABIDE site-specific results

In [13]:
%%R -i ABIDE_site_specific_results

ABIDE_site_specific_results %>% 
    dplyr::select(Site_Number, Analysis_Type, group_var, Balanced_Accuracy, Balanced_Accuracy_SD) %>%
    mutate(Balanced_Accuracy = round(100*Balanced_Accuracy,1),
           Balanced_Accuracy_SD = round(100*Balanced_Accuracy_SD,1)) %>%
    rename("Analysis" = "Analysis_Type", 
           "Model" = "group_var",
           "Balanced Accuracy" = "Balanced_Accuracy",
           "SD" = "Balanced_Accuracy_SD") %>% 
    write.csv(., "tables/Table_S2_ABIDE_site_specific_results.csv", row.names = F)

## Table S3: Gray matter volume analyses

In [46]:
%%R -i UCLA_CNP_region_wise_volumes,UCLA_CNP_metadata -o ROI_volume_beta_by_group

# Find mean volume by disorder 
mean_volume_by_diagnosis <- UCLA_CNP_region_wise_volumes %>% 
    left_join(., UCLA_CNP_metadata) %>% 
    group_by(Diagnosis, Brain_Region) %>%
    summarise(Mean_Volume = mean(Num_Voxels, na.rm=T)) %>% 
    group_by(Brain_Region) %>% 
    mutate(Control_Volume = mean(Mean_Volume[Diagnosis=="Control"], na.rm=T)) %>% 
    filter(Diagnosis != "Control") %>%
    dplyr::rename("Disorder_Volume" = "Mean_Volume",
                  "Disorder" = "Diagnosis")

# Fit OLS models to extract beta coefficient for regional volumes in each group relative to control
run_lm_beta_stats_for_group <- function(Disorder, region_wise_volumes){
  res <- region_wise_volumes %>%
    left_join(., UCLA_CNP_metadata, by = join_by(Study, Sample_ID)) %>%
    mutate(Diagnosis = case_when(Diagnosis == "Schizophrenia" ~ "SCZ",
                                 Diagnosis == "Bipolar" ~ "BP",
                                 T ~ Diagnosis)) %>%
    filter(Diagnosis %in% c(Disorder, "Control")) %>%
    dplyr::select(Brain_Region, Diagnosis, Num_Voxels) %>%
    mutate(Diagnosis = factor(Diagnosis, levels = c("Control", Disorder))) %>%
    group_by(Brain_Region) %>%
    nest() %>%
    mutate(
      fit = map(data, ~ lm(Num_Voxels ~ Diagnosis, data = .x)),
      tidied = map(fit, tidy)
    ) %>% 
    unnest(tidied) %>%
    dplyr::select(-data, -fit) %>%
    ungroup() %>%
    filter(term != "(Intercept)") %>%
    mutate(Disorder = Disorder)
  
  return(res)
}

ROI_volume_beta_by_group <- 1:3 %>%
  purrr::map_df(~ run_lm_beta_stats_for_group(region_wise_volumes = UCLA_CNP_region_wise_volumes,
                                              Disorder = study_group_df$Disorder[.x])) %>%
  group_by(Disorder) %>%
  mutate(p_value_BenjaminiHochberg = p.adjust(p.value, method="BH"))  %>%
  ungroup() %>%
  left_join(., mean_volume_by_diagnosis) %>%
  mutate(estimate = round(estimate,1),
         p_value = format(p.value, digits=1),
         p_value_BenjaminiHochberg = format(p_value_BenjaminiHochberg, digits=1),
         Disorder_Volume = format(Disorder_Volume, digits=1),
         Control_Volume = format(Control_Volume, digits=1)) %>%
  dplyr::select(Disorder, Brain_Region, Control_Volume, Disorder_Volume, estimate, p_value, p_value_BenjaminiHochberg) %>%
  dplyr::rename("Brain Region" = "Brain_Region",
                "Control Volume" = "Control_Volume",
                "Disorder Volume" = "Disorder_Volume",
                "beta" = "estimate",
                "p-value" = "p_value",
                "Benjamini--Hochberg p-value" = "p_value_BenjaminiHochberg") %>%
  write.csv(., "tables/Table_S3_region_volume_differences.csv", row.names = F)

Joining with `by = join_by(Study, Sample_ID)`
`summarise()` has grouped output by 'Diagnosis'. You can override using the
`.groups` argument.
Joining with `by = join_by(Brain_Region, Disorder)`


## Table S4: Classification performance with age + sex alone per disorder

In [51]:
%%R -i confounds_balanced_accuracy_results

confounds_balanced_accuracy_results %>%
    filter(Analysis_Type == "Age") %>%   
    mutate(Balanced_Accuracy = round(100*Balanced_Accuracy,1),
           Balanced_Accuracy_SD = round(100*Balanced_Accuracy_SD,1)) %>%
    dplyr::select(Study, Disorder, Analysis_Type, Balanced_Accuracy, Balanced_Accuracy_SD) %>% 
    rename("Analysis" = "Analysis_Type", 
           "Balanced Accuracy" = "Balanced_Accuracy",
           "SD" = "Balanced_Accuracy_SD") %>% 
    write.csv(., "tables/Table_S4_Age_classification_results.csv", row.names = F)

## Table S5: Age+sex, BOLD feature, Age+sex+BOLD feature classification results

In [28]:
%%R -i univariate_TS_feature_info,all_balanced_accuracy_results,confounds_balanced_accuracy_results,age_sex_feature_model_results

all_balanced_accuracy_results_here <- all_balanced_accuracy_results %>% 
    dplyr::rename("Features_Only" = "Balanced_Accuracy") %>%
    dplyr::select(group_var, Analysis_Type, Disorder, Study, Features_Only)

age_sex_balanced_accuracy_results_here <- confounds_balanced_accuracy_results %>% 
    filter(Analysis_Type == "Age And Sex") %>% 
    dplyr::rename("Age_Sex_Only" = "Balanced_Accuracy") %>%
    dplyr::select(Disorder, Study, Age_Sex_Only)

age_sex_feature_model_results_here <- age_sex_feature_model_results  %>% 
    dplyr::rename("Age_Sex_and_Features" = "Balanced_Accuracy")  %>%
    dplyr::select(group_var, Disorder, Analysis_Type, Study, Age_Sex_and_Features)

all_models_combined_here <- left_join(all_balanced_accuracy_results_here,
                                      age_sex_feature_model_results_here) %>% 
                                left_join(., age_sex_balanced_accuracy_results_here)

# Write the CSV
all_models_combined_here %>% 
    dplyr::rename("feature_name" = "group_var") %>% 
    left_join(., univariate_TS_feature_info) %>% 
    dplyr::mutate(group_var = ifelse(is.na(Figure_name), feature_name, Figure_name)) %>% 
    pivot_longer(cols=c(Features_Only, Age_Sex_Only, Age_Sex_and_Features), 
                 names_to = "Model_Type", values_to="Balanced_Accuracy") %>% 
    mutate(Model_Type = factor(Model_Type, levels = c("Age_Sex_Only", "Features_Only", "Age_Sex_and_Features")),
           Disorder = factor(Disorder, levels = c("SCZ", "BP", "ADHD", "ASD")),
           Analysis_Type = factor(Analysis_Type, levels = c("Brain_Region", "catch25_feature", 
                                                            "Univariate_Combo", "SPI", "SPI_Combo"))) %>% 
    mutate(Balanced_Accuracy = round(100*Balanced_Accuracy,1)) %>%
    pivot_wider(id_cols = c(Study, Disorder, Analysis_Type, group_var),
                names_from = Model_Type, values_from = Balanced_Accuracy) %>% 
    dplyr::select(Study, Disorder, Analysis_Type, group_var, Age_Sex_Only, Features_Only, Age_Sex_and_Features) %>% 
    dplyr::rename("Age and Sex" = "Age_Sex_Only", 
                  "BOLD Features" = "Features_Only",
                  "Age, Sex, and BOLD Features" = "Age_Sex_and_Features",
                  "Model name" = "group_var",
                  "Analysis" = "Analysis_Type") %>% 
    write.csv("tables/Table_S5_Age_sex_BOLD_feature_classification_results.csv", row.names = F)

Joining with `by = join_by(group_var, Analysis_Type, Disorder, Study)`
Joining with `by = join_by(Disorder, Study)`
Joining with `by = join_by(feature_name)`




## Table S6: Univariate time-series feature classification results

In [16]:
%%R -i univariate_p_values,univariate_TS_feature_info
# Save the resulting significant region performance to a CSV file
univariate_p_values %>%
  filter(Analysis_Type=="catch25_feature") %>%
  dplyr::select(Disorder, group_var, p_value, p_value_BenjaminiHochberg, Balanced_Accuracy, Balanced_Accuracy_SD) %>%
  dplyr::rename("feature_name" = "group_var") %>%
  left_join(., univariate_TS_feature_info) %>%
  dplyr::rename("Time-series feature" = "Figure_name") %>%
  mutate(Balanced_Accuracy = round(100*Balanced_Accuracy,1),
         Balanced_Accuracy_SD = round(100*Balanced_Accuracy_SD,1),
         p_value = format(p_value, digits=1),
         p_value_BenjaminiHochberg = format(p_value_BenjaminiHochberg, digits=1)) %>%
  dplyr::rename("Balanced Accuracy" = "Balanced_Accuracy",
                "SD" = "Balanced_Accuracy_SD",
                "p-value" = "p_value",
                "Benjamini--Hochberg p-value" = "p_value_BenjaminiHochberg") %>%
  mutate(Disorder = factor(Disorder, levels = c("SCZ", "BP", "ADHD", "ASD"))) %>%
  ungroup() %>%
  arrange(Disorder, desc(`Balanced Accuracy`)) %>%
  dplyr::select(Disorder, `Time-series feature`, `Balanced Accuracy`, `SD`, `p-value`, `Benjamini--Hochberg p-value`) %>%
  write.csv(., "tables/Table_S6_univariate_time_series_feature_wise_results.csv", row.names = F)

Joining with `by = join_by(feature_name)`


## Table S7: catch25 feature similarity indices

In [23]:
data_for_corr_long = pd.read_csv("tables/catch25_top_feature_similarity_spearman_corr.csv")

# Save as table S7
data_for_corr_long.to_csv("tables/Table_S7_univariate_time_series_spearman_correlation.csv", index=False)

## Table S8: Comparing head movement between each disorder and controls

In [32]:
%%R -i merged_metadata,study_group_df -o age_res_df,mean_FD_res_df,sex_res_df

mean_FD_res_list <- list()

for (i in 1:nrow(study_group_df)) {
  disorder <- study_group_df$Disorder[i]
  study <- study_group_df$Study[i]
  disorder_ctrl_data <- merged_metadata %>%
    filter(Diagnosis %in% c("Control", disorder) & Study == study)

  disorder_mean_FD_res <- disorder_ctrl_data %>%
    wilcox_test(Mean_FD_Power ~ Diagnosis) %>% 
    mutate(Disorder = disorder, Study = study, Test = "Mean_FD_Power", 
           Asterisk = case_when(p < 0.001 ~ "***",
                                p < 0.01 ~ "**", 
                                p < 0.05 ~ "*", 
                                T ~ ""))
  mean_FD_res_list <- list.append(mean_FD_res_list, disorder_mean_FD_res)
}

# Combine the results
mean_FD_res_df <- do.call(rbind, mean_FD_res_list)
mean_FD_res_df %>% 
  dplyr::select(Disorder, Study, statistic, p) %>% 
  mutate(p = format(p, digits=1)) %>%
  dplyr::rename("Wilcoxon rank-sum statistic" = "statistic", 
                "P-value" = "p") %>% 
  write.csv(., "tables/Table_S8_Mean_FD_Power_comparisons.csv", row.names = F)



## Table S9: Brain-wide SD values and mean FD_Power per participant

In [43]:
merged_regional_SD = pd.read_feather("data/time_series_features/Merged_Regional_SD_values.feather")
                                 
whole_brain_avg_SD = (merged_regional_SD.groupby(['Sample_ID', 'Study', 'Diagnosis'], as_index=False)
                      .agg(Mean_BOLD_SD = ('values', 'mean'))
                      .merge(merged_metadata))

In [42]:
%%R -i whole_brain_avg_SD

whole_brain_avg_SD %>% 
    arrange(Study, Diagnosis) %>%
    dplyr::select(Study, Diagnosis, Sample_ID, Mean_BOLD_SD, Mean_FD_Power) %>% 
    mutate(Mean_BOLD_SD = round(Mean_BOLD_SD, 2),
           Mean_FD_Power = round(Mean_FD_Power, 2)) %>%
    dplyr::rename("Whole-brain BOLD SD" = "Mean_BOLD_SD", 
                  "Whole-brain framewise displacement (mm)" = "Mean_FD_Power") %>%
    write.csv("tables/Table_S9_Brain_wide_SD_vs_mean_FD.csv", row.names=F)

## Table S10: Region-wise correlation coefficients between BOLD SD and mean FD

In [45]:
region_SD_mean_FD_corrs_list = []

for disorder in ["SCZ", "ASD"]: 
    study = study_disorder_lookup[disorder] 

    # Individual brain regions
    disorder_region_SD_values = (merged_regional_SD
                            .query("Diagnosis in ['Control', @disorder] & Study == @study")
                            .merge(merged_metadata))
    
    region_wise_corrs = (disorder_region_SD_values
        .filter(["Sample_ID", "Brain_Region", "values", "Diagnosis", "Study", "Mean_FD_Power"])
        .groupby(["Brain_Region"])
        .apply(lambda x: pearsonr(x["values"], x["Mean_FD_Power"])))

    # Convert to DataFrame
    region_wise_corrs_df = pd.DataFrame(region_wise_corrs).reset_index()
    region_wise_corrs_df[['Pearson_R', 'p_value']] = pd.DataFrame(region_wise_corrs_df[0].tolist(), index=region_wise_corrs_df.index)
    region_wise_corrs_df.drop(0, axis=1, inplace=True)

    # Assign disorder, study, and corrected p-values 
    region_wise_corrs_df = (region_wise_corrs_df
                            .assign(Disorder=disorder, Study=study, 
                                    p_value_BenjaminiHochberg=multipletests(region_wise_corrs_df.p_value, method="fdr_bh")[1]))
    region_SD_mean_FD_corrs_list.append(region_wise_corrs_df)
    
region_SD_mean_FD_corrs = pd.concat(region_SD_mean_FD_corrs_list).reset_index()


In [48]:
%%R -i region_SD_mean_FD_corrs

region_SD_mean_FD_corrs %>%
    dplyr::select(Study, Disorder, Brain_Region, Pearson_R, p_value, p_value_BenjaminiHochberg) %>%
    mutate(Pearson_R = round(Pearson_R, 2),
           p_value = format(p_value, digits=1),
           p_value_BenjaminiHochberg = format(p_value_BenjaminiHochberg, digits=1)) %>%
    dplyr::rename("Brain Region" = "Brain_Region",
                  "Pearson R" = "Pearson_R",
                  "p-value" = "p_value",
                  "Benjamini--Hochberg p-value" = "p_value_BenjaminiHochberg") %>% 
    write.csv(., "tables/Table_S10_Region_wise_BOLD_SD_and_mean_FD_correlations.csv", row.names = F)

## Table S11: Univariate combo classification results

In [51]:
%%R -i univariate_p_values
# Save the resulting significant region performance to a CSV file
univariate_p_values %>%
  filter(Analysis_Type=="Univariate_Combo") %>%
  dplyr::select(Disorder, p_value, p_value_BenjaminiHochberg, Balanced_Accuracy, Balanced_Accuracy_SD) %>%
  mutate(Balanced_Accuracy = round(100*Balanced_Accuracy,1),
         Balanced_Accuracy_SD = round(100*Balanced_Accuracy_SD,1),
         p_value = format(p_value, digits=1)) %>%
  dplyr::rename("Balanced Accuracy" = "Balanced_Accuracy",
                "SD" = "Balanced_Accuracy_SD",
                "p-value" = "p_value") %>%
  mutate(Disorder = factor(Disorder, levels = c("SCZ", "BP", "ADHD", "ASD"))) %>%
  ungroup() %>%
  arrange(Disorder, desc(`Balanced Accuracy`)) %>%
  dplyr::select(Disorder, `Balanced Accuracy`, `SD`, `p-value`) %>%
  write.csv(., "tables/Table_S11_univariate_combo_results.csv", row.names = F)

## Table S12: Five-classifier robustness analysis comparison

In [64]:
%%R -i classifier_type_classification_res 

classifier_type_classification_res %>%
    dplyr::rename("feature_name" = "group_var") %>% 
    left_join(., univariate_TS_feature_info) %>% 
    dplyr::mutate(group_var = ifelse(is.na(Figure_name), feature_name, Figure_name),
                  Balanced_Accuracy = round(100*Balanced_Accuracy, 1),
                  Classifier_Type = case_when(Classifier_Type == "Linear_SVM_sklearn" ~ "Linear_SVM",
                                              Classifier_Type == "Linear_SVM_libsvm" ~ "Linear_SVM_L1_regularized",
                                              T ~ Classifier_Type)) %>% 
    dplyr::select(Study, Disorder, Analysis_Type, group_var, Classifier_Type, Balanced_Accuracy) %>%
    pivot_wider(id_cols=c(Study,Disorder,Analysis_Type,group_var), 
                names_from=Classifier_Type, values_from=Balanced_Accuracy) %>%
    dplyr::rename("Analysis" = "Analysis_Type",
                  "Model" = "group_var") %>%
    write.csv(., "tables/Table_S12_Robustness_classifier_type_results.csv", row.names=F)

Joining with `by = join_by(feature_name)`


## Table S13: Nested CV Analysis

In [66]:
%%R -i nested_CV_classification_res,univariate_balanced_accuracy_results 

nested_CV_classification_res %>%
    mutate(Classifier_Type = "Nested_CV") %>%
    plyr::rbind.fill(., univariate_balanced_accuracy_results %>% mutate(Classifier_Type = "Regular_CV")) %>%
    dplyr::rename("feature_name" = "group_var") %>% 
    left_join(., univariate_TS_feature_info) %>% 
    dplyr::mutate(group_var = ifelse(is.na(Figure_name), feature_name, Figure_name),
                  Balanced_Accuracy = round(100*Balanced_Accuracy, 1)) %>% 
    dplyr::select(Study, Disorder, Analysis_Type, group_var, Classifier_Type, Balanced_Accuracy) %>%
    pivot_wider(id_cols=c(Study,Disorder,Analysis_Type,group_var), 
                names_from=Classifier_Type, values_from=Balanced_Accuracy) %>%
    dplyr::rename("Analysis" = "Analysis_Type",
                  "Model" = "group_var") %>%
    write.csv(., "tables/Table_S13_Nested_CV_comparison_results.csv", row.names=F)

Joining with `by = join_by(feature_name)`


## Table S14: First 25 PCs from uni_combo by disorder

In [67]:
univariate_combo_first_25_PCs.head()

Unnamed: 0,index,Dim.1,Dim.2,Dim.3,Dim.4,Dim.5,Dim.6,Dim.7,Dim.8,Dim.9,...,Dim.20,Dim.21,Dim.22,Dim.23,Dim.24,Dim.25,Sample_ID,Diagnosis,Disorder,Study
0,Schizophrenia.1,7.763461,-11.992147,-15.470965,-13.912214,5.142571,-2.794405,-0.336542,9.285105,0.53504,...,-0.727008,1.479596,-11.084965,0.484343,1.700665,0.965886,sub-10159,Control,SCZ,UCLA_CNP
1,Schizophrenia.2,-1.154705,8.355952,12.421492,-1.755694,-0.562015,-5.013665,-0.394382,1.97837,-1.565298,...,-5.366631,4.716936,0.77283,-6.248253,-3.226295,-0.38939,sub-10171,Control,SCZ,UCLA_CNP
2,Schizophrenia.3,-14.731994,-2.146882,-7.468358,-12.924348,2.766982,6.065319,-9.053134,-3.399242,-2.903294,...,1.674169,-3.610891,-1.230216,-4.783337,-2.20763,3.442107,sub-10189,Control,SCZ,UCLA_CNP
3,Schizophrenia.4,16.214803,6.741375,-8.986557,-1.938153,2.17243,-11.955224,-7.993688,-2.246972,-2.669048,...,3.664696,-1.308004,0.997495,-6.063672,4.146329,5.984317,sub-10206,Control,SCZ,UCLA_CNP
4,Schizophrenia.5,15.340841,-6.730533,-3.957819,-9.995718,2.286904,-4.781978,-2.266357,-8.863625,3.900799,...,-0.885412,2.598588,1.778124,0.915183,3.598612,4.443876,sub-10217,Control,SCZ,UCLA_CNP


In [70]:
%%R -i univariate_combo_first_25_PCs

univariate_combo_first_25_PCs %>%
    dplyr::select(Sample_ID, Study, Diagnosis, `Dim.1`:`Dim.25`) %>% 
    write.csv("tables/Table_S14_univariate_combo_first_25_PCs.csv", row.names=F)

## Table S15: pyspi SPI-wise classification results

In [72]:
%%R -i pairwise_p_values,pairwise_TS_feature_info
# Save the resulting significant region performance to a CSV file
pairwise_p_values %>%
  dplyr::select(Disorder, group_var, p_value, p_value_BenjaminiHochberg, Balanced_Accuracy, Balanced_Accuracy_SD) %>%
  dplyr::rename("pyspi_name" = "group_var") %>%
  left_join(., pairwise_TS_feature_info) %>%
  dplyr::rename("SPI" = "Figure_name") %>%
  mutate(Balanced_Accuracy = round(100*Balanced_Accuracy,1),
         Balanced_Accuracy_SD = round(100*Balanced_Accuracy_SD,1),
         p_value = format(p_value, digits=1),
         p_value_BenjaminiHochberg = format(p_value_BenjaminiHochberg, digits=1)) %>%
  dplyr::rename("Balanced Accuracy" = "Balanced_Accuracy",
                "SD" = "Balanced_Accuracy_SD",
                "p-value" = "p_value",
                "Benjamini--Hochberg p-value" = "p_value_BenjaminiHochberg") %>%
  mutate(Disorder = factor(Disorder, levels = c("SCZ", "BP", "ADHD", "ASD"))) %>%
  ungroup() %>%
  arrange(Disorder, desc(`Balanced Accuracy`)) %>%
  dplyr::select(Disorder, SPI, `Balanced Accuracy`, `SD`, `p-value`, `Benjamini--Hochberg p-value`) %>%
  write.csv(., "tables/Table_S15_pairwise_SPI_wise_results.csv", row.names = F)

Joining with `by = join_by(pyspi_name)`


## Table 16: SPI-wise similarity indices

In [74]:
spearman_rho_SPI_df = pd.read_csv("data/time_series_features/SPI_performance_correlation_across_disorders.csv")


Unnamed: 0.1,Unnamed: 0,index,SPI1,SPI2,Spearman_Rho
0,0,0,cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5,cov_EmpiricalCovariance,0.063558
1,1,0,cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5,di_gaussian,0.685875
2,2,0,cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5,dtw_constraint-itakura,-0.07611
3,3,0,cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5,pec,0.277649
4,4,0,cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5,phi_star_t-1_norm-0,0.568166


In [81]:
%%R -i spearman_rho_SPI_df 

spearman_rho_SPI_df %>% 
    plyr::rbind.fill(., spearman_rho_SPI_df %>% mutate(SPI1_new = SPI2, SPI2_new = SPI1) %>% 
                            dplyr::select(-SPI1, -SPI2) %>% 
                            dplyr::rename("SPI1" = "SPI1_new",
                                          "SPI2" = "SPI2_new")) %>% 
    mutate(spearman_abs_corr = round(abs(Spearman_Rho), 2)) %>%
    dplyr::select(SPI1, SPI2, spearman_abs_corr) %>% 
    arrange(SPI1) %>%
    dplyr::rename("Absolute Spearman correlation" = "spearman_abs_corr") %>%
    write.csv(., "tables/Table_S16_SPI_wise_correlation_across_disorders.csv", row.names=F)

## Table S17: SPI with local univariate dynamics performance

In [83]:
%%R -i combined_univariate_pairwise_p_values,pairwise_TS_feature_info
# Save the resulting significant region performance to a CSV file
combined_univariate_pairwise_p_values %>%
  dplyr::select(Disorder, group_var, p_value, p_value_BenjaminiHochberg, Balanced_Accuracy, Balanced_Accuracy_SD) %>%
  dplyr::rename("pyspi_name" = "group_var") %>%
  left_join(., pairwise_TS_feature_info) %>%
  dplyr::rename("SPI" = "Figure_name") %>%
  mutate(Balanced_Accuracy = round(100*Balanced_Accuracy,1),
         Balanced_Accuracy_SD = round(100*Balanced_Accuracy_SD,1),
         p_value = format(p_value, digits=1),
         p_value_BenjaminiHochberg = format(p_value_BenjaminiHochberg, digits=1)) %>%
  dplyr::rename("Balanced Accuracy" = "Balanced_Accuracy",
                "SD" = "Balanced_Accuracy_SD",
                "p-value" = "p_value",
                "Benjamini--Hochberg p-value" = "p_value_BenjaminiHochberg") %>%
  mutate(Disorder = factor(Disorder, levels = c("SCZ", "BP", "ADHD", "ASD"))) %>%
  ungroup() %>%
  dplyr::select(Disorder, SPI, `Balanced Accuracy`, `SD`, `p-value`, `Benjamini--Hochberg p-value`) %>%
  write.csv(., "tables/Table_S17_combined_univariate_pairwise_SPI_wise_results.csv", row.names = F)

Joining with `by = join_by(pyspi_name)`


## Table S18: Corrected T-statistics comparing how each SPI performs with versus without local univariate dynamics

In [87]:
%%R -i pairwise_balanced_accuracy_results_all_folds,combined_univariate_pairwise_balanced_accuracy_results_all_folds,pairwise_p_values,combined_univariate_pairwise_p_values,study_group_df,UCLA_CNP_metadata,ABIDE_metadata,pairwise_TS_feature_info -o corrected_SPI_T_res

source("/Users/abry4213/github/correctR/R/repkfold_ttest.R")
source("code/data_visualization/visualization_helper_functions.R")

results_df = plyr::rbind.fill(pairwise_balanced_accuracy_results_all_folds %>%
                                left_join(pairwise_p_values %>%
                                dplyr::select(Study, Disorder, group_var, Classifier_Type, Analysis_Type, p_value_BenjaminiHochberg), 
                                by = join_by(group_var, Analysis_Type, Disorder, Study, Classifier_Type)), 
                              combined_univariate_pairwise_balanced_accuracy_results_all_folds %>% 
                              left_join(combined_univariate_pairwise_p_values %>%
                                dplyr::select(Study, Disorder, group_var, Classifier_Type, Analysis_Type, p_value_BenjaminiHochberg), 
                                by = join_by(group_var, Analysis_Type, Disorder, Study, Classifier_Type)))

corrected_SPI_T_res <- 1:nrow(study_group_df) %>%
  purrr::map_df(~ run_correctR_group(disorder = study_group_df$Disorder[.x],
                                     study = study_group_df$Study[.x],
                                     metadata = plyr::rbind.fill(UCLA_CNP_metadata, ABIDE_metadata),
                                     results_df = results_df)) %>%
  left_join(., pairwise_TS_feature_info, by=c("SPI"="pyspi_name"))

# Write to table 
corrected_SPI_T_res %>% 
    dplyr::select(Disorder, Figure_name, statistic, p.value, p_value_corr_BenjaminiHochberg) %>% 
    mutate(p_value = format(p.value, digits=1),
              p_value_corr_BenjaminiHochberg = format(p_value_corr_BenjaminiHochberg, digits=1),
              statistic = -1*statistic) %>%
    dplyr::rename("SPI" = "Figure_name",
                  "T-statistic" = "statistic",
                  "p-value" = "p.value",
                  "Benjamini--Hochberg p-value" = "p_value_corr_BenjaminiHochberg") %>%
    write.csv(., "tables/Table_S18_SPI_wise_corrected_T_test_results.csv", row.names = F)


[1] "error for disorder"
[1] "ADHD"


## Table S19: One-shot approach to case--control classification by representation type

In [5]:
training_balacc_results = pd.read_feather("data/classification_results/robustness_analysis/all_training_balacc_results.feather")

## Using PCA to make dimensionality consistent across representations
all_first_10_PCs_classification_results = pd.read_feather("data/classification_results/robustness_analysis/all_first_10_PCs_classification_results.feather")

In [17]:
%%R -i univariate_TS_feature_info,all_first_10_PCs_classification_results,training_balanced_accuracy_results,pairwise_balanced_accuracy_results_all_folds,combined_univariate_pairwise_balanced_accuracy_results_all_folds

SPI_first_10_PCs_classification_best_per_fold <- all_first_10_PCs_classification_results %>% 
    filter(Analysis_Type %in% c('pyspi14_SPI', 'SPI_Combo')) %>% 
    mutate(Analysis_Type = ifelse(Analysis_Type=="pyspi14_SPI", "SPI", Analysis_Type)) %>% 
    group_by(Analysis_Type, Disorder, Study, Fold, Repeat) %>% 
    arrange(desc(Train_Balanced_Accuracy)) %>% 
    mutate(Training_Rank = dense_rank(-Train_Balanced_Accuracy)) %>% 
    filter(Training_Rank==1)

# Subset main pairwise results to these SPI selections
SPI_balanced_accuracy_chosen_by_PCA <- plyr::rbind.fill(list(pairwise_balanced_accuracy_results_all_folds,
                                                             combined_univariate_pairwise_balanced_accuracy_results_all_folds)) %>%
                                       semi_join(., SPI_first_10_PCs_classification_best_per_fold) %>% 
                                       # Deal with ties
                                       group_by(Analysis_Type, Disorder, Study, Fold, Repeat) %>% 
                                       summarise(Balanced_Accuracy = mean(Balanced_Accuracy, na.rm=T), 
                                                 group_var = paste0(group_var, collapse = " "))


# Take only the top-performing region and time-series feature from training data folds
intra_regional_balanced_accuracy_best_from_in_sample_all_folds <- training_balanced_accuracy_results %>%
  filter(Analysis_Type %in% c("Brain_Region", "catch25_feature", "Univariate_Combo")) %>%
  dplyr::rename("feature_name" = "group_var") %>% 
  left_join(., univariate_TS_feature_info) %>% 
  dplyr::mutate(group_var = ifelse(is.na(Figure_name), feature_name, Figure_name)) %>%
  group_by(Study, Disorder, Analysis_Type, Fold, Repeat) %>%
  filter(Train_Balanced_Accuracy==max(Train_Balanced_Accuracy)) %>%
  # Take average in case of ties
  summarise(Balanced_Accuracy = mean(Balanced_Accuracy), 
            group_var = paste0(group_var, collapse = " ")) %>%
  ungroup() 

# Summarize data
chosen_model_per_represenatation_all <- intra_regional_balanced_accuracy_best_from_in_sample_all_folds %>%
  plyr::rbind.fill(., SPI_balanced_accuracy_chosen_by_PCA) %>%
  mutate(Analysis_Type = factor(Analysis_Type, levels = rev(c("Brain_Region", "catch25_feature", "Univariate_Combo", 
                                                          "SPI", "SPI_Combo"))), 
         Disorder = factor(Disorder, levels = c("SCZ", "BP", "ADHD", "ASD"))) 
  
chosen_model_per_represenatation_all %>% 
    mutate(Balanced_Accuracy = round(100*Balanced_Accuracy, 1),
           Fold = Fold + 1, 
           Repeat = Repeat + 1) %>%
    dplyr::rename("Analysis" = "Analysis_Type", 
                  "Balanced Accuracy" = "Balanced_Accuracy") %>%
    write.csv(., "tables/Table_S19_One_shot_classification_model_results.csv", row.names = F)

Joining with `by = join_by(Fold, Repeat, Analysis_Type, group_var, Study,
Disorder)`
`summarise()` has grouped output by 'Analysis_Type', 'Disorder', 'Study',
'Fold'. You can override using the `.groups` argument.
Joining with `by = join_by(feature_name)`
`summarise()` has grouped output by 'Study', 'Disorder', 'Analysis_Type',
'Fold'. You can override using the `.groups` argument.
