In [1]:
import os
import numpy as np
import nibabel as nib
import pandas as pd
import sys

# Uncomment the next two lines if you're running this on an HPC and want to speed up scikit-learn
# from sklearnex import patch_sklearn
# patch_sklearn()

%load_ext rpy2.ipython

In [2]:
%%R
# Load tidyverse R package
suppressPackageStartupMessages({
    library(tidyverse)
})

# Running linear SVMs

This Jupyter notebook includes all code needed to perform 10-repeated 10-fold cross-validation analysis using linear support vector machine (SVM) classifiers for each of the five representations.
First, we will import all functions from the `code/classification_analysis/core_classification_functions.py` script.

In [3]:
# Make the project's classification helpers importable and pull them into the notebook namespace
sys.path.insert(0, 'code/classification_analysis/')
from core_classification_functions import *
current_path = os.getcwd()


def _keep_retained_samples(sample_table, retained_samples):
    """Return the rows of `sample_table` whose Sample_ID appears in `retained_samples.Sample_ID`."""
    return sample_table[sample_table["Sample_ID"].isin(retained_samples.Sample_ID)]


# Participants retained for each study
UCLA_CNP_subjects_to_keep = pd.read_feather("data/time_series_features/UCLA_CNP_filtered_sample_info_catch25_pyspi14.feather")
ABIDE_subjects_to_keep = pd.read_feather("data/time_series_features/ABIDE_filtered_sample_info_catch25_pyspi14.feather")

# Sample metadata, labelled by study and restricted to retained participants
UCLA_CNP_metadata = _keep_retained_samples(
    pd.read_feather("data/input_data/UCLA_CNP_sample_metadata.feather").assign(Study="UCLA_CNP"),
    UCLA_CNP_subjects_to_keep)
ABIDE_metadata = _keep_retained_samples(
    pd.read_feather("data/input_data/ABIDE_sample_metadata.feather").assign(Study="ABIDE"),
    ABIDE_subjects_to_keep)
# HCP100 metadata is loaded as-is (no study label, no participant filter)
HCP100_metadata = pd.read_feather("data/input_data/HCP100_sample_metadata.feather")

# Head movement (comma-separated text files), labelled and filtered the same way
UCLA_CNP_head_mvmt = _keep_retained_samples(
    pd.read_csv('data/movement_data/UCLA_CNP_Mean_FD_Power.txt').assign(Study="UCLA_CNP"),
    UCLA_CNP_subjects_to_keep)
# Column types are fixed explicitly for ABIDE so that Sample_ID is read as a string
ABIDE_head_mvmt = _keep_retained_samples(
    pd.read_csv('data/movement_data/ABIDE_Mean_FD_Power.txt',
                dtype={'Sample_ID': str, 'Mean_FD_Power': float}).assign(Study="ABIDE"),
    ABIDE_subjects_to_keep)

# Stack the two studies, then join metadata to head movement.
# No join keys are given, so pandas joins on every column the two tables share.
combined_metadata = pd.concat([UCLA_CNP_metadata, ABIDE_metadata], axis=0)
combined_head_mvmt = pd.concat([UCLA_CNP_head_mvmt, ABIDE_head_mvmt], axis=0)
merged_metadata = combined_metadata.merge(combined_head_mvmt)

# Which study each disorder's data comes from
study_disorder_lookup = {
    'SCZ': 'UCLA_CNP',
    'BP': 'UCLA_CNP',
    'ADHD': 'UCLA_CNP',
    'ASD': 'ABIDE',
}

In [4]:


# ---- Parameters that you can change ----
univariate_feature_set = "catch25"
pairwise_feature_set = "pyspi14"
classifier_type = "Linear_SVM"
num_jobs = 1 # You can increase this if you are running this on an HPC with multiple cores available
SPI_directionality_file = f"{current_path}/code/classification_analysis/SPI_Direction_Info.csv"

# ---- Feature metadata (univariate and pairwise) ----
univariate_feature_info = pd.read_csv('data/feature_info/univariate_feature_info.csv')
pairwise_feature_info = pd.read_csv('data/feature_info/pairwise_feature_info.csv')

# ---- Univariate time-series features, one table per dataset ----
UCLA_CNP_univariate_features = pd.read_feather('data/time_series_features/UCLA_CNP_catch25_filtered.feather')
ABIDE_univariate_features = pd.read_feather('data/time_series_features/ABIDE_catch25_filtered.feather')
HCP100_univariate_features = pd.read_feather('data/time_series_features/HCP100_catch25_filtered.feather')

# The pairwise feature tables are not loaded in this cell; the loads are kept here, disabled:
# UCLA_CNP_pairwise_features = pd.read_feather('data/time_series_features/UCLA_CNP_pyspi14_filtered.feather')
# ABIDE_pairwise_features = pd.read_feather('data/time_series_features/ABIDE_pyspi14_filtered.feather')

# ---- First 25 principal components of the univariate region-by-feature combination matrices ----
univariate_combo_first25_PCs = pd.read_feather('data/time_series_features/univariate_combo_first25_PCs.feather')

## Core classification settings

In [5]:
# Cross-validation design: number of folds, number of repeats, and number of null iterations per model
num_folds, num_repeats, num_null_iters = 10, 10, 1000

# Fixed random_state so the fold assignments are the same on every run
RepeatedStratifiedKFold_splitter = RepeatedStratifiedKFold(n_repeats=num_repeats,
                                                           n_splits=num_folds,
                                                           random_state=127)

# Linear-kernel SVM with balanced class weights, preceded by the project's mixed-sigmoid scaler
model = svm.SVC(C=1, class_weight="balanced", kernel="linear")
scaling_step = ('scaler', MixedSigmoidScaler(unit_variance=True))
pipe = Pipeline([scaling_step, ('model', model)])

# Balanced accuracy is the only performance metric; names and scorers are matched by position
scoring_names = ["Balanced_Accuracy"]
scorers = [make_scorer(balanced_accuracy_score)]


## $\mathrm{A_{region}}$, $\mathrm{A_{feature}}$, and $\mathrm{A_{uni\_combo}}$: Fitting linear SVMs to intra-regional univariate time-series properties

In [16]:
# Fit the linear SVM pipeline to every univariate model listed for each disorder and
# save the main and null balanced accuracy results as feather files.
for disorder, study in study_disorder_lookup.items():

    # Load class labels and sample IDs
    class_labels = np.load(f"data/input_data/{study}_{disorder}_class_labels.npy", allow_pickle=True).tolist()
    sample_IDs = np.load(f"data/input_data/{study}_{disorder}_sample_IDs.npy", allow_pickle=True).tolist()

    # to_feather() does not create missing parent folders, so make sure the
    # per-disorder output folders exist before any (slow) model fit is started
    main_output_dir = f"data/classification_results/balanced_accuracy/{study}_{disorder}"
    null_output_dir = f"data/classification_results/null_results/{study}_{disorder}"
    os.makedirs(main_output_dir, exist_ok=True)
    if num_null_iters > 0:
        os.makedirs(null_output_dir, exist_ok=True)

    # One model name per line, no header
    study_disorder_univariate_models = pd.read_table(f"data/time_series_features/processed_numpy_files/{study}_{disorder}_univariate_models.txt",
                                                     header=None)
    study_disorder_univariate_models.columns = ["Model_Name"]

    # The loop variable is intentionally not called `model`: that name holds the
    # SVC estimator defined in the core classification settings cell
    for model_name in study_disorder_univariate_models.Model_Name:

        # Derive the analysis type and grouping variable from the model name
        if "ROI" in model_name:
            Analysis_Type = "Brain_Region"
            grouping_var = model_name.split("_ROI_")[1]
        elif "combo_catch25_features_all_regions" in model_name:
            Analysis_Type = "Univariate_Combo"
            grouping_var = "Combo"
        else:
            Analysis_Type = "catch25_feature"
            grouping_var = model_name.split("_catch25_feature_")[1]

        # Read in the tabular data for the corresponding model
        model_data = np.load(f"data/time_series_features/processed_numpy_files/{model_name}.npy")

        # Define main output data file for this feature
        main_output_file_base = f"{study}_{disorder}_{Analysis_Type}_{grouping_var}_{classifier_type}_{num_repeats}_repeats_{num_folds}_folds_CV"

        # Fit classifier (the second return value is not used here)
        main_classification_res, _, null_classification_res = run_k_fold_classifier_for_feature(
            feature_data=model_data,
            pipe=pipe,
            CV_splitter=RepeatedStratifiedKFold_splitter,
            class_labels=class_labels,
            sample_IDs=sample_IDs,
            scorers=scorers,
            scoring_names=scoring_names,
            num_null_iters=num_null_iters,
            num_folds=num_folds,
            num_repeats=num_repeats,
            num_jobs=num_jobs)

        # Key details attached to every results dataframe, in this column order
        result_details = {"Study": study,
                          "Disorder": disorder,
                          "Analysis_Type": Analysis_Type,
                          "group_var": grouping_var,
                          "Classifier_Type": classifier_type}

        # Save main results to feather file
        for column_name, value in result_details.items():
            main_classification_res[column_name] = value
        main_classification_res.to_feather(f"{main_output_dir}/{main_output_file_base}_main_res.feather")

        # If nulls were requested, save those too
        if num_null_iters > 0:
            for column_name, value in result_details.items():
                null_classification_res[column_name] = value
            null_classification_res.to_feather(f"{null_output_dir}/{main_output_file_base}_nulls.feather")



After running this code, you should have the following two folders in your `data/classification_results/` directory:
- `balanced_accuracy/`: contains individual `.feather` files with the fold-wise balanced accuracy results for each disorder
- `null_results/`: contains individual `.feather` files with the null balanced accuracy distributions for each disorder

## $\mathrm{A_{FC}}$ and $\mathrm{A_{FC\_combo}}$: Fitting linear SVMs to inter-regional bivariate time-series properties

In [None]:
def _fit_and_save_pairwise_model(model_name, Analysis_Type, grouping_var, study, disorder,
                                 class_labels, sample_IDs):
    """Fit the linear SVM pipeline to one saved feature matrix and write its results to disk.

    The feature matrix is read from
    `data/time_series_features/processed_numpy_files/{model_name}.npy` and passed to
    `run_k_fold_classifier_for_feature` together with the notebook-level settings
    (`pipe`, `RepeatedStratifiedKFold_splitter`, `scorers`, `scoring_names`,
    `num_null_iters`, `num_folds`, `num_repeats`, `num_jobs`).

    The main results are always saved under
    `data/classification_results/balanced_accuracy/{study}_{disorder}/`; the null
    results are saved under `data/classification_results/null_results/{study}_{disorder}/`
    only when `num_null_iters > 0`. Both tables are tagged with the study, disorder,
    analysis type, grouping variable and classifier type before saving.

    Parameters
    ----------
    model_name : str
        Name of the model, i.e. the stem of its `.npy` file.
    Analysis_Type : str
        Analysis label written to the results and used in the output file name.
    grouping_var : str
        Grouping label written to the `group_var` column and used in the output file name.
    study, disorder : str
        Study and disorder the model belongs to.
    class_labels, sample_IDs : list
        Class labels and sample IDs forwarded to `run_k_fold_classifier_for_feature`.
    """
    # Read in the tabular data for the corresponding model
    model_data = np.load(f"data/time_series_features/processed_numpy_files/{model_name}.npy")

    # Define main output data file for this feature
    main_output_file_base = f"{study}_{disorder}_{Analysis_Type}_{grouping_var}_{classifier_type}_{num_repeats}_repeats_{num_folds}_folds_CV"

    # Fit classifier (the second return value is not used here)
    main_classification_res, _, null_classification_res = run_k_fold_classifier_for_feature(
        feature_data=model_data,
        pipe=pipe,
        CV_splitter=RepeatedStratifiedKFold_splitter,
        class_labels=class_labels,
        sample_IDs=sample_IDs,
        scorers=scorers,
        scoring_names=scoring_names,
        num_null_iters=num_null_iters,
        num_folds=num_folds,
        num_repeats=num_repeats,
        num_jobs=num_jobs)

    # Key details attached to every results dataframe, in this column order
    result_details = {"Study": study,
                      "Disorder": disorder,
                      "Analysis_Type": Analysis_Type,
                      "group_var": grouping_var,
                      "Classifier_Type": classifier_type}

    # Save main results; to_feather() does not create missing parent folders
    main_output_dir = f"data/classification_results/balanced_accuracy/{study}_{disorder}"
    os.makedirs(main_output_dir, exist_ok=True)
    for column_name, value in result_details.items():
        main_classification_res[column_name] = value
    main_classification_res.to_feather(f"{main_output_dir}/{main_output_file_base}_main_res.feather")

    # If nulls were requested, save those too
    if num_null_iters > 0:
        null_output_dir = f"data/classification_results/null_results/{study}_{disorder}"
        os.makedirs(null_output_dir, exist_ok=True)
        for column_name, value in result_details.items():
            null_classification_res[column_name] = value
        null_classification_res.to_feather(f"{null_output_dir}/{main_output_file_base}_nulls.feather")


for disorder, study in study_disorder_lookup.items():

    # Load class labels and sample IDs
    class_labels = np.load(f"data/input_data/{study}_{disorder}_class_labels.npy", allow_pickle=True).tolist()
    sample_IDs = np.load(f"data/input_data/{study}_{disorder}_sample_IDs.npy", allow_pickle=True).tolist()

    # Pairwise A_FC first: one model per pyspi14 SPI
    study_disorder_pairwise_models = pd.read_table(f"data/time_series_features/processed_numpy_files/{study}_{disorder}_pairwise_models.txt",
                                                   header=None)
    study_disorder_pairwise_models.columns = ["Model_Name"]

    # The loop variable is intentionally not called `model`: that name holds the
    # SVC estimator defined in the core classification settings cell
    for model_name in study_disorder_pairwise_models.Model_Name:
        _fit_and_save_pairwise_model(model_name=model_name,
                                     Analysis_Type="pyspi14_SPI",
                                     grouping_var=model_name.split("_pyspi14_SPI_")[1],
                                     study=study,
                                     disorder=disorder,
                                     class_labels=class_labels,
                                     sample_IDs=sample_IDs)

    # Then A_FC_combo: each SPI combined with the univariate catch25 features
    study_disorder_combined_univariate_pairwise_models = pd.read_table(f"data/time_series_features/processed_numpy_files/{study}_{disorder}_combined_univariate_pairwise_models.txt",
                                                                       header=None)
    study_disorder_combined_univariate_pairwise_models.columns = ["Model_Name"]

    for model_name in study_disorder_combined_univariate_pairwise_models.Model_Name:
        _fit_and_save_pairwise_model(model_name=model_name,
                                     Analysis_Type="SPI_Combo",
                                     grouping_var=model_name.split("combined_univariate_catch25_and_pyspi14_SPI_")[1],
                                     study=study,
                                     disorder=disorder,
                                     class_labels=class_labels,
                                     sample_IDs=sample_IDs)

## Prepping results for visualization

Now that we have fit all linear SVMs and null models for each of the five representations, we can tabulate data into a format that is amenable to statistical analysis and data visualization.

First, we will compile balanced accuracy and null results that were saved separately across disorders into one feather file.

In [6]:
# Compile the per-disorder balanced accuracy files of each representation into one table.
# NOTE(review): the input files read below live directly in balanced_accuracy/ and are not
# written by the model-fitting cells visible above (those write *_main_res.feather files into
# per-disorder subfolders) -- confirm which step produces them.
result_types = ("univariate", "pairwise", "combined_univariate_pairwise")

# One list of per-disorder tables per representation: main results and results for all folds
main_frames = {result_type: [] for result_type in result_types}
all_folds_frames = {result_type: [] for result_type in result_types}

# Iterate over the four disorders
for disorder, study in study_disorder_lookup.items():
    results_prefix = f"data/classification_results/balanced_accuracy/{study}_{disorder}"

    for result_type in result_types:
        # Main results for this representation, labelled with study and disorder
        main_frames[result_type].append(
            pd.read_feather(f"{results_prefix}_{result_type}_Linear_SVM_sklearn_balanced_accuracy.feather")
            .assign(Study=study, Disorder=disorder))

        # Results for all folds, labelled the same way
        all_folds_frames[result_type].append(
            pd.read_feather(f"{results_prefix}_{result_type}_Linear_SVM_sklearn_balanced_accuracy_all_folds.feather")
            .assign(Study=study, Disorder=disorder))

# Expose the collected lists under their notebook-level names
univariate_balanced_accuracy_results_list = main_frames["univariate"]
pairwise_balanced_accuracy_results_list = main_frames["pairwise"]
combined_univariate_pairwise_balanced_accuracy_results_list = main_frames["combined_univariate_pairwise"]
univariate_balanced_accuracy_results_all_folds_list = all_folds_frames["univariate"]
pairwise_balanced_accuracy_results_all_folds_list = all_folds_frames["pairwise"]
combined_univariate_pairwise_balanced_accuracy_results_all_folds_list = all_folds_frames["combined_univariate_pairwise"]

# Stack the disorders and renumber the rows
univariate_balanced_accuracy_results = pd.concat(univariate_balanced_accuracy_results_list).reset_index(drop=True)
univariate_balanced_accuracy_results_all_folds = pd.concat(univariate_balanced_accuracy_results_all_folds_list).reset_index(drop=True)
pairwise_balanced_accuracy_results = pd.concat(pairwise_balanced_accuracy_results_list).reset_index(drop=True)
pairwise_balanced_accuracy_results_all_folds = pd.concat(pairwise_balanced_accuracy_results_all_folds_list).reset_index(drop=True)
combined_univariate_pairwise_balanced_accuracy_results = pd.concat(combined_univariate_pairwise_balanced_accuracy_results_list).reset_index(drop=True)
combined_univariate_pairwise_balanced_accuracy_results_all_folds = pd.concat(combined_univariate_pairwise_balanced_accuracy_results_all_folds_list).reset_index(drop=True)

# Add 1 to the Fold and Repeat columns of every all-folds table
for fold_level_results in (univariate_balanced_accuracy_results_all_folds,
                           pairwise_balanced_accuracy_results_all_folds,
                           combined_univariate_pairwise_balanced_accuracy_results_all_folds):
    fold_level_results['Fold'] = fold_level_results['Fold'] + 1
    fold_level_results['Repeat'] = fold_level_results['Repeat'] + 1

# Save each compiled table; reset_index() writes the row number as an extra `index` column
compiled_tables = [
    (univariate_balanced_accuracy_results, "univariate_balanced_accuracy_results"),
    (univariate_balanced_accuracy_results_all_folds, "univariate_balanced_accuracy_results_all_folds"),
    (pairwise_balanced_accuracy_results, "pairwise_balanced_accuracy_results"),
    (pairwise_balanced_accuracy_results_all_folds, "pairwise_balanced_accuracy_results_all_folds"),
    (combined_univariate_pairwise_balanced_accuracy_results, "combined_univariate_pairwise_balanced_accuracy_results"),
    (combined_univariate_pairwise_balanced_accuracy_results_all_folds, "combined_univariate_pairwise_balanced_accuracy_results_all_folds"),
]
for compiled_table, output_name in compiled_tables:
    compiled_table.reset_index().to_feather(f"data/classification_results/{output_name}.feather")

In [8]:
# We can also compile the results for L1-regularized SVM and PCA-based SVM across the four disorders
# Compile the L1-regularized SVM and PCA-based SVM results across the four disorders
L1_regularized_balanced_accuracy_list = []
PCA_balanced_accuracy_list = []


def _stack_disorder_results(per_disorder_results):
    """Stack per-disorder tables, renumber the rows, and drop the stored `index`/`level_0` columns."""
    stacked = pd.concat(per_disorder_results, axis=0)
    stacked = stacked.reset_index(level=0, drop=True)
    return stacked.drop(['index', 'level_0'], axis=1)


for disorder, study in study_disorder_lookup.items():
    # Shared start of both input file names for this disorder
    file_stem = f"data/classification_results/balanced_accuracy/{study}_{disorder}_Univariate_{univariate_feature_set}_{classifier_type}_Combo"

    # L1-regularized SVM results
    L1_regularized_results = pd.read_feather(f"{file_stem}_L1_Regularized_balanced_accuracy.feather")
    L1_regularized_results = L1_regularized_results.assign(Study=study, Disorder=disorder)
    L1_regularized_balanced_accuracy_list.append(L1_regularized_results)

    # PCA-based SVM results (first 25 principal components)
    PCA_results = pd.read_feather(f"{file_stem}_25_PCs_balanced_accuracy.feather")
    PCA_results = PCA_results.assign(Study=study, Disorder=disorder)
    PCA_balanced_accuracy_list.append(PCA_results)

# One balanced accuracy dataframe per classifier variant
L1_regularized_balanced_accuracy_results = _stack_disorder_results(L1_regularized_balanced_accuracy_list)
PCA_balanced_accuracy_results = _stack_disorder_results(PCA_balanced_accuracy_list)

# Save each to its own feather file
L1_regularized_balanced_accuracy_results.to_feather("data/classification_results/all_L1_regularized_balanced_accuracy_results.feather")
PCA_balanced_accuracy_results.to_feather("data/classification_results/all_PCA_balanced_accuracy_results.feather")

We'll do the same compilation for the null balanced accuracy distributions:

In [7]:
# Compile the per-disorder null balanced accuracy distributions of each representation
null_result_types = ("univariate", "pairwise", "combined_univariate_pairwise")
null_frames = {null_result_type: [] for null_result_type in null_result_types}

# Iterate over the four disorders
for disorder, study in study_disorder_lookup.items():
    null_prefix = f"data/classification_results/null_results/{study}_{disorder}"

    for null_result_type in null_result_types:
        # Null distribution for this representation, labelled with study and disorder
        null_frames[null_result_type].append(
            pd.read_feather(f"{null_prefix}_{null_result_type}_Linear_SVM_sklearn_null_balanced_accuracy_distributions.feather")
            .assign(Study=study, Disorder=disorder))

# Expose the collected lists under their notebook-level names
univariate_null_distribution_list = null_frames["univariate"]
pairwise_null_distribution_list = null_frames["pairwise"]
combined_univariate_pairwise_null_distribution_list = null_frames["combined_univariate_pairwise"]

# Stack the disorders; reset_index() keeps the previous row labels as an `index` column
univariate_null_distribution = pd.concat(univariate_null_distribution_list, axis=0).reset_index()
pairwise_null_distribution = pd.concat(pairwise_null_distribution_list, axis=0).reset_index()
combined_univariate_pairwise_null_distribution = pd.concat(combined_univariate_pairwise_null_distribution_list, axis=0).reset_index()

# Save each null distribution table (the index is reset once more before writing)
null_tables = [
    (univariate_null_distribution, "univariate_null_balanced_accuracy_distribution"),
    (pairwise_null_distribution, "pairwise_null_balanced_accuracy_distribution"),
    (combined_univariate_pairwise_null_distribution, "combined_univariate_pairwise_null_balanced_accuracy_distribution"),
]
for null_table, output_name in null_tables:
    null_table.reset_index().to_feather(f"data/classification_results/{output_name}.feather")


Next, we'll define a function to compute a $p$-value for a given observed balanced accuracy based on the corresponding empirical null distribution, comprising 1000 null balanced accuracy estimates for the given model.
From this distribution, we will derive the mean and SD of the null balanced accuracy; these parameterize a Gaussian distribution whose cumulative distribution function is used to obtain a $p$-value.

In [8]:
%%R -o compute_p_values

# Compute a p-value for observed balanced accuracy values from a Gaussian fitted to
# the matching null distribution.
#   observed_balanced_accuracy_df: observed results; needs Analysis_Type, Disorder, Study,
#                                  group_var and Balanced_Accuracy columns
#   null_distribution_df:          null results with the same columns
# Returns observed_balanced_accuracy_df with an added p_value column.
compute_p_values <- function(observed_balanced_accuracy_df, null_distribution_df) {

  # Models present in the observed results
  observed_models <- observed_balanced_accuracy_df %>%
      dplyr::select(Analysis_Type, Disorder, Study, Balanced_Accuracy, group_var)

  # Null balanced accuracies for the same analysis type, disorder, study and grouping variable
  matched_null_accuracies <- null_distribution_df %>%
      dplyr::select(Analysis_Type, Disorder, Study, Balanced_Accuracy, group_var) %>%
      semi_join(observed_models, by = join_by(Analysis_Type, Disorder, Study, group_var)) %>%
      dplyr::pull(Balanced_Accuracy)

  # Upper-tail probability of the observed value under a Gaussian with the
  # mean and standard deviation of the matched null distribution
  observed_balanced_accuracy_df$p_value <- pnorm(q = observed_balanced_accuracy_df$Balanced_Accuracy,
                                                 mean = mean(matched_null_accuracies),
                                                 sd = sd(matched_null_accuracies),
                                                 lower.tail = FALSE)

  observed_balanced_accuracy_df
}


Now, we will compute $p$-values for all balanced accuracy results and save the results to a feather file.

In [11]:
%%R -i compute_p_values,univariate_balanced_accuracy_results,univariate_null_distribution,pairwise_balanced_accuracy_results,pairwise_null_distribution,combined_univariate_pairwise_balanced_accuracy_results,combined_univariate_pairwise_null_distribution -o univariate_p_values,pairwise_p_values,combined_univariate_pairwise_p_values

# Compute one p-value per model, then correct for multiple comparisons within
# each study, disorder, and analysis type
compute_corrected_p_values <- function(balanced_accuracy_results, null_distribution) {
    balanced_accuracy_results %>%
        # Split into one data frame per study, disorder, analysis type, and grouping variable
        group_by(Study, Disorder, Analysis_Type, group_var) %>%
        group_split() %>%
        # p-value of each model against the supplied null distribution table
        purrr::map_df(~ compute_p_values(observed_balanced_accuracy_df = .x,
                                         null_distribution_df = null_distribution)) %>%
        # Adjust p-values by group
        group_by(Study, Disorder, Analysis_Type) %>%
        mutate(p_value_HolmBonferroni = p.adjust(p_value, method="holm"),
               p_value_BenjaminiHochberg = p.adjust(p_value, method="BH"))
}

univariate_p_values <- compute_corrected_p_values(univariate_balanced_accuracy_results,
                                                  univariate_null_distribution)

pairwise_p_values <- compute_corrected_p_values(pairwise_balanced_accuracy_results,
                                                pairwise_null_distribution)

combined_univariate_pairwise_p_values <- compute_corrected_p_values(combined_univariate_pairwise_balanced_accuracy_results,
                                                                    combined_univariate_pairwise_null_distribution)


In [None]:
# Write each p-value table returned from R to its own feather file
p_value_tables = [
    (univariate_p_values, "univariate_p_values"),
    (pairwise_p_values, "pairwise_p_values"),
    (combined_univariate_pairwise_p_values, "combined_univariate_pairwise_p_values"),
]
for p_value_table, output_name in p_value_tables:
    # The index is reset before writing, as in the other result tables
    p_value_table.reset_index().to_feather(f"data/classification_results/{output_name}.feather")

## Sensitivity analysis with different classifier types

While all previous analyses used a linear SVM classifier, we also considered the possibility that---given the spatiotemporal complexity of these neuropsychiatric disorders---cases and controls may not be linearly separable in feature space.
We therefore repeated A<sub>region</sub>, A<sub>feature</sub>, and A<sub>uni_combo</sub> using an SVM with a (nonlinear) radial basis function (RBF) kernel and using the random forest ensemble classifier, both from `scikit-learn`.

In [None]:
# Settings for the classifier sensitivity analysis: CV splitters, candidate
# classifiers, and a hyperparameter grid.

# Define CV splitters
# main_cv uses the same folds/repeats/seed as the splitter in the core settings cell;
# inner_cv is a single shuffled stratified k-fold with the same seed
# (presumably for nested hyperparameter tuning -- confirm against its use further down)
main_cv = RepeatedStratifiedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=127)
inner_cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=127)

# Define the model dict
# Keys are the classifier labels, values are the unfitted estimators.
# NOTE(review): "Linear_SVM_libsvm" maps to LinearSVC with an L1 penalty; in scikit-learn,
# LinearSVC is the liblinear-based estimator and SVC the libsvm-based one -- confirm the label.
# NOTE(review): RandomForestClassifier and GradientBoostingClassifier are created without a
# random_state, so their results may differ between runs; GradientBoostingClassifier is also
# the only entry without class_weight="balanced".
model_dict = {"Linear_SVM_sklearn": svm.SVC(kernel="linear", class_weight="balanced", C=1),
                "Linear_SVM_libsvm": svm.LinearSVC(C=1, dual=False, penalty='l1', class_weight='balanced'),
                "RBF_SVM_sklearn": svm.SVC(kernel="rbf", class_weight="balanced", C=1),
                "RandomForest": RandomForestClassifier(n_estimators=100, class_weight="balanced"),
                "GradientBoost": GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1)}

# Define class weight and C parameter grid for tuning
# The "model__" prefix addresses parameters of a pipeline step named 'model'
# (the step name used by `pipe` in the core settings cell)
param_grid={"model__C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
            "model__class_weight": [None, "balanced"]}

for disorder in study_disorder_lookup.keys():
    study = study_disorder_lookup[disorder]

    # Load class labels and sample IDs
    class_labels = np.load(f"data/input_data/{study}_{disorder}_class_labels.npy", allow_pickle=True).tolist()
    sample_IDs = np.load(f"data/input_data/{study}_{disorder}_sample_IDs.npy", allow_pickle=True).tolist()
    
    study_disorder_univariate_models = pd.read_table(f"data/time_series_features/processed_numpy_files/{study}_{disorder}_univariate_models.txt",
                                                     header=None)
    study_disorder_univariate_models.columns = ["Model_Name"]

    for model in study_disorder_univariate_models.Model_Name:

        # Define analysis type
        if "ROI" in model:
            Analysis_Type = "Brain_Region"
        elif "combo_catch25_features_all_regions" in model:
            Analysis_Type = "Univariate_Combo"
        else:
            Analysis_Type = "catch25_feature"
        # Define grouping var
        if Analysis_Type=="Brain_Region":
            grouping_var = model.split("_ROI_")[1]
        elif Analysis_Type=="Univariate_Combo":
            grouping_var = "Combo"
        else:
            grouping_var = model.split("_catch25_feature_")[1]

        # Read in the tabular data for the corresponding model
        model_data = np.load(f"data/time_series_features/processed_numpy_files/{model}.npy")

        # Define main output data file for this feature
        robustness_output_file_base = f"{dataset_ID}_{disorder}_{Analysis_Type}_{grouping_var}"

        # Run the robustness analysis
        (training_balacc_df, classifier_type_df, nested_CV_df) = robustness_analysis(model_data, class_labels, model_dict, inner_cv, main_cv, 
                                                                                    num_folds=num_folds, num_repeats=num_repeats, 
                                                                                    base_model_name="Linear_SVM_sklearn", 
                                                                                    scoring="balanced_accuracy", num_jobs=num_jobs)
        
        # Assign Analysis_Type, group_var, Disorder, and Dataset columns
        for df in [training_balacc_df, classifier_type_df, nested_CV_df]:
            df["Analysis_Type"] = Analysis_Type
            df["group_var"] = grouping_var
            df["Disorder"] = disorder
            df["Dataset"] = study

        # Save results to feather files
        training_balacc_df.to_feather(f"data/classification_results/robustness_analysis/{study}_{disorder}/{robustness_output_file_base}_training_balacc_df.feather")
        classifier_type_df.to_feather(f"data/classification_results/robustness_analysis/{study}_{disorder}/{robustness_output_file_base}_classifier_type_df.feather")
        nested_CV_df.to_feather(f"data/classification_results/robustness_analysis/{study}_{disorder}/{robustness_output_file_base}_nested_CV_df.feather")


Results from these nonlinear classifiers can be tabulated as follows:

In [6]:
def _load_robustness_results(study, disorder, file_suffix):
    """Read every robustness-analysis feather file for one study/disorder pair.

    Parameters
    ----------
    study, disorder : str
        Identify the results directory `robustness_analysis/{study}_{disorder}/`.
    file_suffix : str
        Only files whose name ends with this suffix are read.

    Returns
    -------
    list of pandas.DataFrame
        One frame per matching file, each with `Study` and `Disorder` columns set
        and a fresh default index. Empty if no file matches.
    """
    results_dir = f"data/classification_results/robustness_analysis/{study}_{disorder}/"
    # os.listdir returns names in arbitrary order; sort so the row order is reproducible
    matching_files = sorted(f for f in os.listdir(results_dir) if f.endswith(file_suffix))
    return [(pd.read_feather(f"{results_dir}{matching_file}")
               .assign(Study=study, Disorder=disorder)
               .reset_index(drop=True))
            for matching_file in matching_files]


classifier_types_result_list = []
nested_CV_results_list = []
training_balacc_results_list = []

# Iterate over the four disorders
for disorder, study in study_disorder_lookup.items():
    # Classifier type results
    classifier_types_result_list.extend(_load_robustness_results(study, disorder, "classifier_type_df.feather"))
    # Nested CV results
    nested_CV_results_list.extend(_load_robustness_results(study, disorder, "nested_CV_df.feather"))
    # Training balanced accuracy results
    training_balacc_results_list.extend(_load_robustness_results(study, disorder, "training_balacc_df.feather"))

# Concatenate results
classifier_types_results = pd.concat(classifier_types_result_list).reset_index(drop=True)
nested_CV_results = pd.concat(nested_CV_results_list).reset_index(drop=True)
training_balacc_results = pd.concat(training_balacc_results_list).reset_index(drop=True)

# Save to feather files
classifier_types_results.to_feather("data/classification_results/robustness_analysis/all_classifier_types_results.feather")
nested_CV_results.to_feather("data/classification_results/robustness_analysis/all_nested_CV_results.feather")
training_balacc_results.to_feather("data/classification_results/robustness_analysis/all_training_balacc_results.feather")

# Evaluating the effect of inverse probability weighting

We will also fit linear SVM classifiers with no sample weighting to compare against those fit with inverse probability weighting.

In [None]:
# Keyword arguments that are identical for every unweighted run
shared_no_weighting_kwargs = dict(data_path=f"{current_path}/data/",
                                  univariate_first_25_PCs=univariate_combo_first25_PCs,
                                  univariate_feature_set=univariate_feature_set,
                                  pairwise_feature_set=pairwise_feature_set,
                                  classifier_type=classifier_type,
                                  num_folds=num_folds,
                                  num_repeats=num_repeats,
                                  num_jobs=num_jobs,
                                  num_null_iters=num_null_iters)

# UCLA CNP: one unweighted univariate classification run per disorder
for disorder in ["SCZ", "BP", "ADHD"]:
    run_univariate_classifier_no_weighting(dataset_ID="UCLA_CNP",
                                           metadata=UCLA_CNP_metadata,
                                           univariate_feature_data=UCLA_CNP_univariate_features,
                                           disorder=disorder,
                                           **shared_no_weighting_kwargs)

# ABIDE: a single disorder (ASD)
for disorder in ["ASD"]:
    run_univariate_classifier_no_weighting(dataset_ID="ABIDE",
                                           metadata=ABIDE_metadata,
                                           univariate_feature_data=ABIDE_univariate_features,
                                           disorder=disorder,
                                           **shared_no_weighting_kwargs)

In [7]:
# Collect the unweighted linear SVM results, one data frame per (study, disorder) pair
Linear_SVM_no_weighting_balanced_accuracy_list = []

# Walk through the Study and Disorder columns of study_to_disorder_df in parallel
for study, disorder in zip(study_to_disorder_df['Study'], study_to_disorder_df['Disorder']):

    # Univariate balanced accuracy results for this pair, tagged with study and disorder
    no_weighting_results_path = f"data/classification_results/balanced_accuracy/{study}_{disorder}_Univariate_{univariate_feature_set}_Linear_SVM_no_weighting_balanced_accuracy.feather"
    Linear_SVM_no_weighting_univariate_results = pd.read_feather(no_weighting_results_path).assign(Study=study, Disorder=disorder)
    Linear_SVM_no_weighting_balanced_accuracy_list.append(Linear_SVM_no_weighting_univariate_results.reset_index(drop=True))

# Stack everything into one balanced accuracy data frame; reset_index() (without drop)
# keeps each frame's original row position as an "index" column
Linear_SVM_no_weighting_balanced_accuracy = pd.concat(Linear_SVM_no_weighting_balanced_accuracy_list, axis=0).reset_index()

# Write the combined table to a single feather file
Linear_SVM_no_weighting_balanced_accuracy.to_feather("data/classification_results/all_linear_SVM_no_weighting_balanced_accuracy_results.feather")

## Confound analysis

In [None]:
# Cross-validation settings for the confound-only classifiers
num_folds = 10
num_repeats = 10

# The classifier, pipeline and splitter do not depend on the disorder, so they are
# built once; cross_validate fits clones of the pipeline for every split.
SVM_model = svm.SVC(kernel='linear', class_weight='balanced', C=1)
pipeline = Pipeline([('scaler', MixedSigmoidScaler(unit_variance=True)),
                        ('model', SVM_model)])
RepeatedStratifiedKFold_splitter = RepeatedStratifiedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=127)

confounds_balanced_accuracy_list = []

# Iterate over the four disorders
for disorder, study in study_disorder_lookup.items():

    # Path is relative to the repository root, like every other data path in this notebook.
    # allow_pickle=True matches how the same label files are loaded for the classifier
    # sensitivity analysis; only load files you generated yourself.
    class_labels = np.load(f"data/input_data/{study}_{disorder}_class_labels.npy", allow_pickle=True)

    # Subset the metadata once to the controls and cases of this study.
    # NOTE(review): assumes these rows are in the same order as class_labels -- confirm.
    disorder_metadata = merged_metadata.query("Diagnosis in ['Control', @disorder] & Study == @study")

    # One single-column feature per confound
    confound_features = {"Age": disorder_metadata.Age.values,
                         # Convert M to 0 and every other value to 1
                         "Sex": np.where(disorder_metadata.Sex.values == "M", 0, 1),
                         "Mean_FD_Power": disorder_metadata.Mean_FD_Power.values}

    for analysis_type, confound_values in confound_features.items():
        # scikit-learn expects a 2D (n_samples, n_features) array
        feature_data = confound_values.reshape(-1, 1)

        # Find balanced accuracy for dataset
        confound_balanced_accuracy = cross_validate(pipeline, feature_data, class_labels,
                                                    cv=RepeatedStratifiedKFold_splitter, scoring="balanced_accuracy", n_jobs=1,
                                                    return_estimator=False)['test_score']

        # Create dataframe
        confound_balanced_accuracy_df = pd.DataFrame({"Study" : study,
                                                        "Disorder": disorder,
                                                        "Analysis_Type": analysis_type,
                                                        "Balanced_Accuracy": confound_balanced_accuracy})

        # Assign folds and repeats. The splitter yields num_folds consecutive test scores
        # per repeat, so both labels are derived from num_folds.
        confound_balanced_accuracy_df["Fold"] = confound_balanced_accuracy_df.index % num_folds
        confound_balanced_accuracy_df["Repeat"] = confound_balanced_accuracy_df.index // num_folds

        # Append results to list
        confounds_balanced_accuracy_list.append(confound_balanced_accuracy_df)

# Concatenate results
confounds_balanced_accuracy_results_all_folds = pd.concat(confounds_balanced_accuracy_list, axis=0)

# Take mean and SD across folds per disorder and confound.
# Named aggregation yields the group keys and the two summary columns directly, without
# relying on how as_index=False interacts with a list of aggregations.
confounds_balanced_accuracy_results = (confounds_balanced_accuracy_results_all_folds
                                 .groupby(["Study", "Disorder", "Analysis_Type"], as_index=False)
                                 .agg(Balanced_Accuracy=("Balanced_Accuracy", "mean"),
                                      Balanced_Accuracy_SD=("Balanced_Accuracy", "std")))


# Save to feather file
confounds_balanced_accuracy_results_all_folds.reset_index().to_feather("data/classification_results/confounds_balanced_accuracy_results_all_folds.feather")
confounds_balanced_accuracy_results.reset_index().to_feather("data/classification_results/confounds_balanced_accuracy_results.feather")

## Site-specific analyses for ABIDE

In [10]:
ABIDE_site_specific_results_list = []
study = "ABIDE"
disorder = "ASD"
site_specific_results_dir = f"data/classification_results/robustness_analysis/{study}_{disorder}/"

# NOTE(review): sites 5 and 20 are hard-coded -- presumably the ABIDE sites that were
# analysed on their own; confirm against the script that produced these files.
for site_number in [5, 20]:
    # Find the site-specific result files; os.listdir returns names in arbitrary order,
    # so sort them to keep the row order reproducible
    site_specific_result_files = sorted(f for f in os.listdir(site_specific_results_dir)
                                        if f.endswith(f"_Site{site_number}_main_classification_res.feather"))

    for site_specific_result_file in site_specific_result_files:
        # Read in the site-specific results
        site_specific_result = (pd.read_feather(f"{site_specific_results_dir}{site_specific_result_file}")
                          .assign(Study=study, Disorder=disorder))
        ABIDE_site_specific_results_list.append(site_specific_result.reset_index(drop=True))

# Concatenate results
ABIDE_site_specific_results_all_folds = pd.concat(ABIDE_site_specific_results_list).reset_index(drop=True)

# Take mean and SD across folds per site, analysis type and grouping variable.
# Named aggregation yields the group keys and the two summary columns directly, without
# relying on how as_index=False interacts with a list of aggregations.
ABIDE_site_specific_results = (ABIDE_site_specific_results_all_folds
                                 .groupby(["Study", "Disorder", "Analysis_Type", "Site_Number", "group_var"], as_index=False)
                                 .agg(Balanced_Accuracy=("Balanced_Accuracy", "mean"),
                                      Balanced_Accuracy_SD=("Balanced_Accuracy", "std")))

# Save to feather file
ABIDE_site_specific_results_all_folds.reset_index().to_feather("data/classification_results/robustness_analysis/ABIDE_site_specific_balanced_accuracy_all_folds.feather")
ABIDE_site_specific_results.reset_index().to_feather("data/classification_results/robustness_analysis/ABIDE_site_specific_balanced_accuracy.feather")

# Classification with the first 10 PCs per model

In [6]:
# Per-file result frames for the first-10-PCs classifiers, across all disorders
first_10_PCs_results_list = []

for disorder in study_disorder_lookup.keys():
    study = study_disorder_lookup[disorder]

    # Every file in this study/disorder directory that holds first-10-PCs results
    first_10_PCs_result_files = list(filter(
        lambda file_name: file_name.endswith("first10_PCs_main_classification_res.feather"),
        os.listdir(f"data/classification_results/robustness_analysis/{study}_{disorder}/")))

    # Read each file, tag it with its study and disorder, and give it a fresh index
    first_10_PCs_results_list.extend(
        pd.read_feather(f"data/classification_results/robustness_analysis/{study}_{disorder}/{pc_result_file}")
          .assign(Study=study, Disorder=disorder)
          .reset_index(drop=True)
        for pc_result_file in first_10_PCs_result_files)

# Stack all frames, then add 1 to the Fold and Repeat columns
first_10_PCs_results_all_folds = pd.concat(first_10_PCs_results_list).reset_index(drop=True)
first_10_PCs_results_all_folds = first_10_PCs_results_all_folds.assign(
    Fold=first_10_PCs_results_all_folds.Fold + 1,
    Repeat=first_10_PCs_results_all_folds.Repeat + 1)

# Write the combined table to a feather file
first_10_PCs_results_all_folds.to_feather("data/classification_results/robustness_analysis/all_first_10_PCs_classification_results.feather")