In [1]:
import os
import numpy as np
import nibabel as nib
import pandas as pd
import sys

# Uncomment the next two lines if you're running this on an HPC and want to speed up scikit-learn
# from sklearnex import patch_sklearn
# patch_sklearn()

%load_ext rpy2.ipython

In [2]:
%%R
# Load R packages
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


# Running linear SVMs

This Jupyter notebook includes all code needed to perform 10-fold cross-validation, repeated 10 times, using linear support vector machine (SVM) classifiers for each of the five representations.
First, we will import all functions from the `code/classification_analysis/core_classification_functions.py` script.

In [3]:
# Make the project's classification helpers importable and pull them into the notebook namespace
sys.path.insert(0, 'code/classification_analysis/')
from core_classification_functions import *
# Working directory at launch; later cells build absolute paths from it, and the
# relative paths below are resolved against it as well
current_path = os.getcwd()

# Load metadata for the two datasets
UCLA_CNP_metadata = pd.read_feather('data/input_data/UCLA_CNP_sample_metadata.feather')
ABIDE_metadata = pd.read_feather('data/input_data/ABIDE_sample_metadata.feather')

# Load univariate time-series feature data for the two datasets
UCLA_CNP_univariate_features = pd.read_feather('data/time_series_features/UCLA_CNP_catch25_filtered.feather')
ABIDE_univariate_features = pd.read_feather('data/time_series_features/ABIDE_catch25_filtered.feather')

# Load pairwise time-series feature data for the two datasets.
# The ABIDE table must be loaded here: the pairwise classification cell further down
# passes ABIDE_pairwise_features to the classifiers and fails with a NameError otherwise.
UCLA_CNP_pairwise_features = pd.read_feather('data/time_series_features/UCLA_CNP_pyspi14_filtered.feather')
ABIDE_pairwise_features = pd.read_feather('data/time_series_features/ABIDE_pyspi14_filtered.feather')

# Load univariate and pairwise time-series feature info
univariate_feature_info = pd.read_csv('data/feature_info/univariate_feature_info.csv')
pairwise_feature_info = pd.read_csv('data/feature_info/pairwise_feature_info.csv')

In [4]:
%%R -i UCLA_CNP_univariate_features,ABIDE_univariate_features,UCLA_CNP_metadata,ABIDE_metadata,univariate_feature_info -o first_25_PCs_by_disorder

library(tidyverse)
library(FactoMineR)
library("factoextra")

#' Compute the leading principal-component scores for one case-control comparison.
#'
#' Samples are restricted to "Control" plus the requested disorder, the long-format
#' features are pivoted to one column per feature/region pair, and a scaled PCA
#' (FactoMineR::PCA, scale.unit = TRUE) is fitted on the resulting matrix.
#'
#' @param univariate_feature_data Long-format feature table; the code reads the columns
#'   `Sample_ID`, `names`, `Brain_Region` and `values`.
#' @param dataset_ID Label written to the `Study` column of the result.
#' @param metadata Sample metadata, joined to the features on `Sample_ID`; a `Diagnosis`
#'   column must be available after the join.
#' @param disorder Diagnosis label to keep alongside "Control".
#' @param n_PCs Number of components to retain (passed to `PCA(ncp = ...)`).
#' @return Data frame with one row per sample: the PCA individual coordinates plus
#'   `Sample_ID`, `Diagnosis`, `Disorder` and `Study`.
compute_first_n_PCs <- function(univariate_feature_data, dataset_ID, metadata, disorder, n_PCs) {
  
  # Explicit join key: avoids the "Joining with `by = ...`" message and keeps the join
  # from silently changing if the two tables ever gain another shared column
  data_for_PCA <- univariate_feature_data %>%
    left_join(., metadata, by = "Sample_ID") %>%
    filter(Diagnosis %in% c("Control", disorder)) %>%
    mutate(unique_ID = paste0(names, "__", Brain_Region), .keep="unused") %>%
    dplyr::select(unique_ID, Sample_ID, Diagnosis, values) %>%
    pivot_wider(id_cols=c(Sample_ID, Diagnosis), names_from=unique_ID, values_from=values) %>%
    mutate(Diagnosis=factor(Diagnosis, levels=c("Control", disorder)))
  
  # Compute PCA on the feature columns only.
  # NOTE: FactoMineR imputes missing values with the column mean and warns about it.
  pca_res <- PCA(dplyr::select(data_for_PCA, c(-Sample_ID, -Diagnosis)), ncp=n_PCs, graph = FALSE, scale.unit = TRUE)

  # Re-attach the sample identifiers; row order matches data_for_PCA
  pca_scores <- as.data.frame(pca_res$ind$coord) %>%
    mutate(Sample_ID = data_for_PCA$Sample_ID,
           Diagnosis = data_for_PCA$Diagnosis,
           Disorder = disorder,
           Study = dataset_ID)
  
  return(pca_scores)
}

# One entry per disorder, naming the study whose features and metadata feed its PCA
pca_inputs_by_disorder <- list(
  Schizophrenia = list(study = "UCLA_CNP", features = UCLA_CNP_univariate_features, metadata = UCLA_CNP_metadata),
  Bipolar       = list(study = "UCLA_CNP", features = UCLA_CNP_univariate_features, metadata = UCLA_CNP_metadata),
  ADHD          = list(study = "UCLA_CNP", features = UCLA_CNP_univariate_features, metadata = UCLA_CNP_metadata),
  ASD           = list(study = "ABIDE",    features = ABIDE_univariate_features,    metadata = ABIDE_metadata)
)

# Fit the 25-component PCA for every disorder, in the order listed above
first_25_PCs_by_disorder_list <- lapply(names(pca_inputs_by_disorder), function(disorder_name) {
  pca_inputs <- pca_inputs_by_disorder[[disorder_name]]
  compute_first_n_PCs(pca_inputs$features,
                      dataset_ID = pca_inputs$study,
                      metadata = pca_inputs$metadata,
                      disorder = disorder_name,
                      n_PCs = 25)
})
names(first_25_PCs_by_disorder_list) <- names(pca_inputs_by_disorder)

# Stack the per-disorder score tables into the single data frame exported to Python
first_25_PCs_by_disorder <- do.call(rbind, first_25_PCs_by_disorder_list)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Joining with `by = join_by(Sample_ID)`
Joining with `by = join_by(Sample_ID)`
Joining with `by = join_by(Sample_ID)`
Joining with `by = join_by(Sample_ID)`


Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
In PCA(select(data_for_PCA, c(-Sample_ID, -Diagnosis)), ncp = n_PCs,  :
  Missing values are imputed by the mean of the variable: you should use the imputePCA function of the missMDA package


In [4]:
# Feather cache of the PCA scores computed in the R cell above
pcs_cache_file = "data/time_series_features/univariate_combo_first25_PCs.feather"

# Write the cache when it does not exist yet, so a fresh checkout does not fail on a
# missing file. Delete the file to force the scores to be re-exported from R.
if not os.path.exists(pcs_cache_file):
    first_25_PCs_by_disorder.reset_index().to_feather(pcs_cache_file)

# Always continue from the cached copy so downstream cells see the same table layout
first_25_PCs_by_disorder = pd.read_feather(pcs_cache_file)

In [5]:
# --- Analysis settings you can change ---

# Cross-validation layout and number of null iterations
num_repeats = 10
num_folds = 10
num_null_iters = 1000

# Number of parallel jobs; you can increase this on an HPC with multiple cores available
num_jobs = 1

# Feature-set and classifier labels handed to the classification functions
classifier_type = "Linear_SVM"
univariate_feature_set = "catch25"
pairwise_feature_set = "pyspi14"

# CSV passed to the pairwise classifiers as `SPI_directionality_file`
SPI_directionality_file = f"{current_path}/code/classification_analysis/SPI_Direction_Info.csv"

## $\mathrm{A_{region}}$, $\mathrm{A_{feature}}$, and $\mathrm{A_{uni\_combo}}$: Fitting linear SVMs to intra-regional univariate time-series properties

In [None]:
# Keyword arguments that are identical for every univariate classification run
shared_univariate_settings = dict(
    data_path=f"{current_path}/data/",
    univariate_first_25_PCs=first_25_PCs_by_disorder,
    univariate_feature_set=univariate_feature_set,
    pairwise_feature_set=pairwise_feature_set,
    classifier_type=classifier_type,
    num_folds=num_folds,
    num_repeats=num_repeats,
    num_jobs=num_jobs,
    num_null_iters=num_null_iters,
)

# (dataset label, metadata, univariate features, disorders) for each study:
# UCLA CNP covers three disorders, ABIDE covers ASD only
univariate_runs = [
    ("UCLA_CNP", UCLA_CNP_metadata, UCLA_CNP_univariate_features, ["Schizophrenia", "Bipolar", "ADHD"]),
    ("ABIDE", ABIDE_metadata, ABIDE_univariate_features, ["ASD"]),
]

# Fit the univariate classifiers study by study, disorder by disorder
for study_label, study_metadata, study_features, study_disorders in univariate_runs:
    for disorder in study_disorders:
        run_univariate_classifier(dataset_ID=study_label,
                                  metadata=study_metadata,
                                  univariate_feature_data=study_features,
                                  disorder=disorder,
                                  **shared_univariate_settings)

After running this code, you should have the following folders in your `data/classification_results/` directory:
- `balanced_accuracy/`: contains individual `.feather` files with the fold-wise balanced accuracy results for each disorder
- `fold_assignments/`: contains individual `.feather` files with the fold assignments for each sample
- `null_distributions/`: contains individual `.feather` files with the null balanced accuracy distributions for each disorder
- `sample_predictions/`: contains individual `.feather` files with the proportion of folds (out of 10 repeats) in which each sample was correctly predicted out-of-sample

## $\mathrm{A_{FC}}$ and $\mathrm{A_{FC\_combo}}$: Fitting linear SVMs to inter-regional bivariate time-series properties

In [None]:

# Keyword arguments shared by every pairwise and combined (univariate + pairwise) run
shared_pairwise_settings = dict(
    data_path=f"{current_path}/data/",
    SPI_directionality_file=SPI_directionality_file,
    univariate_feature_set=univariate_feature_set,
    pairwise_feature_set=pairwise_feature_set,
    classifier_type=classifier_type,
    num_folds=num_folds,
    num_repeats=num_repeats,
    num_jobs=num_jobs,
    num_null_iters=num_null_iters,
)

# UCLA CNP: pairwise classifier, then combined univariate + pairwise classifier, per disorder
for disorder in ["Schizophrenia", "Bipolar", "ADHD"]:
    ucla_cnp_inputs = dict(dataset_ID="UCLA_CNP",
                           metadata=UCLA_CNP_metadata,
                           pairwise_feature_data=UCLA_CNP_pairwise_features,
                           disorder=disorder,
                           **shared_pairwise_settings)
    run_pairwise_classifier_by_SPI(**ucla_cnp_inputs)
    run_combined_uni_pairwise_classifier_by_SPI(univariate_feature_data=UCLA_CNP_univariate_features,
                                                **ucla_cnp_inputs)

# ABIDE: the same two pairwise-based analyses for ASD
for disorder in ["ASD"]:
    abide_inputs = dict(dataset_ID="ABIDE",
                        metadata=ABIDE_metadata,
                        pairwise_feature_data=ABIDE_pairwise_features,
                        disorder=disorder,
                        **shared_pairwise_settings)
    run_pairwise_classifier_by_SPI(**abide_inputs)
    run_combined_uni_pairwise_classifier_by_SPI(univariate_feature_data=ABIDE_univariate_features,
                                                **abide_inputs)

## Prepping results for visualization

Now that we have fit all linear SVMs and null models for each of the five representations, we can tabulate data into a format that is amenable to statistical analysis and data visualization.

First, we will compile balanced accuracy and null results that were saved separately across disorders into one feather file.

In [6]:
# Create a simple dataframe mapping study to disorder and disorder shorthand name
study_to_disorder_df = pd.DataFrame({'Study': ['UCLA_CNP', 'UCLA_CNP', 'UCLA_CNP', 'ABIDE'],
                                  'Disorder': ['Schizophrenia', 'Bipolar', 'ADHD', 'ASD'],
                                  'Disorder_Short': ['SCZ', 'BP', 'ADHD', 'ASD']})

# File-name component identifying each result type: univariate, pairwise, and combined.
# The combined label is built from pairwise_feature_set rather than a hard-coded "pyspi14",
# so all three file names follow the parameters set at the top of the notebook.
balanced_accuracy_file_labels = [
    f"Univariate_{univariate_feature_set}",
    f"Pairwise_{pairwise_feature_set}",
    f"Univariate_{univariate_feature_set}_Pairwise_{pairwise_feature_set}",
]

balanced_accuracy_results_list = []

# For every study/disorder pair, read the three result files and tag their rows
for study, disorder, disorder_short in study_to_disorder_df.itertuples(index=False, name=None):
    for file_label in balanced_accuracy_file_labels:
        results_file = f"data/classification_results/balanced_accuracy/{study}_{disorder}_{file_label}_Linear_SVM_balanced_accuracy.feather"
        # Rows are labelled with the shorthand disorder name, not the full one used in the file name
        balanced_accuracy_results_list.append(
            pd.read_feather(results_file).assign(Study=study, Disorder=disorder_short)
        )

# Concatenate results into one balanced accuracy dataframe.
# reset_index() restores the default index that to_feather requires; the previous
# per-file row index is kept as an `index` column.
balanced_accuracy_results = pd.concat(balanced_accuracy_results_list, axis=0).reset_index()

# Save to one feather file
balanced_accuracy_results.to_feather("data/classification_results/all_balanced_accuracy_results.feather")
    

In [27]:
# Compile the L1-regularized SVM and PCA-based SVM results across the four disorders
L1_regularized_balanced_accuracy_list = []
PCA_balanced_accuracy_list = []

# File-name label of each model type -> list collecting its per-disorder tables
results_lists_by_label = {
    "Combo_L1_Regularized": L1_regularized_balanced_accuracy_list,
    "Combo_25_PCs": PCA_balanced_accuracy_list,
}

# Walk the study/disorder mapping and read both result files for each pair
for row_number, mapping_row in study_to_disorder_df.iterrows():
    study = mapping_row['Study']
    disorder = mapping_row['Disorder']
    disorder_short = mapping_row['Disorder_Short']

    for model_label, collected_results in results_lists_by_label.items():
        model_results = pd.read_feather(f"data/classification_results/balanced_accuracy/{study}_{disorder}_Univariate_{univariate_feature_set}_{model_label}_balanced_accuracy.feather")
        collected_results.append(model_results.assign(Study=study, Disorder=disorder_short))


def _stack_combo_results(per_disorder_results):
    """Concatenate per-disorder tables, renumber the rows and drop the stored index columns."""
    stacked = pd.concat(per_disorder_results, axis=0)
    stacked = stacked.reset_index(level=0, drop=True)
    return stacked.drop(columns=['index', 'level_0'])


# One combined balanced accuracy dataframe per model type
L1_regularized_balanced_accuracy_results = _stack_combo_results(L1_regularized_balanced_accuracy_list)
PCA_balanced_accuracy_results = _stack_combo_results(PCA_balanced_accuracy_list)

# Save each to its own feather file
L1_regularized_balanced_accuracy_results.to_feather("data/classification_results/all_L1_regularized_balanced_accuracy_results.feather")
PCA_balanced_accuracy_results.to_feather("data/classification_results/all_PCA_balanced_accuracy_results.feather")

We'll do the same compilation for fold assignments and null balanced accuracy distributions:

In [9]:

fold_assignment_results_list = []
null_distribution_list = []

# File-name component identifying each result type: univariate, pairwise, and combined.
# Built from the notebook parameters instead of a hard-coded "pyspi14".
representation_file_labels = [
    f"Univariate_{univariate_feature_set}",
    f"Pairwise_{pairwise_feature_set}",
    f"Univariate_{univariate_feature_set}_Pairwise_{pairwise_feature_set}",
]

# For every study/disorder pair, collect the fold assignments and the null
# distributions of all three result types
for study, disorder, disorder_short in study_to_disorder_df.itertuples(index=False, name=None):
    for file_label in representation_file_labels:
        file_stem = f"{study}_{disorder}_{file_label}_Linear_SVM"

        # Fold assignments. The combined univariate + pairwise table is appended too;
        # it used to be read but never added to the list, so it was missing from the output.
        fold_assignment_results_list.append(
            pd.read_feather(f"data/classification_results/fold_assignments/{file_stem}_fold_assignments.feather")
            .assign(Study=study, Disorder=disorder_short)
        )

        # Null balanced accuracy distributions
        null_distribution_list.append(
            pd.read_feather(f"data/classification_results/null_distributions/{file_stem}_null_balanced_accuracy_distributions.feather")
            .assign(Study=study, Disorder=disorder_short)
        )

# Concatenate results into one dataframe each; reset_index() restores the default
# index that to_feather requires
fold_assignment_results = pd.concat(fold_assignment_results_list, axis=0).reset_index()
null_distribution_results = pd.concat(null_distribution_list, axis=0).reset_index()

# Save to one feather file
fold_assignment_results.to_feather("data/classification_results/all_fold_assignments.feather")
null_distribution_results.to_feather("data/classification_results/all_null_distributions.feather")


Next, we'll define a function to compute a $p$-value for a given observed balanced accuracy based on the corresponding empirical null distribution, comprising 1000 null balanced accuracy estimates for the given model.
From this distribution, we will derive the mean and standard deviation (SD) of the null balanced accuracy, which parameterize a Gaussian distribution; the $p$-value is the upper-tail probability of the observed balanced accuracy under that Gaussian (i.e., one minus its cumulative distribution function).

In [10]:
%%R -o compute_p_values,study_to_disorder_df

#' Compute a p-value for observed balanced accuracy from a Gaussian fit to the null.
#'
#' The null distribution is restricted to the rows matching the observed result on
#' Analysis_Type, Disorder, Study and group_var. Its mean and SD define a Gaussian, and
#' the p-value is the upper-tail probability of the observed balanced accuracy under it.
#'
#' @param observed_balanced_accuracy_df Data frame with the columns `Analysis_Type`,
#'   `Disorder`, `Study`, `group_var` and `Balanced_Accuracy_Across_Folds`.
#' @param null_distribution_df Data frame with the same four key columns plus
#'   `Null_Balanced_Accuracy`.
#' @return `observed_balanced_accuracy_df` with an added `p_value` column.
compute_p_values <- function(observed_balanced_accuracy_df, null_distribution_df) {
  
  # Columns that identify one model; stated explicitly so the join does not print a
  # "Joining with `by = ...`" message on every call
  model_keys <- c("Analysis_Type", "Disorder", "Study", "group_var")

  # Filter null to the same analysis type and grouping variable
  null_distribution_df <- null_distribution_df %>%
      dplyr::select(Analysis_Type, Disorder, Study, Null_Balanced_Accuracy, group_var) %>%
      semi_join(., observed_balanced_accuracy_df %>% dplyr::select(Analysis_Type, Disorder, Study, Balanced_Accuracy_Across_Folds, group_var),
                by = model_keys)

  # Observed balanced accuracy to compare against the null
  observed_balanced_accuracy_value <- observed_balanced_accuracy_df$Balanced_Accuracy_Across_Folds
  
  # Extract the mean and standard deviation from the null distribution
  null_mean <- mean(null_distribution_df$Null_Balanced_Accuracy)
  null_SD <- sd(null_distribution_df$Null_Balanced_Accuracy)
  
  # Upper-tail probability of the observed value under a Gaussian with the above parameters
  p_value <- pnorm(q=observed_balanced_accuracy_value, mean=null_mean, sd=null_SD, lower.tail = FALSE)
  
  # Organize results into dataframe to return
  observed_balanced_accuracy_df$p_value <- p_value
  return(observed_balanced_accuracy_df)
}

# Lookup table linking each study to its disorders and their shorthand labels
# (three UCLA CNP disorders followed by the single ABIDE disorder)
study_to_disorder_df <- data.frame(
  Study          = c(rep("UCLA_CNP", 3), "ABIDE"),
  Disorder       = c("Schizophrenia", "Bipolar", "ADHD", "ASD"),
  Disorder_Short = c("SCZ", "BP", "ADHD", "ASD")
)


Now, we will compute $p$-values for all balanced accuracy results and save the results to a feather file.

In [11]:
%%R -i balanced_accuracy_results,study_to_disorder_df,compute_p_values,null_distribution_results -o balanced_accuracy_results_across_folds,all_p_values

# Aggregate balanced accuracy results across folds: mean and SD per model.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
# .groups = "drop_last" is what summarize() does by default here; stating it keeps the
# same grouping (Study, Disorder, Analysis_Type) and silences the informational message.
balanced_accuracy_results_across_folds <- balanced_accuracy_results %>%
    group_by(Study, Disorder, Analysis_Type, group_var) %>%
    summarize(Balanced_Accuracy_Across_Folds = mean(Balanced_Accuracy, na.rm = TRUE),
              Balanced_Accuracy_Across_Folds_SD = sd(Balanced_Accuracy, na.rm = TRUE),
              .groups = "drop_last")

# Split the balanced accuracy results into one data frame per model
# (study, disorder, analysis type and grouping variable)
balanced_accuracy_results_split <- balanced_accuracy_results_across_folds %>%
    group_by(Study, Disorder, Analysis_Type, group_var) %>%
    group_split()

# Compute a p-value for each model against its own null distribution
all_p_values <- balanced_accuracy_results_split %>%
    purrr::map_df(~ compute_p_values(observed_balanced_accuracy_df = .x,
                                     null_distribution_df = null_distribution_results)) %>%
  # Holm-Bonferroni adjustment within each study / disorder / analysis type
    group_by(Study, Disorder, Analysis_Type) %>%
    mutate(p_value_HolmBonferroni = p.adjust(p_value, method = "holm"))




`summarise()` has grouped output by 'Study', 'Disorder', 'Analysis_Type'. You
can override using the `.groups` argument.
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(Analysis_Type, Disorder, Study, group_var)`
Joining with `by = join_by(

In [12]:
# Write the p-value table to disk; the index is reset first and the result is saved as feather
p_values_output_file = "data/classification_results/all_p_values.feather"
all_p_values_flat = all_p_values.reset_index()
all_p_values_flat.to_feather(p_values_output_file)