In [1]:
import os
import numpy as np
import nibabel as nib
import pandas as pd
import sys

# Uncomment the next two lines if you're running this on an HPC and want to speed up scikit-learn
# from sklearnex import patch_sklearn
# patch_sklearn()

%load_ext rpy2.ipython

In [2]:
%%R
# Load tidyverse R package
suppressPackageStartupMessages({
    library(tidyverse)
})

# Running linear SVMs

This Jupyter notebook includes all code needed to perform 10-repeated 10-fold cross-validation analysis using linear support vector machine (SVM) classifiers for each of the five representations.
First, we will import all functions from the `code/classification_analysis/core_classification_functions.py` script.

In [3]:
# Make the project's classification helpers importable. The relative path assumes the
# notebook is run from the repository root, as do the relative data paths below.
sys.path.insert(0, 'code/classification_analysis/')

# Import only the entry points this notebook calls instead of `import *`, so it is
# clear where each name comes from and nothing else leaks into the namespace.
from core_classification_functions import (
    run_univariate_classifier,
    run_univariate_classifier_no_weighting,
    run_pairwise_classifier_by_SPI,
    run_combined_univariate_pairwise_classifier_by_SPI,
)

# Working directory at start-up; later cells build absolute paths from it
current_path = os.getcwd()

# Lookup table pairing each disorder with the study (dataset) it belongs to;
# the result-compilation cells iterate over its rows.
study_to_disorder_df = pd.DataFrame({'Study': ['UCLA_CNP', 'UCLA_CNP', 'UCLA_CNP', 'ABIDE'],
                                    'Disorder': ['SCZ', 'BP', 'ADHD', 'ASD']})

# Load metadata for the two datasets
UCLA_CNP_metadata = pd.read_feather('data/input_data/UCLA_CNP_sample_metadata.feather')
ABIDE_metadata = pd.read_feather('data/input_data/ABIDE_sample_metadata.feather')


In [None]:


# Univariate time-series features (catch25), one table per dataset
UCLA_CNP_univariate_features, ABIDE_univariate_features = (
    pd.read_feather(feather_path)
    for feather_path in (
        'data/time_series_features/UCLA_CNP_catch25_filtered.feather',
        'data/time_series_features/ABIDE_catch25_filtered.feather',
    )
)

# Pairwise time-series features (pyspi14), one table per dataset
UCLA_CNP_pairwise_features, ABIDE_pairwise_features = (
    pd.read_feather(feather_path)
    for feather_path in (
        'data/time_series_features/UCLA_CNP_pyspi14_filtered.feather',
        'data/time_series_features/ABIDE_pyspi14_filtered.feather',
    )
)

# First 25 principal components of the univariate region-times-feature combination matrices
univariate_combo_first25_PCs = pd.read_feather('data/time_series_features/univariate_combo_first25_PCs.feather')

# Feature info tables for both the univariate and the pairwise feature sets
univariate_feature_info, pairwise_feature_info = map(
    pd.read_csv,
    (
        'data/feature_info/univariate_feature_info.csv',
        'data/feature_info/pairwise_feature_info.csv',
    ),
)


In [4]:
# ---- Parameters you can change ----

# Classifier and the feature sets it is fit to
classifier_type = "Linear_SVM"
univariate_feature_set = "catch25"
pairwise_feature_set = "pyspi14"

# Repeated k-fold cross-validation scheme and number of null iterations per model
num_folds, num_repeats = 10, 10
num_null_iters = 1000

# Number of parallel jobs; you can increase this if you are running on an HPC with multiple cores available
num_jobs = 1

# SPI directionality CSV handed to the pairwise classifiers in later cells
SPI_directionality_file = f"{current_path}/code/classification_analysis/SPI_Direction_Info.csv"

## $\mathrm{A_{region}}$, $\mathrm{A_{feature}}$, and $\mathrm{A_{uni\_combo}}$: Fitting linear SVMs to intra-regional univariate time-series properties

In [None]:
# Data directory handed to the classifier. `data_path` is not defined anywhere else in
# this notebook, so define it here with the same value the other classification cells pass.
data_path = f"{current_path}/data/"

# One entry per dataset: (dataset ID, sample metadata, univariate features, disorders to classify)
univariate_runs = [
    ("UCLA_CNP", UCLA_CNP_metadata, UCLA_CNP_univariate_features, ["SCZ", "BP", "ADHD"]),
    ("ABIDE", ABIDE_metadata, ABIDE_univariate_features, ["ASD"]),
]

# Run univariate classification for every disorder in UCLA CNP, then for ABIDE
for dataset_ID, dataset_metadata, dataset_univariate_features, dataset_disorders in univariate_runs:
    for disorder in dataset_disorders:
        run_univariate_classifier(dataset_ID=dataset_ID,
                                  data_path=data_path,
                                  metadata=dataset_metadata,
                                  univariate_feature_data=dataset_univariate_features,
                                  univariate_first_25_PCs=univariate_combo_first25_PCs,
                                  disorder=disorder,
                                  univariate_feature_set=univariate_feature_set,
                                  pairwise_feature_set=pairwise_feature_set,
                                  classifier_type=classifier_type,
                                  num_folds=num_folds,
                                  num_repeats=num_repeats,
                                  num_jobs=num_jobs,
                                  num_null_iters=num_null_iters)

After running this code, you should have the following folders in your `data/classification_results/` directory:
- `balanced_accuracy/`: contains individual `.feather` files with the fold-wise balanced accuracy results for each disorder
- `fold_assignments/`: contains individual `.feather` files with the fold assignments for each train/test fold per model
- `null_results/`: contains individual `.feather` files with the null balanced accuracy distributions for each disorder

## $\mathrm{A_{FC}}$ and $\mathrm{A_{FC\_combo}}$: Fitting linear SVMs to inter-regional bivariate time-series properties

In [None]:

# One entry per dataset:
# (dataset ID, sample metadata, univariate features, pairwise features, disorders to classify)
pairwise_runs = [
    ("UCLA_CNP", UCLA_CNP_metadata, UCLA_CNP_univariate_features,
     UCLA_CNP_pairwise_features, ["SCZ", "BP", "ADHD"]),
    ("ABIDE", ABIDE_metadata, ABIDE_univariate_features,
     ABIDE_pairwise_features, ["ASD"]),
]

# For every disorder (UCLA CNP first, then ABIDE), fit the pairwise-only classifiers
# and then the combined univariate + pairwise classifiers, one model per SPI.
for dataset_ID, dataset_metadata, dataset_univariate_features, dataset_pairwise_features, dataset_disorders in pairwise_runs:
    for disorder in dataset_disorders:
        # Keyword arguments shared by the pairwise-only and the combined runs
        shared_kwargs = dict(
            dataset_ID=dataset_ID,
            data_path=f"{current_path}/data/",
            SPI_directionality_file=SPI_directionality_file,
            metadata=dataset_metadata,
            pairwise_feature_data=dataset_pairwise_features,
            disorder=disorder,
            univariate_feature_set=univariate_feature_set,
            pairwise_feature_set=pairwise_feature_set,
            classifier_type=classifier_type,
            num_folds=num_folds,
            num_repeats=num_repeats,
            num_jobs=num_jobs,
            num_null_iters=num_null_iters,
        )

        # Pairwise features only
        run_pairwise_classifier_by_SPI(**shared_kwargs)

        # Pairwise features combined with the univariate features
        run_combined_univariate_pairwise_classifier_by_SPI(
            univariate_feature_data=dataset_univariate_features,
            **shared_kwargs,
        )

## Prepping results for visualization

Now that we have fit all linear SVMs and null models for each of the five representations, we can tabulate data into a format that is amenable to statistical analysis and data visualization.

First, we will compile balanced accuracy and null results that were saved separately across disorders into one feather file.

In [6]:
# Read the UCLA CNP schizophrenia pairwise balanced accuracy results.
# The path is relative to the repository root, like the other paths in this notebook,
# rather than an absolute path that only exists on one machine.
SCZ_pairwise = pd.read_feather("data/classification_results/balanced_accuracy/UCLA_CNP_SCZ_pairwise_Linear_SVM_sklearn_balanced_accuracy.feather")

In [5]:
def _read_results_for(feather_path, study, disorder):
    """Read one per-disorder results file and tag every row with its study and disorder."""
    return pd.read_feather(feather_path).assign(Study=study, Disorder=disorder)


# Per-disorder tables, collected separately for each representation
univariate_balanced_accuracy_results_list = []
pairwise_balanced_accuracy_results_list = []
combined_univariate_balanced_accuracy_pairwise_results_list = []

# Same, for the files that hold the results of all individual folds
univariate_balanced_accuracy_results_all_folds_list = []
pairwise_balanced_accuracy_results_all_folds_list = []
combined_univariate_balanced_accuracy_pairwise_results_all_folds_list = []

# Walk through every (study, disorder) pair listed in study_to_disorder_df
for study, disorder in zip(study_to_disorder_df['Study'], study_to_disorder_df['Disorder']):

    # Univariate results
    univariate_balanced_accuracy_results_list.append(_read_results_for(
        f"data/classification_results/balanced_accuracy/{study}_{disorder}_univariate_Linear_SVM_sklearn_balanced_accuracy.feather",
        study, disorder))
    univariate_balanced_accuracy_results_all_folds_list.append(_read_results_for(
        f"data/classification_results/balanced_accuracy/{study}_{disorder}_univariate_Linear_SVM_sklearn_balanced_accuracy_all_folds.feather",
        study, disorder))

    # Pairwise results
    pairwise_balanced_accuracy_results_list.append(_read_results_for(
        f"data/classification_results/balanced_accuracy/{study}_{disorder}_pairwise_Linear_SVM_sklearn_balanced_accuracy.feather",
        study, disorder))
    pairwise_balanced_accuracy_results_all_folds_list.append(_read_results_for(
        f"data/classification_results/balanced_accuracy/{study}_{disorder}_pairwise_Linear_SVM_sklearn_balanced_accuracy_all_folds.feather",
        study, disorder))

    # Combined univariate + pairwise results
    combined_univariate_balanced_accuracy_pairwise_results_list.append(_read_results_for(
        f"data/classification_results/balanced_accuracy/{study}_{disorder}_combined_univariate_pairwise_Linear_SVM_sklearn_balanced_accuracy.feather",
        study, disorder))
    combined_univariate_balanced_accuracy_pairwise_results_all_folds_list.append(_read_results_for(
        f"data/classification_results/balanced_accuracy/{study}_{disorder}_combined_univariate_pairwise_Linear_SVM_sklearn_balanced_accuracy_all_folds.feather",
        study, disorder))

# Stack the per-disorder tables into one table per representation, with a fresh 0..n-1 index
univariate_balanced_accuracy_results = pd.concat(univariate_balanced_accuracy_results_list, ignore_index=True)
univariate_balanced_accuracy_results_all_folds = pd.concat(univariate_balanced_accuracy_results_all_folds_list, ignore_index=True)
pairwise_balanced_accuracy_results = pd.concat(pairwise_balanced_accuracy_results_list, ignore_index=True)
pairwise_balanced_accuracy_results_all_folds = pd.concat(pairwise_balanced_accuracy_results_all_folds_list, ignore_index=True)
combined_univariate_balanced_accuracy_pairwise_results = pd.concat(combined_univariate_balanced_accuracy_pairwise_results_list, ignore_index=True)
combined_univariate_balanced_accuracy_pairwise_results_all_folds = pd.concat(combined_univariate_balanced_accuracy_pairwise_results_all_folds_list, ignore_index=True)

# Save the compiled balanced accuracy tables to feather files.
# reset_index() writes the row index out as an extra `index` column in each saved file.
compiled_outputs = [
    (univariate_balanced_accuracy_results,
     "data/classification_results/univariate_balanced_accuracy_results.feather"),
    (univariate_balanced_accuracy_results_all_folds,
     "data/classification_results/univariate_balanced_accuracy_results_all_folds.feather"),
    (pairwise_balanced_accuracy_results,
     "data/classification_results/pairwise_balanced_accuracy_results.feather"),
    (pairwise_balanced_accuracy_results_all_folds,
     "data/classification_results/pairwise_balanced_accuracy_results_all_folds.feather"),
    (combined_univariate_balanced_accuracy_pairwise_results,
     "data/classification_results/combined_univariate_balanced_accuracy_results.feather"),
    (combined_univariate_balanced_accuracy_pairwise_results_all_folds,
     "data/classification_results/combined_univariate_balanced_accuracy_results_all_folds.feather"),
]
for compiled_table, output_path in compiled_outputs:
    compiled_table.reset_index().to_feather(output_path)

In [8]:
# Compile the L1-regularized SVM and PCA-based SVM results across the four disorders

# (study, disorder) pairs, in the row order of study_to_disorder_df
study_disorder_pairs = list(zip(study_to_disorder_df['Study'], study_to_disorder_df['Disorder']))

# L1-regularized SVM results, one table per disorder, tagged with study and disorder
L1_regularized_balanced_accuracy_list = [
    pd.read_feather(f"data/classification_results/balanced_accuracy/{study}_{disorder}_Univariate_{univariate_feature_set}_{classifier_type}_Combo_L1_Regularized_balanced_accuracy.feather")
    .assign(Study=study, Disorder=disorder)
    for study, disorder in study_disorder_pairs
]

# PCA-based SVM results, one table per disorder, tagged with study and disorder
PCA_balanced_accuracy_list = [
    pd.read_feather(f"data/classification_results/balanced_accuracy/{study}_{disorder}_Univariate_{univariate_feature_set}_{classifier_type}_Combo_25_PCs_balanced_accuracy.feather")
    .assign(Study=study, Disorder=disorder)
    for study, disorder in study_disorder_pairs
]


def _stack_without_saved_index_columns(per_disorder_tables):
    """Stack the tables row-wise, renumber the rows, and drop the `index`/`level_0` columns."""
    stacked = pd.concat(per_disorder_tables, axis=0)
    renumbered = stacked.reset_index(level=0, drop=True)
    return renumbered.drop(columns=['index', 'level_0'])


# One balanced accuracy dataframe per model type
L1_regularized_balanced_accuracy_results = _stack_without_saved_index_columns(L1_regularized_balanced_accuracy_list)
PCA_balanced_accuracy_results = _stack_without_saved_index_columns(PCA_balanced_accuracy_list)

# Write each compiled table to a single feather file
L1_regularized_balanced_accuracy_results.to_feather("data/classification_results/all_L1_regularized_balanced_accuracy_results.feather")
PCA_balanced_accuracy_results.to_feather("data/classification_results/all_PCA_balanced_accuracy_results.feather")

We'll do the same compilation for fold assignments and null balanced accuracy distributions:

In [8]:
# Per-disorder null distributions, collected separately for each representation
univariate_null_distribution_list = []
pairwise_null_distribution_list = []
combined_univariate_pairwise_null_distribution_list = []

# Visit every (study, disorder) row of study_to_disorder_df
for cohort in study_to_disorder_df.itertuples(index=False):
    study, disorder = cohort.Study, cohort.Disorder
    # Columns added to every table so rows stay identifiable after stacking
    cohort_labels = dict(Study=study, Disorder=disorder)

    # Univariate null distribution
    univariate_null_distribution_list.append(
        pd.read_feather(f"data/classification_results/null_results/{study}_{disorder}_univariate_Linear_SVM_sklearn_null_balanced_accuracy_distributions.feather")
        .assign(**cohort_labels))

    # Pairwise null distribution
    pairwise_null_distribution_list.append(
        pd.read_feather(f"data/classification_results/null_results/{study}_{disorder}_pairwise_Linear_SVM_sklearn_null_balanced_accuracy_distributions.feather")
        .assign(**cohort_labels))

    # Combined univariate + pairwise null distribution
    combined_univariate_pairwise_null_distribution_list.append(
        pd.read_feather(f"data/classification_results/null_results/{study}_{disorder}_combined_univariate_pairwise_Linear_SVM_sklearn_null_balanced_accuracy_distributions.feather")
        .assign(**cohort_labels))

# Stack into one dataframe per representation; reset_index() keeps each table's
# original row index as an `index` column.
univariate_null_distribution = pd.concat(univariate_null_distribution_list, axis=0).reset_index()
pairwise_null_distribution = pd.concat(pairwise_null_distribution_list, axis=0).reset_index()
combined_univariate_pairwise_null_distribution = pd.concat(combined_univariate_pairwise_null_distribution_list, axis=0).reset_index()


Next, we'll define a function to compute a $p$-value for a given observed balanced accuracy based on the corresponding empirical null distribution, comprising 1000 null balanced accuracy estimates for the given model.
From this distribution, we will derive the mean and SD of the null balanced accuracy, which will be used to evaluate the cumulative distribution function of the corresponding Gaussian distribution to obtain a $p$-value.

In [9]:
%%R -o compute_p_values

# Compute a p-value for observed balanced accuracy against a Gaussian fit to its null distribution.
#
# observed_balanced_accuracy_df: observed results; needs the columns Analysis_Type, Disorder,
#                                Study, Balanced_Accuracy, and group_var
# null_distribution_df:          null balanced accuracy values with the same columns
#
# Returns observed_balanced_accuracy_df with an added `p_value` column.
compute_p_values <- function(observed_balanced_accuracy_df, null_distribution_df) {

  # Keep only the null rows whose analysis type, disorder, study, and grouping variable
  # match the observed results
  matching_null_df <- semi_join(
      dplyr::select(null_distribution_df, Analysis_Type, Disorder, Study, Balanced_Accuracy, group_var),
      dplyr::select(observed_balanced_accuracy_df, Analysis_Type, Disorder, Study, Balanced_Accuracy, group_var),
      by = join_by(Analysis_Type, Disorder, Study, group_var))

  # Null balanced accuracy values; their mean and SD parameterise the Gaussian
  null_balanced_accuracy_values <- matching_null_df$Balanced_Accuracy

  # Upper-tail probability of the observed balanced accuracy under that Gaussian
  observed_balanced_accuracy_df$p_value <- pnorm(q = observed_balanced_accuracy_df$Balanced_Accuracy,
                                                 mean = mean(null_balanced_accuracy_values),
                                                 sd = sd(null_balanced_accuracy_values),
                                                 lower.tail = FALSE)

  return(observed_balanced_accuracy_df)
}


Now, we will compute $p$-values for all balanced accuracy results and save the results to a feather file.

In [10]:
%%R -i study_to_disorder_df,compute_p_values,univariate_balanced_accuracy_results,univariate_null_distribution,pairwise_balanced_accuracy_results,pairwise_null_distribution,combined_univariate_balanced_accuracy_pairwise_results,combined_univariate_pairwise_null_distribution -o univariate_p_values,pairwise_p_values,combined_univariate_pairwise_p_values

# Compute p-values for one representation.
# The results are split into one piece per Study / Disorder / Analysis_Type / group_var,
# each piece gets a p-value from compute_p_values() against the supplied null distribution,
# and the p-values are then adjusted (Holm and Benjamini-Hochberg) within each
# Study / Disorder / Analysis_Type group.
p_values_with_corrections <- function(balanced_accuracy_results, null_distribution) {
    balanced_accuracy_results %>%
        group_by(Study, Disorder, Analysis_Type, group_var) %>%
        group_split() %>%
        purrr::map_df(~ compute_p_values(observed_balanced_accuracy_df = .x,
                                         null_distribution_df = null_distribution)) %>%
        # Adjust p-values by group
        group_by(Study, Disorder, Analysis_Type) %>%
        mutate(p_value_HolmBonferroni = p.adjust(p_value, method="holm"),
               p_value_BenjaminiHochberg = p.adjust(p_value, method="BH"))
}

# Univariate representations
univariate_p_values <- p_values_with_corrections(univariate_balanced_accuracy_results,
                                                 univariate_null_distribution)

# Pairwise representation
pairwise_p_values <- p_values_with_corrections(pairwise_balanced_accuracy_results,
                                               pairwise_null_distribution)

# Combined univariate + pairwise representation
combined_univariate_pairwise_p_values <- p_values_with_corrections(combined_univariate_balanced_accuracy_pairwise_results,
                                                                   combined_univariate_pairwise_null_distribution)


In [None]:
# Write each p-value table to its own feather file.
# reset_index() moves the current index into a column before writing.
p_value_outputs = {
    "data/classification_results/univariate_p_values.feather": univariate_p_values,
    "data/classification_results/pairwise_p_values.feather": pairwise_p_values,
    "data/classification_results/combined_univariate_pairwise_p_values.feather": combined_univariate_pairwise_p_values,
}
for output_path, p_value_table in p_value_outputs.items():
    p_value_table.reset_index().to_feather(output_path)

## Sensitivity analysis with different classifier types

While all previous analyses used a linear SVM classifier, we also considered the possibility that---given the spatiotemporal complexity of these neuropsychiatric disorders---cases and controls may not be linearly separable in feature space.
We therefore repeated A<sub>region</sub>, A<sub>feature</sub>, and A<sub>uni_combo</sub> using an SVM with a (nonlinear) radial basis function (RBF) kernel and using the random forest ensemble classifier, both from 'scikit-learn'.

In [None]:
# Repeat the univariate classification with each nonlinear classifier.
# The loop variable is deliberately not named `classifier_type`: a `for` loop at notebook
# top level rebinds the global, which would leave `classifier_type` set to "RBF_SVM" for
# every cell that runs afterwards.
for nonlinear_classifier_type in ["RandomForest", "RBF_SVM"]:

    # Run univariate classification for UCLA CNP for each disorder
    for disorder in ["SCZ", "BP", "ADHD"]:
        run_univariate_classifier(dataset_ID="UCLA_CNP",
                            data_path=f"{current_path}/data/",
                            metadata=UCLA_CNP_metadata,
                            univariate_feature_data=UCLA_CNP_univariate_features,
                            univariate_first_25_PCs=univariate_combo_first25_PCs,
                            disorder=disorder,
                            univariate_feature_set=univariate_feature_set,
                            pairwise_feature_set=pairwise_feature_set,
                            classifier_type = nonlinear_classifier_type,
                            num_folds = num_folds,
                            num_repeats = num_repeats,
                            num_jobs = num_jobs,
                            num_null_iters = num_null_iters)

    # Run univariate classification for ABIDE
    for disorder in ["ASD"]:
        run_univariate_classifier(dataset_ID="ABIDE",
                                data_path=f"{current_path}/data/",
                                metadata=ABIDE_metadata,
                                univariate_feature_data=ABIDE_univariate_features,
                                univariate_first_25_PCs=univariate_combo_first25_PCs,
                                disorder=disorder,
                                univariate_feature_set=univariate_feature_set,
                                pairwise_feature_set=pairwise_feature_set,
                                classifier_type = nonlinear_classifier_type,
                                num_folds = num_folds,
                                num_repeats = num_repeats,
                                num_jobs = num_jobs,
                                num_null_iters = num_null_iters)

Results from these nonlinear classifiers can be tabulated as follows:

In [8]:
# Per-disorder univariate results for each nonlinear classifier
RBF_SVM_balanced_accuracy_results_list = []
RandomForest_balanced_accuracy_results_list = []

# Visit every (study, disorder) pair listed in study_to_disorder_df
for study, disorder in zip(study_to_disorder_df['Study'], study_to_disorder_df['Disorder']):

    # RBF-kernel SVM results, tagged with study and disorder
    RBF_SVM_univariate_results = pd.read_feather(
        f"data/classification_results/balanced_accuracy/{study}_{disorder}_Univariate_{univariate_feature_set}_RBF_SVM_balanced_accuracy.feather"
    ).assign(Study=study, Disorder=disorder)
    RBF_SVM_balanced_accuracy_results_list.append(RBF_SVM_univariate_results.reset_index(drop=True))

    # Random forest results, tagged with study and disorder
    RandomForest_univariate_results = pd.read_feather(
        f"data/classification_results/balanced_accuracy/{study}_{disorder}_Univariate_{univariate_feature_set}_RandomForest_balanced_accuracy.feather"
    ).assign(Study=study, Disorder=disorder)
    RandomForest_balanced_accuracy_results_list.append(RandomForest_univariate_results.reset_index(drop=True))

# Stack into one balanced accuracy dataframe per classifier; reset_index() keeps each
# table's own row numbers as an `index` column.
RBF_SVM_balanced_accuracy_results = pd.concat(RBF_SVM_balanced_accuracy_results_list, axis=0).reset_index()
RandomForest_balanced_accuracy_results = pd.concat(RandomForest_balanced_accuracy_results_list, axis=0).reset_index()

# Write one feather file per classifier
RBF_SVM_balanced_accuracy_results.to_feather("data/classification_results/all_RBF_SVM_balanced_accuracy_results.feather")
RandomForest_balanced_accuracy_results.to_feather("data/classification_results/all_RandomForest_balanced_accuracy_results.feather")

# Evaluating the effect of inverse probability weighting

We will also fit linear SVM classifiers with no sample weighting to compare against those fit with inverse probability weighting.

In [None]:
# This comparison is specifically about linear SVMs, and the results are later read from
# files named `..._Linear_SVM_no_weighting_...`. Pin the classifier here instead of relying
# on the global `classifier_type`, which any cell that loops over classifier types can rebind.
no_weighting_classifier_type = "Linear_SVM"

# Run univariate classification (no sample weighting) for UCLA CNP for each disorder
for disorder in ["SCZ", "BP", "ADHD"]:
    run_univariate_classifier_no_weighting(dataset_ID="UCLA_CNP",
                        data_path=f"{current_path}/data/",
                        metadata=UCLA_CNP_metadata,
                        univariate_feature_data=UCLA_CNP_univariate_features,
                        univariate_first_25_PCs=univariate_combo_first25_PCs,
                        disorder=disorder,
                        univariate_feature_set=univariate_feature_set,
                        pairwise_feature_set=pairwise_feature_set,
                        classifier_type = no_weighting_classifier_type,
                        num_folds = num_folds,
                        num_repeats = num_repeats,
                        num_jobs = num_jobs,
                        num_null_iters = num_null_iters)

# Run univariate classification (no sample weighting) for ABIDE
for disorder in ["ASD"]:
    run_univariate_classifier_no_weighting(dataset_ID="ABIDE",
                            data_path=f"{current_path}/data/",
                            metadata=ABIDE_metadata,
                            univariate_feature_data=ABIDE_univariate_features,
                            univariate_first_25_PCs=univariate_combo_first25_PCs,
                            disorder=disorder,
                            univariate_feature_set=univariate_feature_set,
                            pairwise_feature_set=pairwise_feature_set,
                            classifier_type = no_weighting_classifier_type,
                            num_folds = num_folds,
                            num_repeats = num_repeats,
                            num_jobs = num_jobs,
                            num_null_iters = num_null_iters)

In [7]:
# Univariate results of the unweighted linear SVM: one table per (study, disorder) pair,
# tagged with its study and disorder and renumbered from zero
Linear_SVM_no_weighting_balanced_accuracy_list = [
    pd.read_feather(f"data/classification_results/balanced_accuracy/{study}_{disorder}_Univariate_{univariate_feature_set}_Linear_SVM_no_weighting_balanced_accuracy.feather")
    .assign(Study=study, Disorder=disorder)
    .reset_index(drop=True)
    for study, disorder in zip(study_to_disorder_df['Study'], study_to_disorder_df['Disorder'])
]

# Stack into one balanced accuracy dataframe; reset_index() keeps each table's own
# row numbers as an `index` column.
stacked_no_weighting_results = pd.concat(Linear_SVM_no_weighting_balanced_accuracy_list, axis=0)
Linear_SVM_no_weighting_balanced_accuracy = stacked_no_weighting_results.reset_index()

# Write the compiled table to a single feather file
Linear_SVM_no_weighting_balanced_accuracy.to_feather("data/classification_results/all_linear_SVM_no_weighting_balanced_accuracy_results.feather")