In [1]:

import os
import sys
import numpy as np
import nibabel as nib
import pandas as pd
from sklearn import svm
from sklearn.pipeline import Pipeline
import os.path
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate, permutation_test_score, StratifiedKFold, GridSearchCV, RepeatedKFold
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score,make_scorer
from copy import deepcopy
import argparse

from core_classification_functions import *
current_path = os.getcwd()
from mixed_sigmoid_normalisation import MixedSigmoidScaler

# Root directory holding the input data and classification results.
# Defaults to the original author's local path; set the FMRI_CLASSIFICATION_DATA_PATH
# environment variable to run the notebook on another machine without editing this cell.
data_path = os.environ.get("FMRI_CLASSIFICATION_DATA_PATH", "/Users/abry4213/data/fMRI_classification/")

%load_ext rpy2.ipython

In [2]:
%%R
# Load the R packages used by the %%R plotting cells in this notebook and make
# cowplot's theme the default ggplot2 theme. Startup messages are suppressed to
# keep the cell output short.
suppressPackageStartupMessages({
    library(tidyverse)
    library(see)
    library(cowplot)
    theme_set(theme_cowplot())
})

package ‘see’ was built under R version 4.3.3 


In [3]:
# Table of samples to retain; every table loaded below is restricted to these Sample_IDs
UCLA_CNP_subjects_to_keep = pd.read_feather(
    f"{data_path}/time_series_features/UCLA_CNP_filtered_sample_info_catch25_pyspi14.feather"
)

# Sample metadata, tagged with the study name and filtered to retained samples
UCLA_CNP_metadata = (
    pd.read_feather(f"{data_path}/input_data/UCLA_CNP_sample_metadata.feather")
    .assign(Study="UCLA_CNP")
    .loc[lambda df: df["Sample_ID"].isin(UCLA_CNP_subjects_to_keep["Sample_ID"])]
)

# Head movement table (comma-separated text), tagged and filtered the same way
UCLA_CNP_head_mvmt = (
    pd.read_table(f"{data_path}/movement_data/UCLA_CNP_Mean_FD_Power.txt", sep=',')
    .assign(Study="UCLA_CNP")
    .loc[lambda df: df["Sample_ID"].isin(UCLA_CNP_subjects_to_keep["Sample_ID"])]
)

# Inner join of metadata and head movement on all columns the two tables share
merged_metadata = UCLA_CNP_metadata.merge(UCLA_CNP_head_mvmt)

# Disorder -> study lookup; every disorder here maps to the UCLA_CNP study
study_disorder_lookup = dict.fromkeys(['SCZ', 'BP', 'ADHD'], 'UCLA_CNP')

In [4]:
# Feature annotation tables for the univariate and pairwise features
univariate_feature_info = pd.read_csv("../../data/feature_info/univariate_feature_info.csv")
pairwise_feature_info = pd.read_csv("../../data/feature_info/pairwise_feature_info.csv")

# SPI directionality table, plus a dict built from its rows
# (dict() over the row array requires the table to have exactly two columns: key, value)
SPI_directionality_data = pd.read_csv("SPI_Direction_Info.csv")
SPI_directionality_dict = dict(SPI_directionality_data.to_numpy())

In [5]:
# Cross-validation layout: 10 stratified folds repeated 10 times with a fixed seed
num_folds, num_repeats = 10, 10
# presumably the number of null (permutation) iterations -- TODO confirm against the helper module
num_null_iters = 1000
RepeatedStratifiedKFold_splitter = RepeatedStratifiedKFold(
    n_splits=num_folds,
    n_repeats=num_repeats,
    random_state=127,
)

# Classifier: linear-kernel SVM with balanced class weights, applied after mixed-sigmoid scaling
model = svm.SVC(kernel="linear", C=1, class_weight="balanced")
pipe = Pipeline(steps=[
    ('scaler', MixedSigmoidScaler(unit_variance=True)),
    ('model', model),
])

# Scoring: balanced accuracy, with the column name used for it in the results
scoring_names = ["Balanced_Accuracy"]
scorers = [make_scorer(balanced_accuracy_score)]


In [6]:
# Dataset whose time-series features are loaded, and the metadata table joined onto them
dataset_ID = "UCLA_CNP"
sample_metadata = UCLA_CNP_metadata

# catch25 (univariate) features: join sample metadata, keep only samples present in the metadata
catch25_data = (
    pd.read_feather(f"{data_path}/time_series_features/{dataset_ID}_catch25_filtered.feather")
    .merge(sample_metadata)
    .loc[lambda df: df["Sample_ID"].isin(sample_metadata["Sample_ID"])]
)

# pyspi14 (pairwise) features: same join and filter
pyspi14_data = (
    pd.read_feather(f"{data_path}/time_series_features/{dataset_ID}_pyspi14_filtered.feather")
    .merge(sample_metadata)
    .loc[lambda df: df["Sample_ID"].isin(sample_metadata["Sample_ID"])]
)

In [7]:
# Label every univariate value with "<Brain_Region>_<feature name>"
# (the new column is added to catch25_data itself, as before)
catch25_data["Combo_Feature"] = catch25_data["Brain_Region"] + "_" + catch25_data["names"]
combo_data = catch25_data.drop(columns=["Brain_Region", "names"])

# Long -> wide: one row per (Sample_ID, Diagnosis), one column per region/feature combination
combo_data_wide = combo_data.pivot(
    index=["Sample_ID", "Diagnosis"],
    columns="Combo_Feature",
    values="values",
)

# Plain numeric matrix of the combination features (row order follows combo_data_wide)
combo_features_only = combo_data_wide.to_numpy()

### Baseline: pyspi14 SPIs on their own, and combined with the univariate (catch25) features

In [17]:
# Baseline classification, run once per pyspi14 SPI, in two variants:
#   (1) "pyspi14_SPI": the SPI's region-pair values alone
#   (2) "SPI_Combo":   the SPI's region-pair values concatenated with the univariate
#                      region x feature matrix (combo_data_wide)
# Fold-level results are cached as feather files; the classification loop only runs when
# either cache file is missing. Fold-level results are then aggregated (mean and SD of
# balanced accuracy) and saved.
SPI_alone_all_folds_file = f"{data_path}/classification_results/pyspi14_SPI_alone_classification_res_df_all_folds.feather"
SPI_and_univariate_all_folds_file = f"{data_path}/classification_results/pyspi14_SPI_and_univariate_classification_res_df_all_folds.feather"


def _pivot_SPI_to_wide(SPI_data, directed):
    """Return a wide (Sample_ID, Diagnosis) x region_pair table for one SPI, mean-imputed.

    Parameters
    ----------
    SPI_data : pd.DataFrame
        Long-format rows for a single SPI; the columns read here are Sample_ID, Diagnosis,
        brain_region_from, brain_region_to and value.
    directed : bool
        If True, "A_B" and "B_A" are kept as distinct region pairs. If False, the two region
        names are sorted before joining and only the first row per (Sample_ID, region_pair)
        is kept.

    Returns
    -------
    pd.DataFrame
        Wide table indexed by (Sample_ID, Diagnosis); NaNs are replaced by the column mean.
    """
    if directed:
        region_pair = SPI_data.brain_region_from + "_" + SPI_data.brain_region_to
    else:
        region_pair = ['_'.join(sorted(pair))
                       for pair in SPI_data[["brain_region_from", "brain_region_to"]].values.tolist()]

    SPI_data_paired = (SPI_data
                       .assign(region_pair=region_pair)
                       .drop(["brain_region_from", "brain_region_to"], axis=1))
    if not directed:
        SPI_data_paired = SPI_data_paired.drop_duplicates(ignore_index=True, subset=['Sample_ID', 'region_pair'])

    SPI_data_wide = SPI_data_paired.pivot(index=['Sample_ID', 'Diagnosis'], columns='region_pair', values='value')

    # Impute any NaN with the column mean
    return SPI_data_wide.fillna(SPI_data_wide.mean())


def _classify_and_annotate(feature_data, class_labels, sample_IDs, analysis_type, grouping_var):
    """Run the repeated k-fold classifier on one feature matrix and label the results.

    Uses the notebook-level pipe, CV splitter, scorers and fold/repeat counts. Null iterations
    are disabled (num_null_iters=0), so only the first returned object is used.

    Returns
    -------
    pd.DataFrame
        The results returned by run_k_fold_classifier_for_feature, with Study, Disorder,
        Analysis_Type, group_var and Classifier_Type columns added.
    """
    fold_res, _, _ = run_k_fold_classifier_for_feature(feature_data = feature_data,
                                                       pipe = pipe,
                                                       CV_splitter = RepeatedStratifiedKFold_splitter,
                                                       class_labels = class_labels,
                                                       sample_IDs = sample_IDs,
                                                       scorers = scorers,
                                                       scoring_names = scoring_names,
                                                       num_null_iters = 0,
                                                       num_folds = num_folds,
                                                       num_repeats = num_repeats,
                                                       num_jobs = 2)

    # Assign key details to the results
    fold_res["Study"] = "UCLA_CNP"
    fold_res["Disorder"] = "All"
    fold_res["Analysis_Type"] = analysis_type
    fold_res["group_var"] = grouping_var
    fold_res["Classifier_Type"] = "Linear_SVM"
    return fold_res


def _aggregate_across_folds(all_folds_df):
    """Return mean and SD of Balanced_Accuracy per study/disorder/analysis/SPI/classifier.

    Named aggregation followed by a single reset_index() gives the same columns on every
    pandas version. Combining as_index=False with a list aggregation and reset_index()
    can add a spurious "index" column, depending on the pandas version.
    """
    return (all_folds_df
            .groupby(["Study", "Disorder", "Analysis_Type", "group_var", "Classifier_Type"])['Balanced_Accuracy']
            .agg(Balanced_Accuracy='mean', Balanced_Accuracy_SD='std')
            .reset_index())


if not (os.path.isfile(SPI_alone_all_folds_file) and os.path.isfile(SPI_and_univariate_all_folds_file)):

    pyspi14_SPI_alone_classification_res_list = []
    pyspi14_SPI_and_univariate_classification_res_list = []

    for pyspi14_SPI in pyspi14_data.SPI.unique():

        # Subset data to this SPI and reshape to samples x region pairs
        pyspi14_SPI_data = pyspi14_data.query("SPI == @pyspi14_SPI").drop(["SPI"], axis=1)
        is_directed = SPI_directionality_dict[pyspi14_SPI] == "Directed"
        pyspi14_SPI_data_imputed = _pivot_SPI_to_wide(pyspi14_SPI_data, directed=is_directed)

        # Class labels and sample IDs, in the row order of the wide table
        class_labels = pyspi14_SPI_data_imputed.index.get_level_values("Diagnosis").to_numpy()
        sample_IDs = pyspi14_SPI_data_imputed.index.get_level_values("Sample_ID").to_numpy()

        ############################# ONLY SPIS ##################################
        pyspi14_SPIs_only = pyspi14_SPI_data_imputed.to_numpy()

        pyspi14_SPI_alone_classification_res_list.append(
            _classify_and_annotate(feature_data = pyspi14_SPIs_only,
                                   class_labels = class_labels,
                                   sample_IDs = sample_IDs,
                                   analysis_type = "pyspi14_SPI",
                                   grouping_var = pyspi14_SPI))

        ############################# SPIS + UNIVARIATE ##################################
        # Align the univariate matrix to this SPI's rows by (Sample_ID, Diagnosis) rather than
        # by position, so a differing sample set cannot silently pair features from different
        # subjects. Fail loudly if the univariate table lacks a sample.
        missing_samples = pyspi14_SPI_data_imputed.index.difference(combo_data_wide.index)
        if len(missing_samples) > 0:
            raise ValueError(f"{len(missing_samples)} sample(s) for SPI {pyspi14_SPI} have no univariate "
                             f"features: {list(missing_samples)[:5]}")
        univariate_features_aligned = combo_data_wide.reindex(pyspi14_SPI_data_imputed.index).to_numpy()

        pyspi14_SPI_and_univariate_data_combined = np.concatenate([pyspi14_SPIs_only, univariate_features_aligned], axis=1)

        pyspi14_SPI_and_univariate_classification_res_list.append(
            _classify_and_annotate(feature_data = pyspi14_SPI_and_univariate_data_combined,
                                   class_labels = class_labels,
                                   sample_IDs = sample_IDs,
                                   analysis_type = "SPI_Combo",
                                   grouping_var = pyspi14_SPI))

    pyspi14_SPI_alone_classification_res_df = pd.concat(pyspi14_SPI_alone_classification_res_list).reset_index(drop=True)
    pyspi14_SPI_and_univariate_classification_res_df = pd.concat(pyspi14_SPI_and_univariate_classification_res_list).reset_index(drop=True)

    # Save the fold-level results
    pyspi14_SPI_alone_classification_res_df.to_feather(SPI_alone_all_folds_file)
    pyspi14_SPI_and_univariate_classification_res_df.to_feather(SPI_and_univariate_all_folds_file)

else:
    pyspi14_SPI_alone_classification_res_df = pd.read_feather(SPI_alone_all_folds_file)
    pyspi14_SPI_and_univariate_classification_res_df = pd.read_feather(SPI_and_univariate_all_folds_file)

# Aggregate across folds and save the results
pyspi14_SPI_alone_classification_res_df_agg = _aggregate_across_folds(pyspi14_SPI_alone_classification_res_df)
pyspi14_SPI_and_univariate_classification_res_df_agg = _aggregate_across_folds(pyspi14_SPI_and_univariate_classification_res_df)

pyspi14_SPI_alone_classification_res_df_agg.to_feather(f"{data_path}/classification_results/pyspi14_SPI_alone_classification_res_df.feather")
pyspi14_SPI_and_univariate_classification_res_df_agg.to_feather(f"{data_path}/classification_results/pyspi14_SPI_and_univariate_classification_res_df.feather")

In [21]:
%%R -i pyspi14_SPI_alone_classification_res_df,pairwise_feature_info

# Plot balanced accuracy for the SPI-only classifiers, using the fold-level results
# imported from Python via -i. group_var is renamed to pyspi_name so the figure labels
# (Figure_name) can be joined from pairwise_feature_info; SPIs are ordered by their mean
# balanced accuracy. Each SPI gets a half violin, a black crossbar at the mean, and
# jittered points; coord_flip() puts the SPIs on the vertical axis.
pyspi14_SPI_alone_classification_res_df %>% 
        dplyr::rename("pyspi_name"="group_var") %>% 
        left_join(pairwise_feature_info, by="pyspi_name") %>%
    ungroup() %>% 
    mutate(Figure_name = fct_reorder(Figure_name, Balanced_Accuracy, .fun=mean)) %>%
    ggplot(data=., mapping=aes(y=100*Balanced_Accuracy, x=Figure_name, color=Figure_name, fill=Figure_name)) +
    coord_flip() +
    geom_violinhalf(scale="width", position = position_nudge(x=0.25), alpha=0.5)  +
    stat_summary(color="black", fun="mean", geom="crossbar", width=0.5, linewidth=0.3, 
            show.legend=FALSE, position = position_nudge(x=0.5)) +
    geom_point( position = position_jitter(width = 0.1),
                size = 1.5, alpha=0.6, stroke=0) +
    theme(legend.position="none", plot.title = element_text(hjust=0.5)) +
    xlab("SPI") +
    ylab("Balanced Accuracy (%)") +
    ggtitle("Cross-Disorder Performance by SPI")

# No plot argument is given, so ggsave() writes the most recent plot (the one built above)
ggsave("../../plots/cross_disorder/SPI_classification_across_disorders.svg", width=6, height=9, units="in", dpi=300)

In [22]:
%%R -i pyspi14_SPI_and_univariate_classification_res_df,pairwise_feature_info

# Plot balanced accuracy for the SPI + univariate ("SPI-Combo") classifiers, using the
# fold-level results imported from Python via -i. group_var is renamed to pyspi_name so
# the figure labels (Figure_name) can be joined from pairwise_feature_info; SPIs are
# ordered by their mean balanced accuracy. Each SPI gets a half violin, a black crossbar
# at the mean, and jittered points; coord_flip() puts the SPIs on the vertical axis.
pyspi14_SPI_and_univariate_classification_res_df %>% 
        dplyr::rename("pyspi_name"="group_var") %>% 
        left_join(pairwise_feature_info, by="pyspi_name") %>%
    ungroup() %>% 
    mutate(Figure_name = fct_reorder(Figure_name, Balanced_Accuracy, .fun=mean)) %>%
    ggplot(data=., mapping=aes(y=100*Balanced_Accuracy, x=Figure_name, color=Figure_name, fill=Figure_name)) +
    coord_flip() +
    geom_violinhalf(scale="width", position = position_nudge(x=0.25), alpha=0.5)  +
    stat_summary(color="black", fun="mean", geom="crossbar", width=0.5, linewidth=0.3, 
            show.legend=FALSE, position = position_nudge(x=0.5)) +
    geom_point( position = position_jitter(width = 0.1),
                size = 1.5, alpha=0.6, stroke=0) +
    theme(legend.position="none", plot.title = element_text(hjust=0.5)) +
    xlab("SPI") +
    ylab("Balanced Accuracy (%)") +
    ggtitle("Cross-Disorder Performance by SPI-Combo")


# No plot argument is given, so ggsave() writes the most recent plot (the one built above)
ggsave("../../plots/cross_disorder/SPI_combo_classification_across_disorders.svg", width=6, height=9, units="in", dpi=300)