In [1]:
from junifer.storage import HDF5FeatureStorage
from julearn.api import run_cross_validation
from julearn.pipeline import PipelineCreator
from julearn.viz import plot_scores
from julearn.stats.corrected_ttest import corrected_ttest
import pandas as pd
import seaborn as sns
from sklearn.svm import LinearSVC
import warnings

It is highly recommended to configure Git before using DataLad. Set both 'user.name' and 'user.email' configuration variables.


In [2]:
storage_parcels = HDF5FeatureStorage(uri='data/AOMICID1000_vbm_parcellations_sch100.hdf5')
storage_hists = HDF5FeatureStorage(uri='data/AOMICID1000_vbm_histogram.hdf5')

In [5]:
df_parcellations = storage_parcels.read_df('VBM_GM_Schaefer100x17_counts_aggregation')
df_histograms = storage_hists.read_df('VBM_GM_hist_hist')
df_demographics = pd.read_csv('data/participants.tsv',sep='\t')
df_demographics.rename(columns={"participant_id": "subject"}, inplace=True)

In [7]:
df_parcellations.columns = df_parcellations.columns.astype(str)
df_histograms.columns = df_histograms.columns.astype(str)

# List of columns for each dataframe
X_parcellations = list(df_parcellations.columns)
X_histograms = list(df_histograms.columns)

# Merge with df_demographics on 'subject'
df_full_parcellations = df_parcellations.merge(df_demographics, on="subject")
df_full_histograms = df_histograms.merge(df_demographics, on="subject")

# Map 'sex' column
df_full_parcellations['sex'] = df_full_parcellations['sex'].map({'F': 1, 'M': 0})
df_full_histograms['sex'] = df_full_histograms['sex'].map({'F': 1, 'M': 0})
\
# Drop rows with NaN values
df_full_parcellations = df_full_parcellations.dropna()
df_full_histograms = df_full_histograms.dropna()


In [9]:
creator = PipelineCreator(problem_type="classification")
creator.add("zscore")
creator.add(
    "svm",
    C=(0.001, 100, "log-uniform")
)

<julearn.pipeline.pipeline_creator.PipelineCreator at 0x7fa90d613b50>

In [28]:
search_params = {
    "kind": "optuna",
    "cv": 4,
    "scoring":'accuracy'

}

scoring = ["balanced_accuracy", "accuracy"]
scores_hists, model_hists, inspector_hists = run_cross_validation(
    X=X_histograms,
    y='sex',
    data=df_full_histograms,
    search_params=search_params,
    model=creator,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring = scoring,
)
scores_schaefer, model_schaefer, inspector_schaefer = run_cross_validation(
    X=X_parcellations,
    y='sex',
    data=df_full_parcellations,
    search_params=search_params,
    model=creator,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring = scoring,
)

  warn_with_log(

  pipeline = search(  # type: ignore

  new_object = klass(**new_object_params)

[I 2024-08-06 15:58:39,795] A new study created in memory with name: no-name-e8c5f84d-aa35-441c-8958-fc866d66d0c3
[I 2024-08-06 15:58:39,833] Trial 0 finished with value: 0.6071428571428572 and parameters: {'svm__C': 0.005765082804076596}. Best is trial 0 with value: 0.6071428571428572.
[I 2024-08-06 15:58:39,869] Trial 1 finished with value: 0.511904761904762 and parameters: {'svm__C': 42.96430419424499}. Best is trial 0 with value: 0.6071428571428572.
[I 2024-08-06 15:58:39,902] Trial 2 finished with value: 0.5178571428571429 and parameters: {'svm__C': 30.77056262078555}. Best is trial 0 with value: 0.6071428571428572.
[I 2024-08-06 15:58:39,933] Trial 3 finished with value: 0.6071428571428572 and parameters: {'svm__C': 0.2513712855257753}. Best is trial 0 with value: 0.6071428571428572.
[I 2024-08-06 15:58:39,963] Trial 4 finished with value: 0.6071428571428572 and parameters: {'svm__C

In [29]:
scores_schaefer

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.317508,0.003488,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.5,0.5,0.589286,0.565476,168,56,0,0,bc7087515161a73a5a6aff57863f3803
1,0.311848,0.003448,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.5,0.5,0.517857,0.589286,168,56,0,1,bc7087515161a73a5a6aff57863f3803
2,0.337035,0.003421,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.5,0.5,0.642857,0.547619,168,56,0,2,bc7087515161a73a5a6aff57863f3803
3,0.321642,0.003526,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.5,0.5,0.535714,0.583333,168,56,0,3,bc7087515161a73a5a6aff57863f3803


In [30]:
scores_hists

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.328854,0.003515,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.5,0.5,0.464286,0.607143,168,56,0,0,bc7087515161a73a5a6aff57863f3803
1,0.327285,0.003468,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.5,0.5,0.517857,0.589286,168,56,0,1,bc7087515161a73a5a6aff57863f3803
2,0.327572,0.003599,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.4875,0.683523,0.428571,0.678571,168,56,0,2,bc7087515161a73a5a6aff57863f3803
3,0.32445,0.003435,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.468379,0.986301,0.482143,0.988095,168,56,0,3,bc7087515161a73a5a6aff57863f3803


In [31]:
scores_hists['model'] = 'AOMIC_Histograms'
scores_schaefer['model'] = 'AOMIC_Schaefer'
plot_scores(scores_schaefer,scores_hists)

BokehModel(combine_events=True, render_bundle={'docs_json': {'6d862557-89b3-4cfb-8c0b-fb1b37dc00e5': {'version…

In [32]:
stats_df = corrected_ttest(scores_schaefer,scores_hists)
print(stats_df)

                    metric    t-stat     p-val         model_1  \
0   test_balanced_accuracy  0.966770  0.404972  AOMIC_Schaefer   
1  train_balanced_accuracy -0.955366  0.409873  AOMIC_Schaefer   
2            test_accuracy  1.385870  0.259838  AOMIC_Schaefer   
3           train_accuracy -1.038405  0.375425  AOMIC_Schaefer   

            model_2  p-val-corrected  
0  AOMIC_Histograms         0.404972  
1  AOMIC_Histograms         0.409873  
2  AOMIC_Histograms         0.259838  
3  AOMIC_Histograms         0.375425  
