In [1]:
from junifer.storage import HDF5FeatureStorage
from julearn.api import run_cross_validation
from julearn.pipeline import PipelineCreator
from julearn.viz import plot_scores
from julearn.stats.corrected_ttest import corrected_ttest
import pandas as pd
import seaborn as sns
from sklearn.svm import LinearSVC
import warnings
import numpy as np


It is highly recommended to configure Git before using DataLad. Set both 'user.name' and 'user.email' configuration variables.


In [2]:
# Point junifer's HDF5 feature storage at the local feature file (relative
# path); the feature tables read in the next cell come from this object.
storage = HDF5FeatureStorage(uri='./data/AOMIC_Parcels_sch1000x7.hdf5')

In [3]:
# Feature tables stored under two feature names in the HDF5 storage:
# one read as the parcellation features, one as the histogram features.
df_parcellations = storage.read_df('VBM_GM_Schaefer1000x7_mean_aggregation')
df_histograms = storage.read_df('VBM_GM_Histogram_1000bins_AOMIC_hist')

# Demographics table (tab-separated). Its `participant_id` column is renamed
# to `subject` so it can serve as the merge key with the feature tables.
df_demographics = pd.read_csv('./data/participants.tsv', sep='\t').rename(
    columns={"participant_id": "subject"}
)

In [4]:
# Drop every row that contains at least one missing value (pandas' default
# `dropna` behaviour). The names are re-bound rather than mutated with
# `inplace=True`, and the stray bare `df_parcellations` expression that sat
# in the middle of the cell is removed: it was not the last statement, so it
# never displayed anything.
# NOTE(review): each table is filtered independently, so the two tables are
# not guaranteed to keep the same set of rows - confirm if that matters.
df_parcellations = df_parcellations.dropna()
df_histograms = df_histograms.dropna()

In [5]:
# Disabled alternative to the row dropping in the previous cell: replace NaN
# with 0 instead of discarding rows. Kept for reference only; nothing in this
# cell executes.
#df_parcellations = df_parcellations.replace(np.nan,0)
#df_histograms = df_histograms.replace(np.nan,0)

In [6]:
# Make sure every column label is a string.
df_parcellations.columns = df_parcellations.columns.astype(str)
df_histograms.columns = df_histograms.columns.astype(str)

# Feature names = all columns of each feature table, captured *before* the
# demographic columns are merged in.
# NOTE(review): assumes `subject` is an index level of the feature tables
# (not a regular column), otherwise it would end up in these lists - confirm.
X_parcellations = df_parcellations.columns.tolist()
X_histograms = df_histograms.columns.tolist()

# Attach the demographics to each feature table, keyed on `subject`
# (default inner join).
df_full_parcellations = pd.merge(df_parcellations, df_demographics, on="subject")
df_full_histograms = pd.merge(df_histograms, df_demographics, on="subject")


In [7]:
# SVM pipeline: z-score the features, then fit an SVM whose C is tuned over
# a log-uniform range.
creator_svm = PipelineCreator(problem_type="classification")
creator_svm.add("zscore")
creator_svm.add("svm", C=(0.001, 100, "log-uniform"))

# Hyperparameter search: Optuna, evaluated with 4 folds.
# NOTE(review): no seed is passed anywhere in this cell, so the sampled C
# values (and hence the scores) may differ between runs - confirm whether
# reproducibility is required here.
search_params_svm = dict(kind="optuna", cv=4)

# Metrics reported for every fold.
scoring = ["balanced_accuracy", "accuracy"]

# SVM on the histogram features.
scores_hists_svm, model_hists_svm, inspector_hists_svm = run_cross_validation(
    data=df_full_histograms,
    X=X_histograms,
    y='sex',
    model=creator_svm,
    search_params=search_params_svm,
    cv=4,
    scoring=scoring,
    return_train_score=True,
    return_inspector=True,
)

# SVM on the parcellation features (same pipeline and settings).
scores_schaefer_svm, model_schaefer_svm, inspector_schaefer_svm = run_cross_validation(
    data=df_full_parcellations,
    X=X_parcellations,
    y='sex',
    model=creator_svm,
    search_params=search_params_svm,
    cv=4,
    scoring=scoring,
    return_train_score=True,
    return_inspector=True,
)





  warn_with_log(

  pipeline = search(  # type: ignore

  new_object = klass(**new_object_params)

[I 2024-10-08 10:10:12,603] A new study created in memory with name: no-name-7d77528b-f469-4e15-8597-0964bbeb3d34
[I 2024-10-08 10:10:13,026] Trial 0 finished with value: 0.6551724137931034 and parameters: {'svm__C': 0.14373283462063025}. Best is trial 0 with value: 0.6551724137931034.
[I 2024-10-08 10:10:13,452] Trial 1 finished with value: 0.6652298850574714 and parameters: {'svm__C': 0.284421279273977}. Best is trial 1 with value: 0.6652298850574714.
[I 2024-10-08 10:10:13,943] Trial 2 finished with value: 0.6522988505747127 and parameters: {'svm__C': 0.09519912371384194}. Best is trial 1 with value: 0.6652298850574714.
[I 2024-10-08 10:10:14,342] Trial 3 finished with value: 0.7270114942528736 and parameters: {'svm__C': 73.10752396384675}. Best is trial 3 with value: 0.7270114942528736.
[I 2024-10-08 10:10:14,735] Trial 4 finished with value: 0.7298850574712643 and parameters: {'svm__

In [8]:
# Per-fold scores returned by run_cross_validation for the SVM pipeline on
# the parcellation features.
scores_schaefer_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,3.939487,0.057238,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.892135,1.0,0.892241,1.0,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,3.62483,0.057767,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.897375,1.0,0.896552,1.0,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,4.069512,0.056297,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.901106,1.0,0.896552,1.0,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,3.580998,0.056497,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.89002,1.0,0.892241,1.0,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [9]:
# Per-fold scores returned by run_cross_validation for the SVM pipeline on
# the histogram features.
scores_hists_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,4.096169,0.056149,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.728293,0.947589,0.728448,0.948276,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,3.553833,0.048531,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.739012,0.986743,0.737069,0.987069,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,3.684307,0.047713,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.748777,0.894514,0.75,0.895115,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,3.672113,0.05391,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.707802,0.998512,0.711207,0.998563,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [10]:
# Random Forest pipeline: z-score the features, then fit a random forest with
# fixed hyperparameters.
creator_rf = PipelineCreator(problem_type="classification")
creator_rf.add("zscore")
creator_rf.add(
    "rf",
    max_depth=4,
    n_estimators=100,
)

# Settings shared by both runs below.
# * No `search_params` is passed: every hyperparameter above has a single
#   fixed value, so there is nothing to tune and a search configuration would
#   have no effect.
# * `seed` is forwarded to run_cross_validation so that this stochastic model
#   gives the same scores on every run (julearn documents `seed` as setting
#   the random seed before any operation).
# * `scoring` is the metric list defined in the SVM cell.
cv_options_rf = dict(
    y='sex',
    model=creator_rf,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=42,
)

# Random Forest on histograms
scores_hists_rf, model_hists_rf, inspector_hists_rf = run_cross_validation(
    X=X_histograms,
    data=df_full_histograms,
    **cv_options_rf,
)

# Random Forest on parcellations
scores_schaefer_rf, model_schaefer_rf, inspector_schaefer_rf = run_cross_validation(
    X=X_parcellations,
    data=df_full_parcellations,
    **cv_options_rf,
)

  warn_with_log(

  warn_with_log(



In [11]:
# Per-fold scores returned by run_cross_validation for the Random Forest
# pipeline on the parcellation features.
scores_schaefer_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.531311,0.011133,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.783081,0.898444,0.784483,0.900862,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,0.527701,0.010594,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.783483,0.893658,0.780172,0.897989,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,0.528194,0.01071,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.762286,0.913173,0.771552,0.913793,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,0.529645,0.010744,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.787126,0.907738,0.793103,0.909483,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [12]:
# Per-fold scores returned by run_cross_validation for the Random Forest
# pipeline on the histogram features.
scores_hists_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.45926,0.010973,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.687927,0.810586,0.689655,0.816092,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,0.459645,0.01072,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.615825,0.811051,0.612069,0.817529,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,0.461354,0.010824,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.693422,0.815963,0.702586,0.817529,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,0.459103,0.010699,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.679123,0.808631,0.689655,0.813218,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [13]:
# Extra Trees pipeline: z-score the features, then fit an extra-trees
# classifier with fixed hyperparameters.
creator_et = PipelineCreator(problem_type="classification")
creator_et.add("zscore")
creator_et.add(
    "et",
    max_depth=5,
    n_estimators=100,
)

# Metrics reported for every fold.
scoring = ["balanced_accuracy", "accuracy"]

# Settings shared by both runs below.
# * No `search_params` is passed: every hyperparameter above has a single
#   fixed value, so there is nothing to tune and a search configuration would
#   have no effect.
# * `seed` is forwarded to run_cross_validation so that this stochastic model
#   gives the same scores on every run (julearn documents `seed` as setting
#   the random seed before any operation).
cv_options_et = dict(
    y='sex',
    model=creator_et,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=42,
)

# Extra Trees on histograms
scores_hists_et, model_hists_et, inspector_hists_et = run_cross_validation(
    X=X_histograms,
    data=df_full_histograms,
    **cv_options_et,
)

# Extra Trees on parcellations
scores_schaefer_et, model_schaefer_et, inspector_schaefer_et = run_cross_validation(
    X=X_parcellations,
    data=df_full_parcellations,
    **cv_options_et,
)


  warn_with_log(

  warn_with_log(



In [14]:
# Per-fold scores returned by run_cross_validation for the Extra Trees
# pipeline on the parcellation features.
scores_schaefer_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.131489,0.011641,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.769477,0.88165,0.771552,0.886494,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,0.121942,0.011048,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.742136,0.871091,0.737069,0.877874,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,0.114683,0.010688,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.763227,0.890846,0.775862,0.892241,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,0.11512,0.010656,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.739688,0.881944,0.75,0.885057,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [15]:
# Per-fold scores returned by run_cross_validation for the Extra Trees
# pipeline on the histogram features.
scores_hists_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.11944,0.011113,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.670384,0.77514,0.672414,0.783046,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,0.115075,0.010786,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.599911,0.811018,0.594828,0.820402,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,0.114763,0.010995,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.695341,0.809223,0.706897,0.811782,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,0.117492,0.010916,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.673491,0.791766,0.685345,0.797414,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [16]:
# Gradient Boosting pipeline: z-score the features, then fit a gradient
# boosting classifier with a fixed learning rate.
creator_gb = PipelineCreator(problem_type="classification")
creator_gb.add("zscore")
creator_gb.add(
    "gradientboost",
    learning_rate=0.02,
)

# Metrics reported for every fold.
scoring = ["balanced_accuracy", "accuracy"]

# Settings shared by both runs below.
# * No `search_params` is passed: the only hyperparameter set above has a
#   single fixed value, so there is nothing to tune and a search
#   configuration would have no effect.
# * `seed` is forwarded to run_cross_validation for run-to-run
#   reproducibility (julearn documents `seed` as setting the random seed
#   before any operation).
cv_options_gb = dict(
    y='sex',
    model=creator_gb,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=42,
)

# Gradient Boosting on histograms
scores_hists_gb, model_hists_gb, inspector_hists_gb = run_cross_validation(
    X=X_histograms,
    data=df_full_histograms,
    **cv_options_gb,
)

# Gradient Boosting on parcellations
scores_schaefer_gb, model_schaefer_gb, inspector_schaefer_gb = run_cross_validation(
    X=X_parcellations,
    data=df_full_parcellations,
    **cv_options_gb,
)


  warn_with_log(

  warn_with_log(



In [17]:
# Per-fold scores returned by run_cross_validation for the Gradient Boosting
# pipeline on the parcellation features.
scores_schaefer_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,13.956561,0.009635,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.814674,0.95472,0.814655,0.95546,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,13.977751,0.011185,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.829256,0.948508,0.827586,0.949713,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,13.950534,0.009124,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.805976,0.9523,0.810345,0.952586,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,14.042395,0.011276,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.795256,0.959425,0.801724,0.95977,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [18]:
# Per-fold scores returned by run_cross_validation for the Gradient Boosting
# pipeline on the histogram features.
scores_hists_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,12.306613,0.010894,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.727104,0.877577,0.728448,0.880747,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,12.391035,0.010731,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.615156,0.87032,0.612069,0.873563,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,12.339492,0.009117,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.701249,0.883115,0.702586,0.883621,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,12.402473,0.010682,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.702693,0.874405,0.706897,0.876437,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [19]:
# Score tables read from CSV files rather than computed in this notebook
# (presumably produced by a separate XGBoost run - verify provenance).
# The files are read in the listed order: histograms first, then parcellations.
scores_hists_xgb, scores_schaefer_xgb = (
    pd.read_csv(csv_name)
    for csv_name in ('XGB_scores_hists.csv', 'XGB_scores_shaefer.csv')
)

In [20]:
# Scores loaded from 'XGB_scores_shaefer.csv' (parcellation features).
scores_schaefer_xgb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,2.612153,0.119446,"Pipeline(steps=[('set_column_types', SetColumn...",0.84493,0.99013,0.844828,0.989943,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,2.498094,0.118337,"Pipeline(steps=[('set_column_types', SetColumn...",0.833457,0.987291,0.831897,0.987069,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,2.517852,0.118061,"Pipeline(steps=[('set_column_types', SetColumn...",0.81956,0.984116,0.823276,0.984195,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,2.537237,0.117552,"Pipeline(steps=[('set_column_types', SetColumn...",0.818192,0.991369,0.823276,0.991379,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [21]:
# Scores loaded from 'XGB_scores_hists.csv' (histogram features).
scores_hists_xgb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,2.108707,0.117843,"Pipeline(steps=[('set_column_types', SetColumn...",0.74435,0.932765,0.74569,0.933908,696,232,0,0,bc7087515161a73a5a6aff57863f3803
1,1.973933,0.121224,"Pipeline(steps=[('set_column_types', SetColumn...",0.631963,0.916772,0.62931,0.918103,696,232,0,1,bc7087515161a73a5a6aff57863f3803
2,1.988173,0.118174,"Pipeline(steps=[('set_column_types', SetColumn...",0.685745,0.948062,0.685345,0.948276,696,232,0,2,bc7087515161a73a5a6aff57863f3803
3,1.966928,0.118275,"Pipeline(steps=[('set_column_types', SetColumn...",0.703215,0.931845,0.706897,0.932471,696,232,0,3,bc7087515161a73a5a6aff57863f3803


In [22]:
# Tag every score table with a `model` label so the tables can be told apart
# when they are plotted together. Each table is modified in place.
labelled_score_tables = (
    (scores_schaefer_svm, 'AOMIC_Schaefer_SVM'),
    (scores_hists_svm, 'AOMIC_Histograms_SVM'),
    (scores_schaefer_rf, 'AOMIC_Schaefer_RF'),
    (scores_hists_rf, 'AOMIC_Histograms_RF'),
    (scores_schaefer_et, 'AOMIC_Schaefer_ET'),
    (scores_hists_et, 'AOMIC_Histograms_ET'),
    (scores_schaefer_gb, 'AOMIC_Schaefer_GB'),
    (scores_hists_gb, 'AOMIC_Histograms_GB'),
    (scores_schaefer_xgb, 'AOMIC_Schaefer_XGB'),
    (scores_hists_xgb, 'AOMIC_Histograms_XGB'),
)
for score_table, model_label in labelled_score_tables:
    score_table['model'] = model_label


In [23]:
# Compare all ten labelled score tables in one julearn score plot.
# The order (parcellation table, then histogram table, per model) is the
# order in which the tables are passed to plot_scores.
score_tables_to_plot = [
    scores_schaefer_svm,
    scores_hists_svm,
    scores_schaefer_rf,
    scores_hists_rf,
    scores_schaefer_et,
    scores_hists_et,
    scores_schaefer_gb,
    scores_hists_gb,
    scores_schaefer_xgb,
    scores_hists_xgb,
]
plot_scores(*score_tables_to_plot)

BokehModel(combine_events=True, render_bundle={'docs_json': {'b2e2f58b-90dd-4c80-a5a0-6b91ee3cc568': {'version…