In [2]:
from junifer.storage import HDF5FeatureStorage
from julearn.api import run_cross_validation
from julearn.pipeline import PipelineCreator
from julearn.viz import plot_scores
from julearn.stats.corrected_ttest import corrected_ttest
import pandas as pd
import seaborn as sns
from sklearn.svm import LinearSVC
import warnings


It is highly recommended to configure Git before using DataLad. Set both 'user.name' and 'user.email' configuration variables.


In [3]:
# Handle to the junifer HDF5 feature store that the feature tables are read from.
storage = HDF5FeatureStorage(uri="./data/AOMIC_Parcels_sch100x7.hdf5")

In [4]:
# Feature tables, looked up in the HDF5 store by feature name.
df_parcellations = storage.read_df("VBM_GM_Schaefer100x7_mean_aggregation")
df_histograms = storage.read_df("VBM_GM_Histogram_100bins_IXI_hist")

# Participant metadata. The ID column is renamed to "subject" because that is
# the key the feature tables are merged on further down.
df_demographics = pd.read_csv("./data/participants.tsv", sep="\t").rename(
    columns={"participant_id": "subject"}
)

In [5]:
# Discard every row that contains at least one missing value, in both
# feature tables.
df_parcellations = df_parcellations.dropna()
df_histograms = df_histograms.dropna()


In [6]:
# --- Parcellation features ---------------------------------------------
# Column labels are cast to strings, captured as the feature list, and only
# then is the table joined with the demographics (so the feature list holds
# no demographic columns).
df_parcellations.columns = df_parcellations.columns.astype(str)
X_parcellations = df_parcellations.columns.tolist()
df_full_parcellations = pd.merge(df_parcellations, df_demographics, on="subject")

# --- Histogram features --------------------------------------------------
# Same three steps for the histogram table.
df_histograms.columns = df_histograms.columns.astype(str)
X_histograms = df_histograms.columns.tolist()
df_full_histograms = pd.merge(df_histograms, df_demographics, on="subject")


In [11]:
# SVM (Support Vector Machine) pipeline: z-score the features, then fit the
# "svm" model with its C parameter searched over a log-uniform range.
creator_svm = PipelineCreator(problem_type="classification")
creator_svm.add("zscore")
creator_svm.add(
    "svm",
    C=(0.001, 100, "log-uniform"),
)

# Inner hyperparameter search: Optuna, evaluated with 4-fold CV.
search_params_svm = {
    "kind": "optuna",
    "cv": 4,
}

scoring = ["balanced_accuracy", "accuracy"]

# `seed` is passed to both runs so that the sampled Optuna trials (and hence
# the reported scores) can be reproduced on a fresh kernel. julearn documents
# `seed` as setting the random seed before any operation.
# NOTE(review): confirm the Optuna sampler picks this seed up in the installed
# julearn/optuna versions.

# SVM on histograms
scores_hists_svm, model_hists_svm, inspector_hists_svm = run_cross_validation(
    X=X_histograms,
    y="sex",
    data=df_full_histograms,
    search_params=search_params_svm,
    model=creator_svm,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=42,
)

# SVM on parcellations
scores_schaefer_svm, model_schaefer_svm, inspector_schaefer_svm = run_cross_validation(
    X=X_parcellations,
    y="sex",
    data=df_full_parcellations,
    search_params=search_params_svm,
    model=creator_svm,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=42,
)





  warn_with_log(

  pipeline = search(  # type: ignore

  new_object = klass(**new_object_params)

[I 2024-09-24 17:52:53,594] A new study created in memory with name: no-name-5f099ac3-e0fc-4e66-8be8-f038b2f4636b
[I 2024-09-24 17:52:53,672] Trial 0 finished with value: 0.6849112426035503 and parameters: {'svm__C': 1.8628545436679005}. Best is trial 0 with value: 0.6849112426035503.
[I 2024-09-24 17:52:53,751] Trial 1 finished with value: 0.71301775147929 and parameters: {'svm__C': 16.88950378390508}. Best is trial 1 with value: 0.71301775147929.
[I 2024-09-24 17:52:53,945] Trial 2 finished with value: 0.7026627218934911 and parameters: {'svm__C': 21.866624163645664}. Best is trial 1 with value: 0.71301775147929.
[I 2024-09-24 17:52:54,031] Trial 3 finished with value: 0.6168639053254438 and parameters: {'svm__C': 0.02626520794101087}. Best is trial 1 with value: 0.71301775147929.
[I 2024-09-24 17:52:54,109] Trial 4 finished with value: 0.6479289940828402 and parameters: {'svm__C': 0.29

In [12]:
# Per-fold scores of the SVM cross-validation run on the parcellation features.
scores_schaefer_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.918239,0.009384,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.840558,0.960547,0.836283,0.960059,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.900648,0.009733,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.78379,0.907967,0.778761,0.908284,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.938295,0.009372,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.827519,0.995583,0.826667,0.995569,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.909972,0.009176,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.79089,0.95796,0.791111,0.957164,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [13]:
# Per-fold scores of the SVM cross-validation run on the histogram features.
scores_hists_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.97537,0.009428,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.74669,0.846556,0.743363,0.847633,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.842501,0.01006,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.710516,0.751374,0.699115,0.760355,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.885955,0.009018,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.732679,0.854739,0.737778,0.855244,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.802027,0.009434,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.675174,0.786308,0.675556,0.790251,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [8]:
# Random Forest pipeline: z-score the features, then fit julearn's "rf" model
# with its default hyperparameters.
creator_rf = PipelineCreator(problem_type="classification")
creator_rf.add("zscore")
creator_rf.add("rf")

# Defined here too, so this cell does not depend on another model cell having
# been executed first.
scoring = ["balanced_accuracy", "accuracy"]

# No `search_params` are passed: the pipeline declares no hyperparameter
# ranges, so the Optuna settings used previously had nothing to tune and the
# fitted estimator was the plain pipeline anyway. To actually tune the model,
# add ranges in `creator_rf.add("rf", ...)` and pass `search_params` again.
#
# `seed` makes the randomised forest fits repeatable across kernel restarts
# (julearn documents it as setting the random seed before any operation).

# Random Forest on histograms
scores_hists_rf, model_hists_rf, inspector_hists_rf = run_cross_validation(
    X=X_histograms,
    y="sex",
    data=df_full_histograms,
    model=creator_rf,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=42,
)

# Random Forest on parcellations
scores_schaefer_rf, model_schaefer_rf, inspector_schaefer_rf = run_cross_validation(
    X=X_parcellations,
    y="sex",
    data=df_full_parcellations,
    model=creator_rf,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=42,
)

  warn_with_log(

  warn_with_log(



In [10]:
# Per-fold scores of the Random Forest cross-validation run on the parcellation features.
scores_schaefer_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.342736,0.006852,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.764344,1.0,0.765487,1.0,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.343481,0.006642,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.766983,1.0,0.761062,1.0,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.342093,0.006731,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.788517,1.0,0.791111,1.0,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.348962,0.006687,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.737555,1.0,0.737778,1.0,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [9]:
# Per-fold scores of the Random Forest cross-validation run on the histogram features.
scores_hists_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.462801,0.007932,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.686318,1.0,0.685841,1.0,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.331367,0.006728,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.696497,1.0,0.685841,1.0,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.335238,0.007177,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.680717,1.0,0.688889,1.0,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.331795,0.006803,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.630887,1.0,0.631111,1.0,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [15]:
# For Extra Trees Classifier
creator_et = PipelineCreator(problem_type="classification")
creator_et.add("zscore")
creator_et.add(
    "et",
)

search_params_et = {
    "kind": "optuna",
    "cv": 4
}

scoring = ["balanced_accuracy", "accuracy"]

# Extra Trees on histograms
scores_hists_et, model_hists_et, inspector_hists_et = run_cross_validation(
    X=X_histograms,
    y='sex',
    data=df_full_histograms,
    search_params=search_params_et,
    model=creator_et,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)

# Extra Trees on parcellations
scores_schaefer_et, model_schaefer_et, inspector_schaefer_et = run_cross_validation(
    X=X_parcellations,
    y='sex',
    data=df_full_parcellations,
    search_params=search_params_et,
    model=creator_et,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)


  warn_with_log(

  warn_with_log(



In [16]:
# Per-fold scores of the Extra Trees cross-validation run on the parcellation features.
scores_schaefer_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.114881,0.007556,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.79587,1.0,0.79646,1.0,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.120438,0.007795,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.743187,1.0,0.734513,1.0,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.11442,0.007344,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.798813,1.0,0.804444,1.0,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.113724,0.007376,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.71958,1.0,0.72,1.0,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [17]:
# Per-fold scores of the Extra Trees cross-validation run on the histogram features.
scores_hists_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.130394,0.009151,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.687579,1.0,0.690265,1.0,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.121063,0.007431,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.696969,1.0,0.685841,1.0,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.12092,0.00744,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.706638,1.0,0.715556,1.0,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.120398,0.007735,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.639697,1.0,0.64,1.0,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [25]:
# For Gradient Boosting Classifier
creator_gb = PipelineCreator(problem_type="classification")
creator_gb.add("zscore")
creator_gb.add(
    "gradientboost",
)

search_params_gb = {
    "kind": "optuna",
    "cv": 4
}

scoring = ["balanced_accuracy", "accuracy"]

# Gradient Boosting on histograms
scores_hists_gb, model_hists_gb, inspector_hists_gb = run_cross_validation(
    X=X_histograms,
    y='sex',
    data=df_full_histograms,
    search_params=search_params_gb,
    model=creator_gb,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)

# Gradient Boosting on parcellations
scores_schaefer_gb, model_schaefer_gb, inspector_schaefer_gb = run_cross_validation(
    X=X_parcellations,
    y='sex',
    data=df_full_parcellations,
    search_params=search_params_gb,
    model=creator_gb,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)


  warn_with_log(

  warn_with_log(



In [26]:
# Per-fold scores of the Gradient Boosting cross-validation run on the parcellation features.
scores_schaefer_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,1.432737,0.004237,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.837721,0.997038,0.836283,0.997041,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,1.423954,0.005536,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.791251,1.0,0.787611,1.0,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,1.446053,0.004122,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.840359,1.0,0.844444,1.0,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,1.44065,0.003774,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.764302,0.998433,0.764444,0.998523,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [27]:
# Per-fold scores of the Gradient Boosting cross-validation run on the histogram features.
scores_hists_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,1.396482,0.006703,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.666535,0.980508,0.663717,0.980769,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,1.388387,0.003819,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.701602,0.981227,0.69469,0.982249,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,1.370233,0.005115,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.667636,0.979227,0.68,0.979321,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,1.37194,0.004955,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.644161,0.979965,0.644444,0.980798,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [29]:
# Tag every score table with a "model" label naming its feature set and
# classifier. NOTE(review): presumably this is the column plot_scores uses to
# tell the tables apart -- confirm against the julearn docs.
for _scores_table, _model_label in (
    (scores_schaefer_svm, "AOMIC_Schaefer_SVM"),
    (scores_hists_svm, "AOMIC_Histograms_SVM"),
    (scores_schaefer_rf, "AOMIC_Schaefer_RF"),
    (scores_hists_rf, "AOMIC_Histograms_RF"),
    (scores_schaefer_et, "AOMIC_Schaefer_ET"),
    (scores_hists_et, "AOMIC_Histograms_ET"),
    (scores_schaefer_gb, "AOMIC_Schaefer_GB"),
    (scores_hists_gb, "AOMIC_Histograms_GB"),
):
    _scores_table["model"] = _model_label


In [30]:
# Plot all eight score tables (four classifiers x two feature sets) together.
plot_scores(
    scores_schaefer_svm,
    scores_hists_svm,
    scores_schaefer_rf,
    scores_hists_rf,
    scores_schaefer_et,
    scores_hists_et,
    scores_schaefer_gb,
    scores_hists_gb,
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'58039fcd-7bda-4ff9-841f-626be0ffd843': {'version…