In [1]:
from junifer.storage import HDF5FeatureStorage
from julearn.api import run_cross_validation
from julearn.pipeline import PipelineCreator
from julearn.viz import plot_scores
from julearn.stats.corrected_ttest import corrected_ttest
import pandas as pd
import seaborn as sns
from sklearn.svm import LinearSVC
import warnings
import numpy as np


It is highly recommended to configure Git before using DataLad. Set both 'user.name' and 'user.email' configuration variables.


In [2]:
# Open the HDF5 feature store that the feature tables are read from.
storage = HDF5FeatureStorage(
    uri="./data/AOMIC_Parcels_sch1000x7.hdf5",
)

In [3]:
# Feature tables, looked up in the store by feature name.
df_parcellations = storage.read_df("VBM_GM_Schaefer1000x7_mean_aggregation")
df_histograms = storage.read_df("VBM_GM_Histogram_1000bins_AOMIC_hist")

# Participant metadata (tab-separated). The ID column is renamed to "subject",
# which is the key the feature tables are merged on.
df_demographics = pd.read_csv("./data/participants.tsv", sep="\t").rename(
    columns={"participant_id": "subject"}
)

In [5]:
# Substitute 0 for every missing value in both feature tables.
df_parcellations, df_histograms = (
    frame.replace(np.nan, 0) for frame in (df_parcellations, df_histograms)
)

In [6]:
# Make every column label a string in both feature tables.
for feature_frame in (df_parcellations, df_histograms):
    feature_frame.columns = feature_frame.columns.astype(str)

# Names of the feature columns, captured before the demographic columns are added.
X_parcellations = df_parcellations.columns.tolist()
X_histograms = df_histograms.columns.tolist()

# Attach the demographics to each feature table, matching rows on "subject".
df_full_parcellations = pd.merge(df_parcellations, df_demographics, on="subject")
df_full_histograms = pd.merge(df_histograms, df_demographics, on="subject")


In [7]:
# Support Vector Machine: z-score the features, then fit an SVM whose
# regularisation strength C is searched between 0.001 and 100 (log-uniform).
creator_svm = PipelineCreator(problem_type="classification")
creator_svm.add("zscore")
creator_svm.add(
    "svm",
    C=(0.001, 100, "log-uniform"),
)

# Inner hyperparameter search: Optuna with 4-fold CV.
# random_state is forwarded to the searcher so the sampled C values (and
# therefore the reported scores) are the same on every run; without it each
# execution of this cell explores different values of C.
search_params_svm = {
    "kind": "optuna",
    "cv": 4,
    "random_state": 42,
}

# Metrics reported for every outer fold.
scoring = ["balanced_accuracy", "accuracy"]

# SVM on histograms: predict 'sex' from the histogram feature columns.
scores_hists_svm, model_hists_svm, inspector_hists_svm = run_cross_validation(
    X=X_histograms,
    y='sex',
    data=df_full_histograms,
    search_params=search_params_svm,
    model=creator_svm,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)

# SVM on parcellations: same pipeline and search, parcellation feature columns.
scores_schaefer_svm, model_schaefer_svm, inspector_schaefer_svm = run_cross_validation(
    X=X_parcellations,
    y='sex',
    data=df_full_parcellations,
    search_params=search_params_svm,
    model=creator_svm,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)





  warn_with_log(

  pipeline = search(  # type: ignore

  new_object = klass(**new_object_params)

[I 2024-09-27 00:19:20,813] A new study created in memory with name: no-name-cb181df3-b461-4eda-aea4-fe146f6a48f2
[I 2024-09-27 00:19:21,237] Trial 0 finished with value: 0.5251810510929507 and parameters: {'svm__C': 0.0019168269962465075}. Best is trial 0 with value: 0.5251810510929507.
[I 2024-09-27 00:19:21,605] Trial 1 finished with value: 0.6632781874958474 and parameters: {'svm__C': 0.28899549172769196}. Best is trial 1 with value: 0.6632781874958474.
[I 2024-09-27 00:19:21,978] Trial 2 finished with value: 0.68487974220982 and parameters: {'svm__C': 0.5029273581662591}. Best is trial 2 with value: 0.68487974220982.
[I 2024-09-27 00:19:22,387] Trial 3 finished with value: 0.7266294598365557 and parameters: {'svm__C': 81.06046752853517}. Best is trial 3 with value: 0.7266294598365557.
[I 2024-09-27 00:19:22,798] Trial 4 finished with value: 0.6388279848515048 and parameters: {'svm__C

In [8]:
# Per-fold cross-validation scores of the SVM on the parcellation features.
scores_schaefer_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,4.035377,0.050522,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.90061,1.0,0.900862,1.0,695,232,0,0,bc7087515161a73a5a6aff57863f3803
1,3.54131,0.051041,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.888748,1.0,0.887931,1.0,695,232,0,1,bc7087515161a73a5a6aff57863f3803
2,3.796668,0.048857,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.905961,1.0,0.900862,1.0,695,232,0,2,bc7087515161a73a5a6aff57863f3803
3,3.625155,0.049507,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.894535,1.0,0.896104,1.0,696,231,0,3,bc7087515161a73a5a6aff57863f3803


In [9]:
# Per-fold cross-validation scores of the SVM on the histogram features.
scores_hists_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,4.069626,0.055972,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.732084,0.880116,0.732759,0.882014,695,232,0,0,bc7087515161a73a5a6aff57863f3803
1,3.488638,0.056792,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.716889,1.0,0.715517,1.0,695,232,0,1,bc7087515161a73a5a6aff57863f3803
2,3.813426,0.055154,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.712915,1.0,0.711207,1.0,695,232,0,2,bc7087515161a73a5a6aff57863f3803
3,3.601961,0.052092,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.711382,1.0,0.714286,1.0,696,231,0,3,bc7087515161a73a5a6aff57863f3803


In [10]:
# Random Forest: z-score the features, then fit a forest with fixed
# hyperparameters (100 trees, depth capped at 4).
creator_rf = PipelineCreator(problem_type="classification")
creator_rf.add("zscore")
creator_rf.add(
    "rf",
    max_depth=4,
    n_estimators=100,
    random_state=42,  # fixed seed so the forest is the same on every run
)

# NOTE(review): every hyperparameter above is a single fixed value, so this
# grid search appears to have nothing to tune -- confirm whether a parameter
# grid was intended or whether search_params can be dropped.
search_params_rf = {
    "kind": "grid",
    "cv": 4
}

# Metrics reported for every outer fold. Defined in this cell (as the Extra
# Trees and Gradient Boosting cells do) so it does not rely on the SVM cell
# having been run first.
scoring = ["balanced_accuracy", "accuracy"]

# Random Forest on histograms: predict 'sex' from the histogram feature columns.
scores_hists_rf, model_hists_rf, inspector_hists_rf = run_cross_validation(
    X=X_histograms,
    y='sex',
    data=df_full_histograms,
    search_params=search_params_rf,
    model=creator_rf,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)

# Random Forest on parcellations: same pipeline, parcellation feature columns.
scores_schaefer_rf, model_schaefer_rf, inspector_schaefer_rf = run_cross_validation(
    X=X_parcellations,
    y='sex',
    data=df_full_parcellations,
    search_params=search_params_rf,
    model=creator_rf,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)

  warn_with_log(

  warn_with_log(



In [11]:
# Per-fold cross-validation scores of the Random Forest on the parcellation features.
scores_schaefer_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.521089,0.010738,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.791853,0.90247,0.793103,0.905036,695,232,0,0,bc7087515161a73a5a6aff57863f3803
1,0.527003,0.010851,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.799844,0.897214,0.797414,0.900719,695,232,0,1,bc7087515161a73a5a6aff57863f3803
2,0.523056,0.010754,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.765222,0.907128,0.771552,0.907914,695,232,0,2,bc7087515161a73a5a6aff57863f3803
3,0.524301,0.010853,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.771793,0.899008,0.779221,0.900862,696,231,0,3,bc7087515161a73a5a6aff57863f3803


In [12]:
# Per-fold cross-validation scores of the Random Forest on the histogram features.
scores_hists_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.463581,0.011112,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.688076,0.803923,0.689655,0.810072,695,232,0,0,bc7087515161a73a5a6aff57863f3803
1,0.468164,0.012087,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.598572,0.819085,0.594828,0.825899,695,232,0,1,bc7087515161a73a5a6aff57863f3803
2,0.460758,0.010907,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.689546,0.822837,0.698276,0.82446,695,232,0,2,bc7087515161a73a5a6aff57863f3803
3,0.456462,0.010723,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.691283,0.810119,0.701299,0.814655,696,231,0,3,bc7087515161a73a5a6aff57863f3803


In [13]:
# Extra Trees: z-score the features, then fit an Extra Trees classifier with
# fixed hyperparameters (100 trees, depth capped at 5).
creator_et = PipelineCreator(problem_type="classification")
creator_et.add("zscore")
creator_et.add(
    "et",
    max_depth=5,
    n_estimators=100,
    random_state=42,  # fixed seed so the randomised trees are the same on every run
)

# NOTE(review): every hyperparameter above is a single fixed value, so this
# grid search appears to have nothing to tune -- confirm whether a parameter
# grid was intended or whether search_params can be dropped.
search_params_et = {
    "kind": "grid",
    "cv": 4
}

# Metrics reported for every outer fold.
scoring = ["balanced_accuracy", "accuracy"]

# Extra Trees on histograms: predict 'sex' from the histogram feature columns.
scores_hists_et, model_hists_et, inspector_hists_et = run_cross_validation(
    X=X_histograms,
    y='sex',
    data=df_full_histograms,
    search_params=search_params_et,
    model=creator_et,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)

# Extra Trees on parcellations: same pipeline, parcellation feature columns.
scores_schaefer_et, model_schaefer_et, inspector_schaefer_et = run_cross_validation(
    X=X_parcellations,
    y='sex',
    data=df_full_parcellations,
    search_params=search_params_et,
    model=creator_et,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)


  warn_with_log(

  warn_with_log(



In [14]:
# Per-fold cross-validation scores of the Extra Trees model on the parcellation features.
scores_schaefer_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.118015,0.010876,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.726509,0.886301,0.728448,0.890647,695,232,0,0,bc7087515161a73a5a6aff57863f3803
1,0.116174,0.010819,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.741913,0.88131,0.737069,0.88777,695,232,0,1,bc7087515161a73a5a6aff57863f3803
2,0.116042,0.010854,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.76906,0.909791,0.780172,0.910791,695,232,0,2,bc7087515161a73a5a6aff57863f3803
3,0.115903,0.010838,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.766034,0.88244,0.774892,0.885057,696,231,0,3,bc7087515161a73a5a6aff57863f3803


In [15]:
# Per-fold cross-validation scores of the Extra Trees model on the histogram features.
scores_hists_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.117391,0.011095,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.674918,0.785949,0.676724,0.794245,695,232,0,0,bc7087515161a73a5a6aff57863f3803
1,0.114787,0.010847,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.625567,0.800291,0.62069,0.810072,695,232,0,1,bc7087515161a73a5a6aff57863f3803
2,0.114514,0.010816,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.68853,0.800143,0.702586,0.802878,695,232,0,2,bc7087515161a73a5a6aff57863f3803
3,0.114413,0.010881,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.675136,0.794742,0.688312,0.800287,696,231,0,3,bc7087515161a73a5a6aff57863f3803


In [16]:
# Gradient Boosting: z-score the features, then fit a gradient boosting
# classifier with a fixed learning rate of 0.02.
creator_gb = PipelineCreator(problem_type="classification")
creator_gb.add("zscore")
creator_gb.add(
    "gradientboost",
    learning_rate=0.02,
    random_state=42,  # fixed seed so the fitted ensemble is the same on every run
)

# NOTE(review): the only hyperparameter above is a single fixed value, so this
# grid search appears to have nothing to tune -- confirm whether a parameter
# grid was intended or whether search_params can be dropped.
search_params_gb = {
    "kind": "grid",
    "cv": 4
}

# Metrics reported for every outer fold.
scoring = ["balanced_accuracy", "accuracy"]

# Gradient Boosting on histograms: predict 'sex' from the histogram feature columns.
scores_hists_gb, model_hists_gb, inspector_hists_gb = run_cross_validation(
    X=X_histograms,
    y='sex',
    data=df_full_histograms,
    search_params=search_params_gb,
    model=creator_gb,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)

# Gradient Boosting on parcellations: same pipeline, parcellation feature columns.
scores_schaefer_gb, model_schaefer_gb, inspector_schaefer_gb = run_cross_validation(
    X=X_parcellations,
    y='sex',
    data=df_full_parcellations,
    search_params=search_params_gb,
    model=creator_gb,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)


  warn_with_log(

  warn_with_log(



In [17]:
# Per-fold cross-validation scores of the Gradient Boosting model on the parcellation features.
scores_schaefer_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,13.918181,0.010311,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.827535,0.953694,0.827586,0.953957,695,232,0,0,bc7087515161a73a5a6aff57863f3803
1,14.030294,0.009397,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.794527,0.942432,0.793103,0.943885,695,232,0,1,bc7087515161a73a5a6aff57863f3803
2,14.048417,0.010461,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.784639,0.953833,0.788793,0.953957,695,232,0,2,bc7087515161a73a5a6aff57863f3803
3,13.944039,0.009134,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.778117,0.953571,0.78355,0.954023,696,231,0,3,bc7087515161a73a5a6aff57863f3803


In [18]:
# Per-fold cross-validation scores of the Gradient Boosting model on the histogram features.
scores_hists_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,12.960234,0.010363,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.705471,0.872893,0.706897,0.876259,695,232,0,0,bc7087515161a73a5a6aff57863f3803
1,12.375866,0.009247,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.588607,0.876383,0.586207,0.879137,695,232,0,1,bc7087515161a73a5a6aff57863f3803
2,12.445215,0.009165,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.688643,0.885833,0.689655,0.886331,695,232,0,2,bc7087515161a73a5a6aff57863f3803
3,12.55488,0.010314,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.710253,0.874405,0.714286,0.876437,696,231,0,3,bc7087515161a73a5a6aff57863f3803


In [19]:
# Tag every score table with a 'model' column naming its model / feature-set
# combination, so the tables can be told apart once they are plotted together.
model_labels = (
    (scores_schaefer_svm, 'AOMIC_Schaefer_SVM'),
    (scores_hists_svm, 'AOMIC_Histograms_SVM'),
    (scores_schaefer_rf, 'AOMIC_Schaefer_RF'),
    (scores_hists_rf, 'AOMIC_Histograms_RF'),
    (scores_schaefer_et, 'AOMIC_Schaefer_ET'),
    (scores_hists_et, 'AOMIC_Histograms_ET'),
    (scores_schaefer_gb, 'AOMIC_Schaefer_GB'),
    (scores_hists_gb, 'AOMIC_Histograms_GB'),
)
for score_table, label in model_labels:
    score_table['model'] = label

# TODO: XGBoost is currently disabled. When its score tables exist, add
# (scores_schaefer_xgb, 'AOMIC_Schaefer_XGB') and
# (scores_hists_xgb, 'AOMIC_Histograms_XGB') to model_labels.


In [21]:
# Plot the score tables of all eight model / feature-set combinations together.
# The XGBoost tables (scores_schaefer_xgb, scores_hists_xgb) are left out for now.
plot_scores(
    scores_schaefer_svm,
    scores_hists_svm,
    scores_schaefer_rf,
    scores_hists_rf,
    scores_schaefer_et,
    scores_hists_et,
    scores_schaefer_gb,
    scores_hists_gb,
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'2743518e-424a-4bbe-8aad-8e596c60a5f8': {'version…