In [14]:
from junifer.storage import HDF5FeatureStorage
from julearn.api import run_cross_validation
from julearn.pipeline import PipelineCreator
from julearn.viz import plot_scores
from julearn.stats.corrected_ttest import corrected_ttest
import pandas as pd
import seaborn as sns
from sklearn.svm import LinearSVC
import warnings


In [15]:
# Handle to the junifer HDF5 feature storage that the loading cell reads from.
features_uri = './data/AOMIC_Parcels_sch100x7.hdf5'
storage = HDF5FeatureStorage(uri=features_uri)

In [16]:
# Feature tables, looked up in the storage by their feature names.
df_parcellations = storage.read_df('VBM_GM_Schaefer100x7_mean_aggregation')
df_histograms = storage.read_df('VBM_GM_Histogram_100bins_IXI_hist')

# Demographics come from a tab-separated file. Its ID column is renamed to
# "subject", which is the key later used to merge it with the feature tables.
df_demographics = pd.read_csv('./data/participants.tsv', sep='\t').rename(
    columns={"participant_id": "subject"}
)

In [17]:
# Discard every row that contains at least one missing feature value.
df_parcellations = df_parcellations.dropna()
df_histograms = df_histograms.dropna()


In [18]:
# Cast all feature column labels to strings
# (presumably required by julearn's column handling -- TODO confirm).
for feature_table in (df_parcellations, df_histograms):
    feature_table.columns = feature_table.columns.astype(str)

# Feature names passed as `X` to the cross-validation calls
X_parcellations = df_parcellations.columns.tolist()
X_histograms = df_histograms.columns.tolist()

# Attach the demographics (which hold the 'sex' target) to each feature table.
# `pd.merge` defaults to an inner join, so only subjects present on both
# sides are kept.
df_full_parcellations = pd.merge(df_parcellations, df_demographics, on="subject")
df_full_histograms = pd.merge(df_histograms, df_demographics, on="subject")


In [19]:
# --- SVM (Support Vector Machine) ---
# Pipeline: z-score the features, then an SVM whose C is sampled from a
# log-uniform range during the hyperparameter search.
creator_svm = PipelineCreator(problem_type="classification")
creator_svm.add("zscore")
creator_svm.add("svm", C=(0.001, 100, "log-uniform"))

# Hyperparameter search settings: Optuna, with a 4-fold inner CV.
search_params_svm = dict(kind="optuna", cv=4)

scoring = ["balanced_accuracy", "accuracy"]

# Keyword arguments shared by both feature sets; only `X` and `data` differ.
svm_cv_kwargs = dict(
    y='sex',
    search_params=search_params_svm,
    model=creator_svm,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)

# SVM on histograms
scores_hists_svm, model_hists_svm, inspector_hists_svm = run_cross_validation(
    X=X_histograms,
    data=df_full_histograms,
    **svm_cv_kwargs,
)

# SVM on parcellations
scores_schaefer_svm, model_schaefer_svm, inspector_schaefer_svm = run_cross_validation(
    X=X_parcellations,
    data=df_full_parcellations,
    **svm_cv_kwargs,
)





  warn_with_log(

  pipeline = search(  # type: ignore

  new_object = klass(**new_object_params)

[I 2024-09-26 02:35:03,624] A new study created in memory with name: no-name-491a4c94-3ce9-4d31-990b-b6952fe0cb5a
[I 2024-09-26 02:35:03,718] Trial 0 finished with value: 0.6198224852071006 and parameters: {'svm__C': 0.05038245494732611}. Best is trial 0 with value: 0.6198224852071006.
[I 2024-09-26 02:35:03,804] Trial 1 finished with value: 0.7085798816568047 and parameters: {'svm__C': 16.451692550763674}. Best is trial 1 with value: 0.7085798816568047.
[I 2024-09-26 02:35:03,889] Trial 2 finished with value: 0.6553254437869822 and parameters: {'svm__C': 0.5998779564994481}. Best is trial 1 with value: 0.7085798816568047.
[I 2024-09-26 02:35:03,972] Trial 3 finished with value: 0.6849112426035503 and parameters: {'svm__C': 1.724353175425233}. Best is trial 1 with value: 0.7085798816568047.
[I 2024-09-26 02:35:04,057] Trial 4 finished with value: 0.6479289940828402 and parameters: {'svm__

In [20]:
# Per-fold cross-validation scores of the SVM on the Schaefer parcellation features.
scores_schaefer_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.966307,0.009171,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.844656,0.928064,0.840708,0.927515,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.791481,0.009109,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.791722,0.895604,0.787611,0.89645,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.855959,0.00975,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.845809,0.964668,0.84,0.964549,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.861752,0.008819,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.790969,0.997036,0.791111,0.997046,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [21]:
# Per-fold cross-validation scores of the SVM on the histogram features.
scores_hists_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.910812,0.009879,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.74669,0.843594,0.743363,0.844675,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.850276,0.009914,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.744129,0.788462,0.734513,0.794379,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.870973,0.010133,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.714268,0.764122,0.728889,0.76514,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.855889,0.009235,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.666443,0.906749,0.666667,0.908419,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [22]:
# --- Random Forest ---
# Random forests are randomized; without a fixed seed every re-run of this
# cell yields different scores. The seed is handed to julearn, which seeds
# the random number generator before doing any work.
RANDOM_SEED = 42

# Pipeline: z-score the features, then a random forest with fixed
# hyperparameters (no ranges, so nothing is tuned).
creator_rf = PipelineCreator(problem_type="classification")
creator_rf.add("zscore")
creator_rf.add(
    "rf",
    max_depth=4,
    n_estimators=100,
)

# NOTE(review): the pipeline above defines no hyperparameter ranges, and the
# recorded `estimator` column shows a plain pipeline rather than a search
# object, so these search settings appear to have no effect. Kept so the
# call stays unchanged -- confirm whether they are still needed.
search_params_rf = {
    "kind": "grid",
    "cv": 4
}

# Keyword arguments shared by both feature sets; only `X` and `data` differ.
# `scoring` is the list defined in the SVM cell.
rf_cv_kwargs = dict(
    y='sex',
    search_params=search_params_rf,
    model=creator_rf,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=RANDOM_SEED,
)

# Random Forest on histograms
scores_hists_rf, model_hists_rf, inspector_hists_rf = run_cross_validation(
    X=X_histograms,
    data=df_full_histograms,
    **rf_cv_kwargs,
)

# Random Forest on parcellations
scores_schaefer_rf, model_schaefer_rf, inspector_schaefer_rf = run_cross_validation(
    X=X_parcellations,
    data=df_full_parcellations,
    **rf_cv_kwargs,
)

  warn_with_log(

  warn_with_log(



In [23]:
# Per-fold cross-validation scores of the Random Forest on the Schaefer parcellation features.
scores_schaefer_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.214936,0.006058,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.767733,0.857535,0.769912,0.859467,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.215772,0.006,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.733841,0.854396,0.725664,0.860947,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.215945,0.006087,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.744428,0.860832,0.746667,0.861152,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.222463,0.00643,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.723965,0.860751,0.724444,0.864106,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [24]:
# Per-fold cross-validation scores of the Random Forest on the histogram features.
scores_hists_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.208733,0.006243,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.703263,0.796386,0.707965,0.798817,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.210787,0.007654,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.688094,0.774267,0.676991,0.784024,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.211355,0.005964,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.657098,0.775574,0.675556,0.776957,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.208493,0.006121,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.657712,0.804977,0.657778,0.809453,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [25]:
# --- Extra Trees Classifier ---
# Extra Trees is a randomized ensemble; without a fixed seed every re-run of
# this cell yields different scores. The seed is handed to julearn, which
# seeds the random number generator before doing any work.
RANDOM_SEED = 42

# Pipeline: z-score the features, then an Extra Trees classifier with fixed
# hyperparameters (no ranges, so nothing is tuned).
creator_et = PipelineCreator(problem_type="classification")
creator_et.add("zscore")
creator_et.add(
    "et",
    max_depth=5,
    n_estimators=100,
)

# NOTE(review): the pipeline above defines no hyperparameter ranges, and the
# recorded `estimator` column shows a plain pipeline rather than a search
# object, so these search settings appear to have no effect. Kept so the
# call stays unchanged -- confirm whether they are still needed.
search_params_et = {
    "kind": "grid",
    "cv": 4
}

scoring = ["balanced_accuracy", "accuracy"]

# Keyword arguments shared by both feature sets; only `X` and `data` differ.
et_cv_kwargs = dict(
    y='sex',
    search_params=search_params_et,
    model=creator_et,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=RANDOM_SEED,
)

# Extra Trees on histograms
scores_hists_et, model_hists_et, inspector_hists_et = run_cross_validation(
    X=X_histograms,
    data=df_full_histograms,
    **et_cv_kwargs,
)

# Extra Trees on parcellations
scores_schaefer_et, model_schaefer_et, inspector_schaefer_et = run_cross_validation(
    X=X_parcellations,
    data=df_full_parcellations,
    **et_cv_kwargs,
)


  warn_with_log(

  warn_with_log(



In [26]:
# Per-fold cross-validation scores of Extra Trees on the Schaefer parcellation features.
scores_schaefer_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.069964,0.006029,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.746375,0.814722,0.752212,0.818047,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.069368,0.006024,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.671758,0.799679,0.659292,0.810651,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.069306,0.006087,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.750727,0.812948,0.76,0.813885,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.069426,0.006052,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.706068,0.831684,0.706667,0.837518,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [27]:
# Per-fold cross-validation scores of Extra Trees on the histogram features.
scores_hists_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.07059,0.006383,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.703815,0.736561,0.712389,0.741124,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.068737,0.006002,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.660567,0.730998,0.646018,0.745562,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.069035,0.00609,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.649104,0.742616,0.675556,0.744461,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.068738,0.0061,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.666127,0.747386,0.666667,0.757755,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [28]:
# --- Gradient Boosting Classifier ---
# Fix the seed so that re-running this cell reproduces the same scores. It is
# handed to julearn, which seeds the random number generator before doing
# any work.
RANDOM_SEED = 42

# Pipeline: z-score the features, then gradient boosting with a fixed
# learning rate (no ranges, so nothing is tuned).
creator_gb = PipelineCreator(problem_type="classification")
creator_gb.add("zscore")
creator_gb.add(
    "gradientboost",
    learning_rate=0.02,
)

# NOTE(review): the pipeline above defines no hyperparameter ranges, and the
# recorded `estimator` column shows a plain pipeline rather than a search
# object, so these search settings appear to have no effect. Kept so the
# call stays unchanged -- confirm whether they are still needed.
search_params_gb = {
    "kind": "grid",
    "cv": 4
}

scoring = ["balanced_accuracy", "accuracy"]

# Keyword arguments shared by both feature sets; only `X` and `data` differ.
gb_cv_kwargs = dict(
    y='sex',
    search_params=search_params_gb,
    model=creator_gb,
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
    seed=RANDOM_SEED,
)

# Gradient Boosting on histograms
scores_hists_gb, model_hists_gb, inspector_hists_gb = run_cross_validation(
    X=X_histograms,
    data=df_full_histograms,
    **gb_cv_kwargs,
)

# Gradient Boosting on parcellations
scores_schaefer_gb, model_schaefer_gb, inspector_schaefer_gb = run_cross_validation(
    X=X_parcellations,
    data=df_full_parcellations,
    **gb_cv_kwargs,
)


  warn_with_log(

  warn_with_log(



In [29]:
# Per-fold cross-validation scores of Gradient Boosting on the Schaefer parcellation features.
scores_schaefer_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,1.418531,0.003868,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.813682,0.915254,0.814159,0.91568,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,1.419739,0.003833,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.728226,0.91163,0.721239,0.914201,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,1.40538,0.005138,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.801357,0.906795,0.808889,0.906942,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,1.401479,0.003859,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.715194,0.888452,0.715556,0.890694,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [30]:
# Per-fold cross-validation scores of Gradient Boosting on the histogram features.
scores_hists_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,1.360264,0.005504,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.653531,0.828978,0.650442,0.829882,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,1.358902,0.003939,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.696497,0.829899,0.685841,0.837278,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,1.345851,0.003889,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.675509,0.848617,0.684444,0.849335,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,1.372974,0.005106,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.65305,0.84157,0.653333,0.844904,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [31]:
# The XGB score tables are not computed in this notebook; they are read from
# CSV files (presumably produced by a separate run -- TODO document provenance).
scores_hists_xgb, scores_schaefer_xgb = (
    pd.read_csv(csv_path)
    for csv_path in ('XGB_scores_hists.csv', 'XGB_scores_shaefer.csv')
)

In [34]:
# XGB score table for the Schaefer parcellation features, as loaded from CSV.
scores_schaefer_xgb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum,model
0,0.284266,0.016954,"Pipeline(steps=[('set_column_types', SetColumn...",0.832204,0.953287,0.831858,0.952663,676,226,0,0,bc7087515161a73a5a6aff57863f3803,AOMIC_Schaefer_XGB
1,0.248129,0.016681,"Pipeline(steps=[('set_column_types', SetColumn...",0.766983,0.956731,0.761062,0.957101,676,226,0,1,bc7087515161a73a5a6aff57863f3803,AOMIC_Schaefer_XGB
2,0.247478,0.016317,"Pipeline(steps=[('set_column_types', SetColumn...",0.80281,0.937924,0.804444,0.937962,677,225,0,2,bc7087515161a73a5a6aff57863f3803,AOMIC_Schaefer_XGB
3,0.250304,0.016637,"Pipeline(steps=[('set_column_types', SetColumn...",0.764143,0.937414,0.764444,0.937962,677,225,0,3,bc7087515161a73a5a6aff57863f3803,AOMIC_Schaefer_XGB


In [35]:
# XGB score table for the histogram features, as loaded from CSV.
scores_hists_xgb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum,model
0,0.216994,0.016667,"Pipeline(steps=[('set_column_types', SetColumn...",0.699322,0.851143,0.699115,0.852071,676,226,0,0,bc7087515161a73a5a6aff57863f3803,AOMIC_Histograms_XGB
1,0.216064,0.017305,"Pipeline(steps=[('set_column_types', SetColumn...",0.704429,0.839057,0.69469,0.844675,676,226,0,1,bc7087515161a73a5a6aff57863f3803,AOMIC_Histograms_XGB
2,0.218959,0.016462,"Pipeline(steps=[('set_column_types', SetColumn...",0.648256,0.869482,0.657778,0.870015,677,225,0,2,bc7087515161a73a5a6aff57863f3803,AOMIC_Histograms_XGB
3,0.215866,0.017488,"Pipeline(steps=[('set_column_types', SetColumn...",0.635232,0.853256,0.635556,0.856721,677,225,0,3,bc7087515161a73a5a6aff57863f3803,AOMIC_Histograms_XGB


In [32]:
# Tag every score table with a "<dataset>_<features>_<model>" label in its
# 'model' column so the tables can be told apart when plotted together.
model_labels = {
    'AOMIC_Schaefer_SVM': scores_schaefer_svm,
    'AOMIC_Histograms_SVM': scores_hists_svm,
    'AOMIC_Schaefer_RF': scores_schaefer_rf,
    'AOMIC_Histograms_RF': scores_hists_rf,
    'AOMIC_Schaefer_ET': scores_schaefer_et,
    'AOMIC_Histograms_ET': scores_hists_et,
    'AOMIC_Schaefer_GB': scores_schaefer_gb,
    'AOMIC_Histograms_GB': scores_hists_gb,
    'AOMIC_Schaefer_XGB': scores_schaefer_xgb,
    'AOMIC_Histograms_XGB': scores_hists_xgb,
}
for label, fold_scores in model_labels.items():
    fold_scores['model'] = label


In [33]:
# Compare all models in one plot: for each classifier, the parcellation-based
# scores come first, followed by the histogram-based scores.
all_model_scores = [
    scores_schaefer_svm,
    scores_hists_svm,
    scores_schaefer_rf,
    scores_hists_rf,
    scores_schaefer_et,
    scores_hists_et,
    scores_schaefer_gb,
    scores_hists_gb,
    scores_schaefer_xgb,
    scores_hists_xgb,
]
plot_scores(*all_model_scores)

BokehModel(combine_events=True, render_bundle={'docs_json': {'f432b88a-8839-45db-8d51-409bd8759c20': {'version…