In [1]:
from junifer.storage import HDF5FeatureStorage
from julearn.api import run_cross_validation
from julearn.pipeline import PipelineCreator
from julearn.viz import plot_scores
from julearn.stats.corrected_ttest import corrected_ttest
import pandas as pd
import seaborn as sns
from sklearn.svm import LinearSVC
import warnings


It is highly recommended to configure Git before using DataLad. Set both 'user.name' and 'user.email' configuration variables.


In [2]:
# Handle to the junifer HDF5 feature storage; the feature tables used in this
# notebook are read from it. The path is relative to the working directory.
storage = HDF5FeatureStorage(uri='./data/AOMIC_Parcels_sch100x7.hdf5')

In [3]:
# Feature tables are read from the junifer storage by feature name.
df_parcellations = storage.read_df('VBM_GM_Schaefer100x7_mean_aggregation')
df_histograms = storage.read_df('VBM_GM_Histogram_100bins_IXI_hist')

# Demographics come from the tab-separated participants file. The participant
# key is renamed to "subject" because that is the key the feature tables are
# merged on later.
df_demographics = pd.read_csv('./data/participants.tsv', sep='\t').rename(
    columns={"participant_id": "subject"}
)

In [4]:
# Discard every row that has at least one missing value in either feature table.
df_parcellations = df_parcellations.dropna()
df_histograms = df_histograms.dropna()


In [5]:
# Force the column labels of both feature tables to plain strings.
# NOTE(review): presumably needed so the column names can be passed as X to
# julearn — confirm.
for _frame in (df_parcellations, df_histograms):
    _frame.columns = _frame.columns.astype(str)

# Feature names per table, captured before the demographic columns are merged
# in so that only the feature columns end up in X.
X_parcellations = df_parcellations.columns.tolist()
X_histograms = df_histograms.columns.tolist()

# Attach the demographics to each feature table, matching rows on 'subject'.
df_full_parcellations = pd.merge(df_parcellations, df_demographics, on="subject")
df_full_histograms = pd.merge(df_histograms, df_demographics, on="subject")


In [6]:
# For SVM (Support Vector Machine)
# Pipeline: z-score the features, then an SVM whose C is given as a
# (low, high, distribution) tuple so that it is searched rather than fixed.
creator_svm = PipelineCreator(problem_type="classification")
creator_svm.add("zscore")
creator_svm.add("svm", C=(0.001, 100, "log-uniform"))

# Hyperparameter search settings: optuna-based search with a 4-fold inner CV.
search_params_svm = dict(kind="optuna", cv=4)

# Metrics reported for every fold; later cells reuse this name.
scoring = ["balanced_accuracy", "accuracy"]

# Everything the two runs below have in common.
_svm_cv_kwargs = dict(
    y='sex',
    model=creator_svm,
    search_params=search_params_svm,
    cv=4,
    scoring=scoring,
    return_train_score=True,
    return_inspector=True,
)

# SVM on histograms
scores_hists_svm, model_hists_svm, inspector_hists_svm = run_cross_validation(
    X=X_histograms, data=df_full_histograms, **_svm_cv_kwargs
)

# SVM on parcellations
scores_schaefer_svm, model_schaefer_svm, inspector_schaefer_svm = run_cross_validation(
    X=X_parcellations, data=df_full_parcellations, **_svm_cv_kwargs
)





  warn_with_log(

  pipeline = search(  # type: ignore

  new_object = klass(**new_object_params)

[I 2024-09-25 15:58:36,418] A new study created in memory with name: no-name-991a77b5-8178-4759-9be0-7672b3f9dc8d
[I 2024-09-25 15:58:36,496] Trial 0 finished with value: 0.7071005917159764 and parameters: {'svm__C': 5.288582263013134}. Best is trial 0 with value: 0.7071005917159764.
[I 2024-09-25 15:58:36,572] Trial 1 finished with value: 0.6538461538461539 and parameters: {'svm__C': 0.5432249799570605}. Best is trial 0 with value: 0.7071005917159764.
[I 2024-09-25 15:58:36,656] Trial 2 finished with value: 0.47633136094674555 and parameters: {'svm__C': 0.014335881112944648}. Best is trial 0 with value: 0.7071005917159764.
[I 2024-09-25 15:58:36,783] Trial 3 finished with value: 0.6153846153846154 and parameters: {'svm__C': 0.030068544921613663}. Best is trial 0 with value: 0.7071005917159764.
[I 2024-09-25 15:58:36,861] Trial 4 finished with value: 0.6523668639053255 and parameters: {'s

In [7]:
# Per-fold scores from the SVM cross-validation on the parcellation features.
scores_schaefer_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.901364,0.009396,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.83575,0.978028,0.831858,0.977811,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.871734,0.009344,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.802011,0.96543,0.79646,0.964497,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.824907,0.009115,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.824976,0.992629,0.822222,0.992614,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.859585,0.0088,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.786544,0.98099,0.786667,0.980798,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [8]:
# Per-fold scores from the SVM cross-validation on the histogram features.
scores_hists_svm

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.906884,0.010225,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.750079,0.801937,0.747788,0.803254,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.840432,0.009787,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.702113,0.745192,0.690265,0.754438,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.873068,0.00981,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.733769,0.783525,0.746667,0.784343,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.791691,0.009187,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",0.693031,0.843308,0.693333,0.846381,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [9]:
# For Random Forest
# Pipeline: z-score the features, then a random forest with fixed
# hyperparameters (every value below is a single scalar, not a range).
creator_rf = PipelineCreator(problem_type="classification")
creator_rf.add("zscore")
creator_rf.add(
    "rf",
    max_depth=4,
    n_estimators=100,
)

# NOTE(review): no hyperparameter above is given as a range or list, and the
# score tables shown for this model list a plain pipeline (not a search
# object) as the estimator, so this grid search appears to have nothing to
# tune. Kept as-is; confirm whether a grid was intended.
search_params_rf = {
    "kind": "grid",
    "cv": 4,
}

# Arguments shared by both runs. `scoring` is the metric list defined in the
# SVM cell. The forest is stochastic, so a fixed seed is passed to make the
# scores repeatable across kernel restarts (julearn sets the random seed
# before running when `seed` is given).
_rf_cv_kwargs = dict(
    y='sex',
    model=creator_rf,
    search_params=search_params_rf,
    cv=4,
    scoring=scoring,
    return_train_score=True,
    return_inspector=True,
    seed=42,
)

# Random Forest on histograms
scores_hists_rf, model_hists_rf, inspector_hists_rf = run_cross_validation(
    X=X_histograms, data=df_full_histograms, **_rf_cv_kwargs
)

# Random Forest on parcellations
scores_schaefer_rf, model_schaefer_rf, inspector_schaefer_rf = run_cross_validation(
    X=X_parcellations, data=df_full_parcellations, **_rf_cv_kwargs
)

  warn_with_log(

  warn_with_log(



In [10]:
# Per-fold scores from the random-forest cross-validation on the parcellation features.
scores_schaefer_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.2169,0.00596,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.757409,0.857439,0.761062,0.859467,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.214345,0.005944,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.70816,0.839973,0.699115,0.846154,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.21495,0.005917,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.77156,0.8488,0.777778,0.849335,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.2175,0.007923,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.715076,0.86041,0.715556,0.864106,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [11]:
# Per-fold scores from the random-forest cross-validation on the histogram features.
scores_hists_rf

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.211987,0.006304,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.716977,0.781481,0.721239,0.784024,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.209961,0.006038,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.663355,0.772207,0.650442,0.782544,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.211473,0.005956,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.662185,0.790347,0.684444,0.791728,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.210046,0.005929,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.662018,0.783034,0.662222,0.788774,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [12]:
# For Extra Trees Classifier
# Pipeline: z-score the features, then an extra-trees classifier with fixed
# hyperparameters (every value below is a single scalar, not a range).
creator_et = PipelineCreator(problem_type="classification")
creator_et.add("zscore")
creator_et.add(
    "et",
    max_depth=5,
    n_estimators=100,
)

# NOTE(review): no hyperparameter above is given as a range or list, and the
# score tables shown for this model list a plain pipeline (not a search
# object) as the estimator, so this grid search appears to have nothing to
# tune. Kept as-is; confirm whether a grid was intended.
search_params_et = {
    "kind": "grid",
    "cv": 4,
}

# Metrics reported for every fold.
scoring = ["balanced_accuracy", "accuracy"]

# Arguments shared by both runs. Extra-trees is stochastic, so a fixed seed
# is passed to make the scores repeatable across kernel restarts (julearn
# sets the random seed before running when `seed` is given).
_et_cv_kwargs = dict(
    y='sex',
    model=creator_et,
    search_params=search_params_et,
    cv=4,
    scoring=scoring,
    return_train_score=True,
    return_inspector=True,
    seed=42,
)

# Extra Trees on histograms
scores_hists_et, model_hists_et, inspector_hists_et = run_cross_validation(
    X=X_histograms, data=df_full_histograms, **_et_cv_kwargs
)

# Extra Trees on parcellations
scores_schaefer_et, model_schaefer_et, inspector_schaefer_et = run_cross_validation(
    X=X_parcellations, data=df_full_parcellations, **_et_cv_kwargs
)


  warn_with_log(

  warn_with_log(



In [13]:
# Per-fold scores from the extra-trees cross-validation on the parcellation features.
scores_schaefer_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.072672,0.006222,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.737468,0.825136,0.743363,0.828402,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.072485,0.006172,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.664297,0.789835,0.650442,0.801775,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.070558,0.006132,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.765019,0.817487,0.773333,0.818316,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.070059,0.006173,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.701525,0.827153,0.702222,0.833087,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [14]:
# Per-fold scores from the extra-trees cross-validation on the histogram features.
scores_hists_et

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.072912,0.006407,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.699007,0.731588,0.707965,0.736686,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,0.071343,0.00652,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.647962,0.728022,0.632743,0.742604,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,0.071816,0.006288,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.64656,0.741154,0.671111,0.742984,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,0.071123,0.006162,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.652773,0.747728,0.653333,0.757755,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [15]:
# For Gradient Boosting Classifier
# Pipeline: z-score the features, then gradient boosting with a fixed
# learning rate (a single scalar, not a range).
creator_gb = PipelineCreator(problem_type="classification")
creator_gb.add("zscore")
creator_gb.add(
    "gradientboost",
    learning_rate=0.02,
)

# NOTE(review): the learning rate is a single value, and the score tables
# shown for this model list a plain pipeline (not a search object) as the
# estimator, so this grid search appears to have nothing to tune. Kept
# as-is; confirm whether a grid was intended.
search_params_gb = {
    "kind": "grid",
    "cv": 4,
}

# Metrics reported for every fold.
scoring = ["balanced_accuracy", "accuracy"]

# Arguments shared by both runs. A fixed seed is passed so that any
# randomness in the boosted trees is repeatable across kernel restarts
# (julearn sets the random seed before running when `seed` is given).
_gb_cv_kwargs = dict(
    y='sex',
    model=creator_gb,
    search_params=search_params_gb,
    cv=4,
    scoring=scoring,
    return_train_score=True,
    return_inspector=True,
    seed=42,
)

# Gradient Boosting on histograms
scores_hists_gb, model_hists_gb, inspector_hists_gb = run_cross_validation(
    X=X_histograms, data=df_full_histograms, **_gb_cv_kwargs
)

# Gradient Boosting on parcellations
scores_schaefer_gb, model_schaefer_gb, inspector_schaefer_gb = run_cross_validation(
    X=X_parcellations, data=df_full_parcellations, **_gb_cv_kwargs
)


  warn_with_log(

  warn_with_log(



In [16]:
# Per-fold scores from the gradient-boosting cross-validation on the parcellation features.
scores_schaefer_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,1.454587,0.004089,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.81849,0.915254,0.818584,0.91568,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,1.41085,0.00387,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.728226,0.91163,0.721239,0.914201,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,1.420136,0.005157,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.797481,0.906795,0.804444,0.906942,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,1.411913,0.003791,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.719659,0.888452,0.72,0.890694,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [17]:
# Per-fold scores from the gradient-boosting cross-validation on the histogram features.
scores_hists_gb

Unnamed: 0,fit_time,score_time,estimator,test_balanced_accuracy,train_balanced_accuracy,test_accuracy,train_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,1.35095,0.003975,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.653531,0.828978,0.650442,0.829882,676,226,0,0,bc7087515161a73a5a6aff57863f3803
1,1.35893,0.005142,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.692296,0.829899,0.681416,0.837278,676,226,0,1,bc7087515161a73a5a6aff57863f3803
2,1.365598,0.003818,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.675509,0.848617,0.684444,0.849335,677,225,0,2,bc7087515161a73a5a6aff57863f3803
3,1.356142,0.003738,"(SetColumnTypes(X_types={}), StandardScaler(),...",0.644121,0.84157,0.644444,0.844904,677,225,0,3,bc7087515161a73a5a6aff57863f3803


In [18]:
# Tag every score table with a 'model' label naming its feature set and
# classifier. NOTE(review): presumably this is what plot_scores uses to tell
# the tables apart — confirm.
for _scores, _label in (
    (scores_schaefer_svm, 'AOMIC_Schaefer_SVM'),
    (scores_hists_svm, 'AOMIC_Histograms_SVM'),
    (scores_schaefer_rf, 'AOMIC_Schaefer_RF'),
    (scores_hists_rf, 'AOMIC_Histograms_RF'),
    (scores_schaefer_et, 'AOMIC_Schaefer_ET'),
    (scores_hists_et, 'AOMIC_Histograms_ET'),
    (scores_schaefer_gb, 'AOMIC_Schaefer_GB'),
    (scores_hists_gb, 'AOMIC_Histograms_GB'),
):
    _scores['model'] = _label


In [19]:
# Compare all eight score tables in one plot; the argument order is
# parcellations then histograms for each classifier (SVM, RF, ET, GB).
plot_scores(
    scores_schaefer_svm,
    scores_hists_svm,
    scores_schaefer_rf,
    scores_hists_rf,
    scores_schaefer_et,
    scores_hists_et,
    scores_schaefer_gb,
    scores_hists_gb,
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'ca4f1503-6dd1-4375-90ad-49dfd8f4caf3': {'version…

In [23]:
from xgboost import XGBClassifier

In [27]:
# Encode the target as integers (male -> 1, female -> 0) for the XGBoost run.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running the bare assignment a second time would wipe the already-encoded
# column. The dtype guard makes re-running this cell a no-op instead.
if not pd.api.types.is_numeric_dtype(df_full_parcellations['sex']):
    df_full_parcellations['sex'] = df_full_parcellations['sex'].map({'male': 1, 'female': 0})


In [28]:
# Search settings handed to julearn for the XGBoost runs.
# NOTE(review): no hyperparameter range is supplied for the XGBoost model in
# this cell, so it is unclear what the optuna search would tune — confirm.
search_params_xgb = dict(kind="optuna", cv=4)

# Metrics reported for every fold.
scoring = ["balanced_accuracy", "accuracy"]

# XGBoost on parcellations
# The estimator object is passed directly rather than through a
# PipelineCreator, so the problem type is given to run_cross_validation.
scores_schaefer_xgb, model_schaefer_xgb, inspector_schaefer_xgb = run_cross_validation(
    X=X_parcellations,
    y='sex',
    data=df_full_parcellations,
    model=XGBClassifier(tree_method="hist"),
    problem_type='classification',
    search_params=search_params_xgb,
    scoring=scoring,
    cv=4,
    return_inspector=True,
    return_train_score=True,
)


  warn_with_log(



In [None]:
# XGBoost on histograms
# Only the parcellation frame has its 'sex' column integer-encoded elsewhere
# in this notebook; the histogram frame still carries the raw labels, which
# recent XGBoost releases reject as class labels. Encode it here with the same
# mapping (male -> 1, female -> 0). The dtype guard keeps the cell safe to
# re-run: mapping an already-encoded column would turn it into NaN.
if not pd.api.types.is_numeric_dtype(df_full_histograms['sex']):
    df_full_histograms['sex'] = df_full_histograms['sex'].map({'male': 1, 'female': 0})

# `search_params_xgb` and `scoring` come from the XGBoost-on-parcellations cell.
scores_hists_xgb, model_hists_xgb, inspector_hists_xgb = run_cross_validation(
    X=X_histograms,
    y='sex',
    data=df_full_histograms,
    search_params=search_params_xgb,
    model=XGBClassifier(tree_method="hist"),
    problem_type='classification',
    return_train_score=True,
    return_inspector=True,
    cv=4,
    scoring=scoring,
)
