In [1]:
from junifer.storage import HDF5FeatureStorage
from julearn.api import run_cross_validation
from julearn.pipeline import PipelineCreator
from julearn.viz import plot_scores
from julearn.stats.corrected_ttest import corrected_ttest
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
import numpy as np

It is highly recommended to configure Git before using DataLad. Set both 'user.name' and 'user.email' configuration variables.


In [3]:
# Open the HDF5 store read-only and pull out both tables before it closes.
with pd.HDFStore(
    '/home/hsreekri/Julearn_predictions/data/IXI_HistogramGMVdata.h5',
    mode='r',
) as store:
    df_bins = store.get('bins')
    df_hists = store.get('histograms')

# Label the index 'subject' and turn it into an ordinary first column,
# so it can serve as the merge key with the demographics table.
df_hists = df_hists.rename_axis('subject').reset_index()


In [4]:
# Load the demographics CSV and expose the ID column under the name 'subject'.
df_demographics = (
    pd.read_csv(
        '/home/hsreekri/Julearn_predictions/data/IXI_demograpic_data.csv',
        sep=',',
    )
    .rename(columns={"IXI_ID": "subject"})
)
# Prefix every ID so it has the same 'sub-IXI<id>' form as the histogram table.
# NOTE(review): the ID is interpolated as-is (no zero padding) - confirm this
# matches the subject labels in the histogram store, otherwise the later inner
# merge silently drops the non-matching subjects.
df_demographics['subject'] = df_demographics['subject'].map(lambda x: f'sub-IXI{x}')

In [5]:
# Column labels are cast to str so that they can be passed by name as features.
df_hists.columns = df_hists.columns.astype(str)

# Position 0 holds the 'subject' column created by reset_index(); the feature
# list is taken from positions 1 to 99.
# NOTE(review): this keeps at most 99 columns - if the store holds 100
# histogram bins the last one is left out; confirm this is intended.
X_hists = df_hists.columns[1:100].tolist()

# Inner join of histogram features and demographics on the subject label.
df_full_histograms = pd.merge(df_hists, df_demographics, on="subject")


In [54]:
# Discard every row that has a missing value in any column.
# NOTE(review): this also drops subjects whose only gap is in a demographic
# column that the model never uses - confirm that is the intended sample.
df_full_histograms = df_full_histograms.dropna(axis=0, how="any")

In [27]:
# Search range handed to the SVM's C hyperparameter (lower, upper, distribution).
svm_c_range = (1, 100, "log-uniform")

# Classification pipeline: z-score the features, then fit an SVM whose C is
# tuned over the range above.
creator = PipelineCreator(problem_type="classification")
creator.add("zscore")
creator.add("svm", C=svm_c_range)

<julearn.pipeline.pipeline_creator.PipelineCreator at 0x7fa9cc711e10>

In [28]:
# Hyperparameter search settings: Optuna as the search backend, cv=4 for the search.
search_params = dict(kind="optuna", cv=4)

# Metrics collected for every outer fold (train and test).
# NOTE(review): three of these are regression error metrics, while the pipeline
# is a classifier on the labels 1/2 - confirm they are meaningful here.
scoring = [
    "neg_mean_absolute_error",
    "neg_mean_squared_error",
    "neg_mean_absolute_percentage_error",
    "balanced_accuracy",
]

# NOTE(review): the inspector variable is named '*_ridge' although the model
# added to the creator is "svm" - kept unchanged because other cells may use it.
scores_hists, model_hists, inspector_hists_ridge = run_cross_validation(
    data=df_full_histograms,
    X=X_hists,
    y='SEX_ID (1=m, 2=f)',
    model=creator,
    search_params=search_params,
    cv=4,
    scoring=scoring,
    return_train_score=True,
    return_inspector=True,
)

  warn_with_log(

  pipeline = search(  # type: ignore

  new_object = klass(**new_object_params)

[I 2024-08-15 01:21:19,706] A new study created in memory with name: no-name-c1cc4073-9cbc-47d3-ae93-5d4ea417ba8a
[I 2024-08-15 01:21:19,759] Trial 0 finished with value: 0.5854166666666667 and parameters: {'svm__C': 43}. Best is trial 0 with value: 0.5854166666666667.
[I 2024-08-15 01:21:19,807] Trial 1 finished with value: 0.5853070175438597 and parameters: {'svm__C': 8}. Best is trial 0 with value: 0.5854166666666667.
[I 2024-08-15 01:21:19,853] Trial 2 finished with value: 0.6063048245614036 and parameters: {'svm__C': 6}. Best is trial 2 with value: 0.6063048245614036.
[I 2024-08-15 01:21:19,898] Trial 3 finished with value: 0.621984649122807 and parameters: {'svm__C': 2}. Best is trial 3 with value: 0.621984649122807.
[I 2024-08-15 01:21:19,951] Trial 4 finished with value: 0.5827576754385965 and parameters: {'svm__C': 86}. Best is trial 3 with value: 0.621984649122807.
[I 2024-08-15

In [30]:
# Display the per-fold score table returned by run_cross_validation.
scores_hists

Unnamed: 0,fit_time,score_time,estimator,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_mean_absolute_percentage_error,train_neg_mean_absolute_percentage_error,test_balanced_accuracy,train_balanced_accuracy,n_train,n_test,repeat,fold,cv_mdsum
0,0.494356,0.005965,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",-0.377953,-0.204724,-0.377953,-0.204724,-0.311024,-0.173228,0.62699,0.77359,381,127,0,0,bc7087515161a73a5a6aff57863f3803
1,0.4953,0.005748,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",-0.338583,-0.139108,-0.338583,-0.139108,-0.251969,-0.10105,0.639096,0.861086,381,127,0,1,bc7087515161a73a5a6aff57863f3803
2,0.473014,0.005665,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",-0.330709,-0.15748,-0.330709,-0.15748,-0.240157,-0.133858,0.663974,0.834336,381,127,0,2,bc7087515161a73a5a6aff57863f3803
3,0.488479,0.005797,"OptunaSearchCV(cv=KFold(n_splits=4, random_sta...",-0.440945,-0.191601,-0.440945,-0.191601,-0.358268,-0.149606,0.542982,0.802087,381,127,0,3,bc7087515161a73a5a6aff57863f3803


In [31]:
# The scorers are negated errors, so abs() turns the fold average back into a
# positive error value.
mae_hists = abs(scores_hists["test_neg_mean_absolute_error"].mean())
mape_hists = abs(scores_hists["test_neg_mean_absolute_percentage_error"].mean())

# NOTE(review): the label says "KRR" but the pipeline in this notebook uses
# "svm" - confirm and relabel when the cell is next re-run.
print('MAE for Histograms (KRR)', mae_hists)
print(mape_hists)

MAE for Histograms (KRR) 0.3720472440944882
0.2903543307086614


In [32]:
# Render julearn's score plot for the cross-validation results.
plot_scores(scores_hists)

BokehModel(combine_events=True, render_bundle={'docs_json': {'ac615060-3278-41a0-b51a-0d4859a6b5a0': {'version…

In [16]:
def return_preds(*inspectors, pred_column="repeat0_p0"):
    """Plot true versus predicted targets, one panel per inspector.

    For every inspector the fold-wise predictions are fetched with
    ``inspector.folds.predict()``; the ``'target'`` column is plotted against
    the ``pred_column`` column. Each panel shows a red identity line and is
    annotated with the mean absolute error and the Pearson correlation between
    predictions and targets.

    Parameters
    ----------
    *inspectors
        One or more objects exposing ``folds.predict()`` (the inspectors
        returned by ``run_cross_validation(..., return_inspector=True)``).
    pred_column : str, keyword-only
        Name of the prediction column to plot. Defaults to ``"repeat0_p0"``,
        the column this function has always used.

    Returns
    -------
    None
        Despite its name, the function only draws and shows the figure.

    Raises
    ------
    ValueError
        If no inspector is given.
    """
    if not inspectors:
        raise ValueError("return_preds() needs at least one inspector to plot.")

    n = len(inspectors)

    # The style has to be active before the axes exist: seaborn styles only
    # affect axes that are created afterwards.
    sns.set_style("darkgrid")

    # squeeze=False always yields a 2-D array of axes, so the single-panel
    # case needs no special handling.
    fig, axes = plt.subplots(1, n, figsize=(10 * n, 7), squeeze=False)

    for i, (ax, inspector) in enumerate(zip(axes.ravel(), inspectors)):
        fold_predictions = inspector.folds.predict()
        y_true = fold_predictions['target']
        y_pred = fold_predictions[pred_column]

        mae = mean_absolute_error(y_true, y_pred)
        corr = np.corrcoef(y_pred, y_true)[1, 0]

        ax.scatter(y_true, y_pred)
        # Identity line: points on it are predicted exactly.
        ax.plot(y_true, y_true, color='red')
        ax.set(xlabel="True values", ylabel="Predicted values")
        ax.set_title(f"Actual vs Predicted ({i + 1})")

        # Anchor the annotation in axes coordinates so it stays in the top
        # right corner whatever the data range is (including zero or negative
        # axis limits, where scaling the data limit would misplace it).
        ax.text(
            0.99,
            0.99,
            f"MAE: {mae:.2f}   CORR: {corr:.2f}",
            transform=ax.transAxes,
            verticalalignment="top",
            horizontalalignment="right",
            fontsize=12,
        )

    fig.tight_layout()
    plt.show()