In [1]:
import pandas as pd
import os
import numpy as np
import gc
import torch
import warnings
warnings.filterwarnings('ignore')

# Sklearn imports
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

# Keras imports
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import backend as K

# Scikeras wrapper for Keras
from scikeras.wrappers import KerasRegressor

# LightGBM
import lightgbm as lgb

# (Hypothetical) TabPFN Regressor
# If the TabPFN package does not provide a regressor, remove or replace this import
from tabpfn import TabPFNRegressor  # Placeholder for a potential TabPFNRegressor

2025-01-24 01:46:16.067595: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-24 01:46:16.082876: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737679576.102342 2208547 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737679576.108315 2208547 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-24 01:46:16.128949: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:


###############################################################################
# MLP Model Definition
###############################################################################
def create_mlp_model(input_shape):
    """
    Create a simple MLP model for regression.
    """
    model = Sequential([
        Dense(1024, activation="relu", input_shape=(input_shape,)),
        Dropout(0.3),
        Dense(512, activation="relu"),
        Dropout(0.3),
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(128, activation="relu"),
        Dropout(0.3),
        # Final layer for regression: linear activation, 1 output
        Dense(1, activation="linear")
    ])
    model.compile(optimizer='adam', 
                  loss='mean_squared_error', 
                  metrics=['mean_absolute_error'])
    return model

###############################################################################
# CUDA / Memory Cleanup
###############################################################################
def clean_up_cuda(model):
    """
    Free up GPU memory and clear Keras session.
    """
    # Delete the Keras model
    K.clear_session()
    del model
    
    # Run garbage collection
    gc.collect()
    
    # Free CUDA memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    
    print("CUDA memory cleared and model deleted.")

###############################################################################
# Regression Metrics & Aggregation
###############################################################################
def evaluate_regression_performance(y_true, y_pred):
    """
    Compute regression metrics for predictions.
    Returns a dictionary with MSE, MAE, and R2.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    results = {
        'mse': mse,
        'mae': mae,
        'r2': r2
    }
    return results

def print_regression_performance(results):
    """
    Print regression performance metrics nicely.
    """
    print(f"MSE: {results['mse']:.4f}")
    print(f"MAE: {results['mae']:.4f}")
    print(f"R²:  {results['r2']:.4f}")

def aggregate_cv_metrics(all_results):
    """
    Aggregate cross-validation metrics (MSE, MAE, R2)
    and return mean + std across folds.
    """
    aggregated = {
        'mse': [],
        'mae': [],
        'r2': []
    }
    
    for result in all_results:
        aggregated['mse'].append(result['mse'])
        aggregated['mae'].append(result['mae'])
        aggregated['r2'].append(result['r2'])
        
    summary = {
        'mean_mse':  np.mean(aggregated['mse']),
        'std_mse':   np.std(aggregated['mse']),
        'mean_mae':  np.mean(aggregated['mae']),
        'std_mae':   np.std(aggregated['mae']),
        'mean_r2':   np.mean(aggregated['r2']),
        'std_r2':    np.std(aggregated['r2']),
    }
    return summary

def print_cv_summary(summary):
    """
    Print the aggregated CV summary (MSE, MAE, R²).
    """
    print(f"Mean MSE:  {summary['mean_mse']:.4f} ± {summary['std_mse']:.4f}")
    print(f"Mean MAE:  {summary['mean_mae']:.4f} ± {summary['std_mae']:.4f}")
    print(f"Mean R²:   {summary['mean_r2']:.4f} ± {summary['std_r2']:.4f}")



In [3]:
df = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/00_NAKO/00_data/deconfounded_but_age/aparc.thickness_aseg.volume_aparc.volume.csv")
label_df = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/00_NAKO/00_data/age_label/all_ages_all_ids_healthy.csv")
n_splits = 5

merged_df = pd.merge(df, label_df, on='ID', how='inner')
merged_df.dropna(inplace=True)
df_sampled, _ = train_test_split(merged_df, train_size=10000, stratify=merged_df["label_age_group"], random_state=42)
df_sampled["label_Age"].value_counts()

y = df_sampled["label_Age"]
col_to_drop = [col for col in label_df.columns]
X = df_sampled.drop(col_to_drop, axis=1)

In [4]:
col_to_drop

['ID', 'Sex', 'label_Age', 'Site', 'label_age_group']

In [5]:
df_control = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/final_folder/aparc.thickness_aparc.volume_aseg.volume.csv")
label_df_control = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/final_folder/aparc.thickness_aparc.volume_aseg.volume_label.csv")

label_df_control = label_df_control[['ID', 'label_Age']]
df_control = df_control[df.columns]
merged_df_control = pd.merge(df_control, label_df_control, on='ID', how='inner')
merged_df_control.dropna(inplace=True)

X_control = merged_df_control.drop(["ID", "label_Age"], axis=1)
y_control = merged_df_control["label_Age"]

merged_df_control["label_Age"].value_counts()


label_Age
23.00    30
25.00    28
21.00    28
22.00    23
24.00    17
40.00    13
26.00    13
20.75    13
32.00    11
48.00    11
21.75    11
20.25    11
20.50    11
29.00    11
24.25    10
22.75    10
22.50    10
22.25    10
49.00     9
23.75     9
37.00     9
19.75     8
21.50     8
47.00     8
19.00     8
23.25     8
24.75     7
30.00     7
31.00     7
43.00     7
50.00     7
41.00     7
23.50     6
36.00     6
21.25     6
45.00     6
27.00     6
33.00     6
42.00     6
20.00     6
46.00     5
38.00     5
28.00     5
19.25     5
39.00     4
44.00     4
35.00     3
34.00     3
25.50     3
18.25     3
18.75     3
19.50     2
25.25     2
24.50     2
25.75     1
Name: count, dtype: int64

In [6]:
#check len of X and y
print(len(X), len(y))
print(len(X_control), len(y_control))
#columns number
print(X.shape[1], X_control.shape[1])

for col in X.columns:
    if col not in X_control.columns:
        print(col)

10000 10000
478 478
192 192


In [8]:
###############################################################################
# K-Fold Cross-Validation Setup
###############################################################################
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

###############################################################################
# Training & Evaluation
###############################################################################
mlp_results = []
lgb_results = []
tabpfn_results = []
random_results = []

mlp_results_eval = []
lgb_results_eval = []
tabpfn_results_eval = []

model_dict = {}
best_mse_mlp = float('inf')
best_mse_lgb = float('inf')
best_mse_tab = float('inf')

for fold, (train_index, val_index) in enumerate(kf.split(X), 1):
    print(f"\n=== Fold {fold} ===")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled   = scaler.transform(X_val)
    X_control_scaled = scaler.fit_transform(X_control)

    # -------------------------------
    # Random Baseline
    # -------------------------------
    # We'll generate random predictions from a normal distribution 
    # matching the train target's mean and std
    random_predictions = np.random.normal(loc=y_train.mean(), scale=y_train.std(), size=len(y_val))
    random_perf = evaluate_regression_performance(y_val, random_predictions)
    print("Random Baseline Performance:")
    print_regression_performance(random_perf)
    random_results.append(random_perf)
    
    # -------------------------------
    # MLP
    # -------------------------------
    # KerasRegressor or direct model
    mlp_model = create_mlp_model(input_shape=X_train_scaled.shape[1])
    mlp_model.fit(X_train_scaled, y_train, 
                  epochs=10, 
                  batch_size=32,
                  verbose=0)
    
    y_pred_mlp = mlp_model.predict(X_val_scaled).ravel()  # ensure shape (n,)
    mlp_perf = evaluate_regression_performance(y_val, y_pred_mlp)
    print("\nMLP Performance on Validation:")
    print_regression_performance(mlp_perf)
    mlp_results.append(mlp_perf)
    
    # Evaluate on control data
    y_pred_mlp_ctrl = mlp_model.predict(X_control_scaled).ravel()
    mlp_perf_ctrl = evaluate_regression_performance(y_control, y_pred_mlp_ctrl)
    print("MLP Performance on Control:")
    print_regression_performance(mlp_perf_ctrl)
    mlp_results_eval.append(mlp_perf_ctrl)
    
    # Keep best MLP model based on MSE
    if mlp_perf['mse'] < best_mse_mlp:
        best_mse_mlp = mlp_perf['mse']
        model_dict["mlp"] = mlp_model
    
    # Clean up
    clean_up_cuda(mlp_model)

    # -------------------------------
    # (Hypothetical) TabPFN Regressor
    # -------------------------------
    # NOTE: If TabPFNClassifier is the only option, you must skip or replace this.
    try:
        tabclf = TabPFNRegressor()  # Ideally TabPFNRegressor() if available
        tabclf.fit(X_train_scaled, y_train)
        y_pred_tab = tabclf.predict(X_val_scaled)  # For regression, this should be continuous
        tab_perf = evaluate_regression_performance(y_val, y_pred_tab)
        print("\nTabPFN Regressor Performance on Validation:")
        print_regression_performance(tab_perf)
        tabpfn_results.append(tab_perf)
        
        # Evaluate on control data
        y_pred_tab_ctrl = tabclf.predict(X_control_scaled)
        tab_perf_ctrl = evaluate_regression_performance(y_control, y_pred_tab_ctrl)
        print("TabPFN Regressor Performance on Control:")
        print_regression_performance(tab_perf_ctrl)
        tabpfn_results_eval.append(tab_perf_ctrl)
        
        if tab_perf['mse'] < best_mse_tab:
            best_mse_tab = tab_perf['mse']
            model_dict["tabpfn"] = tabclf
        
        clean_up_cuda(tabclf)
    except Exception as e:
        print("TabPFN Regressor not available or failed. Skipping...")
        print(e)
    
    # -------------------------------
    # LightGBM
    # -------------------------------
    lgb_train = lgb.Dataset(X_train_scaled, label=y_train)
    lgb_eval  = lgb.Dataset(X_val_scaled,   label=y_val, reference=lgb_train)
    
    lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'seed': 42
    }
    
    lgbclf = lgb.train(
        params=lgb_params, 
        train_set=lgb_train, 
        valid_sets=[lgb_train, lgb_eval], 
        num_boost_round=1000
    )
    
    y_pred_lgb = lgbclf.predict(X_val_scaled)
    lgb_perf = evaluate_regression_performance(y_val, y_pred_lgb)
    print("\nLightGBM Performance on Validation:")
    print_regression_performance(lgb_perf)
    lgb_results.append(lgb_perf)
    
    # Evaluate on control data
    y_pred_lgb_ctrl = lgbclf.predict(X_control_scaled)
    lgb_perf_ctrl = evaluate_regression_performance(y_control, y_pred_lgb_ctrl)
    print("LightGBM Performance on Control:")
    print_regression_performance(lgb_perf_ctrl)
    lgb_results_eval.append(lgb_perf_ctrl)
    
    if lgb_perf['mse'] < best_mse_lgb:
        best_mse_lgb = lgb_perf['mse']
        model_dict["lgb"] = lgbclf
    
    clean_up_cuda(lgbclf)





=== Fold 1 ===
Random Baseline Performance:
MSE: 320.2751
MAE: 14.4533
R²:  -1.0360
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

MLP Performance on Validation:
MSE: 79.8211
MAE: 6.7917
R²:  0.4926
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
MLP Performance on Control:
MSE: 386.5899
MAE: 17.6893
R²:  -3.8075
CUDA memory cleared and model deleted.

TabPFN Regressor Performance on Validation:
MSE: 36.0748
MAE: 4.7365
R²:  0.7707
TabPFN Regressor Performance on Control:
MSE: 532.5100
MAE: 20.4882
R²:  -5.6221
CUDA memory cleared and model deleted.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47482
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 188
[LightGBM] [Info] Start training from score 48.651500

LightGBM Performance on Validation:
MSE: 42.7

In [10]:
###############################################################################
# Print Cross-Validation Summaries
###############################################################################
print("\n=== Cross-Validation Summary (Random Baseline) ===")
random_summary = aggregate_cv_metrics(random_results)
print_cv_summary(random_summary)

print("\n=== Cross-Validation Summary (MLP) ===")
mlp_summary = aggregate_cv_metrics(mlp_results)
print_cv_summary(mlp_summary)

print("\n=== Cross-Validation Summary (TabPFN) ===")
if tabpfn_results:
    tabpfn_summary = aggregate_cv_metrics(tabpfn_results)
    print_cv_summary(tabpfn_summary)
else:
    print("No TabPFN results recorded.")

print("\n=== Cross-Validation Summary (LightGBM) ===")
lgb_summary = aggregate_cv_metrics(lgb_results)
print_cv_summary(lgb_summary)


=== Cross-Validation Summary (Random Baseline) ===
Mean MSE:  313.1276 ± 6.4556
Mean MAE:  14.1977 ± 0.1936
Mean R²:   -1.0422 ± 0.0375

=== Cross-Validation Summary (MLP) ===
Mean MSE:  63.6548 ± 8.7813
Mean MAE:  6.2862 ± 0.3217
Mean R²:   0.5857 ± 0.0499

=== Cross-Validation Summary (TabPFN) ===
Mean MSE:  35.0199 ± 1.5113
Mean MAE:  4.6340 ± 0.0964
Mean R²:   0.7716 ± 0.0086

=== Cross-Validation Summary (LightGBM) ===
Mean MSE:  41.9693 ± 1.4864
Mean MAE:  5.1293 ± 0.0970
Mean R²:   0.7263 ± 0.0087


In [None]:

###############################################################################
# Example: Load a saved model & evaluate on control data
###############################################################################
# If you have a saved regression model:
"""
import pickle
save_dir = "../98_models/"
with open(os.path.join(save_dir, "best_regressor.pkl"), "rb") as f:
    loaded_model = pickle.load(f)
    # For example, if it's a LightGBM model, you can just do:
    y_pred_control = loaded_model.predict(X_control_scaled)
    performance_control = evaluate_regression_performance(y_control, y_pred_control)
    print("\nLoaded Model Performance on Control Data:")
    print_regression_performance(performance_control)
"""
