In [1]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from tabpfn import TabPFNClassifier
import pandas as pd
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
import numpy as np
from sklearn.metrics import balanced_accuracy_score, classification_report
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import label_binarize
import gc
import torch
from tensorflow.keras import backend as K
import warnings
warnings.filterwarnings('ignore')

2025-01-24 19:06:39.368345: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-24 19:06:39.383688: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737741999.402771 2490994 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737741999.408537 2490994 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-24 19:06:39.428587: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
def create_mlp_model(input_shape, num_classes):
    model = Sequential([
        Dense(1024, activation="relu", input_shape=(input_shape,)),
        Dropout(0.3),
        Dense(512, activation="relu"),
        Dropout(0.3),
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(128, activation="relu"),
        Dropout(0.3),
        Dense(num_classes, activation="softmax")
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [3]:
def clean_up_cuda(model):
    # Delete the Keras model
    K.clear_session()
    del model
    
    # Run garbage collection
    gc.collect()
    
    # Free CUDA memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    
    print("CUDA memory cleared and model deleted.")

In [4]:
# Example usage
def print_model_performance(results):
    """
    Print model performance metrics
    
    Parameters:
    results (dict): Performance metrics from evaluate_model_performance()
    """
    for metric, value in results.items():
        if metric == 'classification_report':
            print("\nClassification Report:")
            print(value)
        else:
            print(f"{metric.replace('_', ' ').title()}: {value}")
def aggregate_cv_metrics_and_print(all_results, model_name, tag="Validation"):
    """
    Aggregate cross-validation metrics
    
    Parameters:
    all_results (list): List of results dictionaries from each fold
    
    Returns:
    dict: Aggregated metrics with means and standard deviations
    """
    # Initialize aggregation dictionary
    aggregated = {
        'accuracy': [],
        'balanced_accuracy': [],
        'random_balanced_accuracy': [],
        'roc_auc': []
    }
    
    # Collect metrics from each fold
    for result in all_results:
        aggregated['accuracy'].append(result['accuracy'])
        aggregated['balanced_accuracy'].append(result['balanced_accuracy'])
        aggregated['random_balanced_accuracy'].append(result['random_balanced_accuracy'])
        aggregated['roc_auc'].append(result['roc_auc'])
    # Compute mean and standard deviation
    summary = {
        'mean_accuracy': np.mean(aggregated['accuracy']),
        'std_accuracy': np.std(aggregated['accuracy']),
        'mean_balanced_accuracy': np.mean(aggregated['balanced_accuracy']),
        'std_balanced_accuracy': np.std(aggregated['balanced_accuracy']),
        'mean_random_balanced_accuracy': np.mean(aggregated['random_balanced_accuracy']),
        'std_random_balanced_accuracy': np.std(aggregated['random_balanced_accuracy']),
        'mean_roc_auc': np.mean(aggregated['roc_auc']),
        'std_roc_auc': np.std(aggregated['roc_auc'])
    }
    
    print(f"\n {model_name} Classifier Performance {tag}:")
    print_model_performance(summary)
    return summary

In [5]:
df = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/00_NAKO/00_data/deconfounded_but_age/aparc.thickness_aseg.volume_aparc.volume.csv")
label_df = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/00_NAKO/00_data/age_label/all_ages_all_ids_healthy.csv")
n_splits = 5


label_df = label_df[['ID', 'label_age_group']]
merged_df = pd.merge(df, label_df, on='ID', how='inner')
merged_df.dropna(inplace=True)
df_sampled, _ = train_test_split(merged_df, train_size=10000, stratify=merged_df["label_age_group"], random_state=42)
df_sampled["label_age_group"].value_counts()

label_age_group
2.0    2931
3.0    2722
4.0    2056
1.0    1124
0.0    1020
5.0     147
Name: count, dtype: int64

In [6]:
df_control = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/final_folder/aparc.thickness_aparc.volume_aseg.volume.csv")
label_df_control = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/final_folder/aparc.thickness_aparc.volume_aseg.volume_label.csv")

label_df_control = label_df_control[['ID', 'label_age_group']]
df_control = df_control[df.columns]
merged_df_control = pd.merge(df_control, label_df_control, on='ID', how='inner')
merged_df_control.dropna(inplace=True)

X_control = merged_df_control.drop(["ID", "label_age_group"], axis=1)
y_control = merged_df_control["label_age_group"]

merged_df_control["label_age_group"].value_counts()


label_age_group
0.0    320
2.0     76
1.0     61
3.0      7
Name: count, dtype: int64

In [7]:
def evaluate_model_performance_train(y_test, y_pred, y_pred_proba, y_val_bin=None):
    # Compute basic metrics
    acc = accuracy_score(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    # Random comparison
    n_classes = len(np.unique(y_test))
    random_y_test = np.random.randint(0, n_classes, size=y_test.shape)
    random_balanced_acc = balanced_accuracy_score(random_y_test, y_pred)
    
    # ROC AUC (if probabilities provided)
    if y_val_bin is not None:
        y_test = y_val_bin
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')
    
    # Prepare results
    results = {
        'accuracy': acc,
        'balanced_accuracy': balanced_acc,
        'random_balanced_accuracy': random_balanced_acc,
        'classification_report': report
    }
    
    if auc is not None:
        results['roc_auc'] = auc
    
    return results, balanced_acc




In [8]:
def predict_and_evaluate(model, X_val, y_val, original_classes=None, multi_class=False):
    if multi_class:
        y_pred_proba = model.predict(X_val)
        y_pred = np.argmax(y_pred_proba, axis=1)
        #print(y_pred)
    else:
        y_pred_proba = model.predict_proba(X_val)
        y_pred = model.predict(X_val)
        #print(y_pred)
    
    # Get unique classes present in validation data
    present_classes = np.unique(y_val)
    
    # Get the indices of these classes in the original prediction probabilities
    class_indices = [np.where(original_classes == cls)[0][0] for cls in present_classes]
    
    # Select only the probability columns for present classes
    y_pred_proba_filtered = y_pred_proba[:, class_indices]
    
    # Binarize the true labels using only the present classes
    y_val_bin = label_binarize(y_val, classes=present_classes)

    results, balanced_acc = evaluate_model_performance_train(y_val, y_pred, y_pred_proba_filtered, y_val_bin)
    print_model_performance(results)
    return results, balanced_acc

In [None]:

percentage_of_the_data = [0.01, 0.03, 0.05, 0.1, 0.15, 0.5, 1]
percentage_dict = {}
for percentage in percentage_of_the_data:

        if percentage == 1:
                print(f"\n #### TRAINING WITH {percentage} OF THE DATA ####")
                df_sampled_subset = df_sampled
        else:
                print(f"\n #### TRAINING WITH {percentage} OF THE DATA ####")
                df_sampled_subset, _ = train_test_split(
                df_sampled,
                train_size=percentage,  # Use train_size to get desired percentage
                stratify=df_sampled["label_age_group"],
                random_state=42
                )

        y = df_sampled_subset["label_age_group"]
        X = df_sampled_subset.drop(["ID", "label_age_group"], axis=1)

        print(f"Training data shape: {X.shape}, length of y: {len(y)}")
        print(f"Training data class distribution: {y.value_counts()}")
        

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        cv_results = {
                'accuracy': [],
                'balanced_accuracy': [],
                'roc_auc': [],
                'classification_reports': []
        }

        tab_pfn = TabPFNClassifier()

        tabpfn_results = []
        tabpfn_results_eval = []
        lgb_results = []
        lgb_results_eval = []
        random_results = []
        mlp_results = []
        mlp_results_eval = []
        model_dict = {}
        model_results = {}


        best_balanced_accuracy_mlp = 0
        best_balanced_accuracy_tabpfn = 0
        best_balanced_accuracy_lgb = 0
        for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
                unique_classes = np.unique(y)
                missing_classes = [cls for cls in unique_classes if cls not in y.iloc[val_index]]
                for cls in missing_classes:
                        cls_indices = np.where(y == cls)[0]  # Get all indices of the missing class
                        # Check if removing a sample would leave train set empty for the class
                        train_cls_indices = np.intersect1d(cls_indices, train_index)

                        if len(train_cls_indices) <= 1:
                                # If moving the last one, instead take a duplicate from the whole y array
                                cls_idx_to_move = np.random.choice(cls_indices, 1)[0]
                        else:
                                cls_idx_to_move = np.random.choice(train_cls_indices, 1)[0]
                        # Add to validation set
                        val_index = np.append(val_index, cls_idx_to_move)
                        # Remove only if it's not the last one in train
                        if len(train_cls_indices) > 1:
                                train_index = np.setdiff1d(train_index, cls_idx_to_move)
                print(f"\nFold {fold}")
                X_train, X_test = X.iloc[train_index], X.iloc[val_index]
                y_train, y_test = y.iloc[train_index], y.iloc[val_index]

                #scaler = MinMaxScaler()
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)
                
                X_control = scaler.fit_transform(X_control)

                n_classes = len(np.unique(y_test))
                random_y_test = np.random.randint(0, n_classes, size=y_test.shape)
                random_y_pred_proba = np.random.rand(len(y_test), n_classes)
                random_y_pred_proba /= random_y_pred_proba.sum(axis=1)[:, np.newaxis]
                results, balanced_accuracy  = evaluate_model_performance_train(y_test, random_y_test, random_y_pred_proba)
                print("RANDOM PERFORMANCE")
                print_model_performance(results)
                random_results.append(results)

                mlpclf = create_mlp_model(input_shape=X_train.shape[1], num_classes=len(y.unique()))
                mlpclf.fit(X_train, pd.get_dummies(y_train), epochs=10, batch_size=32, verbose=0)
                y_pred_proba = mlpclf.predict(X_test)
                y_pred = np.argmax(y_pred_proba, axis=1)
                results, balanced_accuracy = evaluate_model_performance_train(y_test, y_pred, y_pred_proba)
                print("MLP PERFORMANCE")
                print_model_performance(results)
                mlp_results.append(results)
                #model_dict["mlp"] = mlpclf
                print("MLP PERFORMANCE FOR CONTROL")
                results, balanced_accuracy = predict_and_evaluate(mlpclf, X_control, y_control, original_classes = np.unique(y_train), multi_class=True)
                mlp_results_eval.append(results)
                if balanced_accuracy > best_balanced_accuracy_mlp:
                        best_balanced_accuracy_mlp = balanced_accuracy
                        model_dict["mlp"] = mlpclf
                clean_up_cuda(mlpclf)

                tabclf = TabPFNClassifier()
                tabclf.fit(X_train, y_train)
                y_pred_proba = tabclf.predict_proba(X_test)
                y_pred = tabclf.predict(X_test)
                results, balanced_accuracy = evaluate_model_performance_train(y_test, y_pred, y_pred_proba)
                print("tabpfn PERFORMANCE")
                print_model_performance(results)
                tabpfn_results.append(results)
                #model_dict["tabpfn"] = tabclf
                original_classes = tabclf.classes_
                print("tabpfn PERFORMANCE FOR CONTROL")
                results, balanced_accuracy = predict_and_evaluate(tabclf, X_control, y_control, original_classes=original_classes)
                tabpfn_results_eval.append(results)
                if balanced_accuracy > best_balanced_accuracy_tabpfn:
                        best_balanced_accuracy_tabpfn = balanced_accuracy
                        model_dict["tabpfn"] = tabclf
                clean_up_cuda(tabclf)
                
                lgb_train = lgb.Dataset(X_train, label=y_train)
                lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)
                params = {
                'objective': 'multiclass',
                'num_class': len(y.unique()),
                'metric': 'multi_logloss',
                'num_leaves': 31,
                'learning_rate': 0.05,
                'feature_fraction': 0.9,
                'seed': 42,
                'verbose': -1
                }
                lgbclf = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], num_boost_round=1000)
                y_pred_proba = lgbclf.predict(X_test)
                y_pred = np.argmax(y_pred_proba, axis=1)
                results, balanced_accuracy = evaluate_model_performance_train(y_test, y_pred, y_pred_proba)
                print("LGBM PERFORMANCE")
                print_model_performance(results)
                lgb_results.append(results)
                print("LGBM PERFORMANCE FOR CONTROL")
                results, balanced_accuracy = predict_and_evaluate(lgbclf, X_control, y_control, original_classes=original_classes, multi_class=True)
                lgb_results_eval.append(results)
                if balanced_accuracy > best_balanced_accuracy_lgb:
                        best_balanced_accuracy_lgb = balanced_accuracy
                        model_dict["lgb"] = lgbclf
                clean_up_cuda(lgbclf)

        random_summary = aggregate_cv_metrics_and_print(random_results, "Random")
        tabpfn_summary = aggregate_cv_metrics_and_print(tabpfn_results, "TabPFN")
        lgb_summary = aggregate_cv_metrics_and_print(lgb_results, "LGBM")
        mlp_summary = aggregate_cv_metrics_and_print(mlp_results, "MLP")

        tabpfn_eval_summary = aggregate_cv_metrics_and_print(tabpfn_results_eval, "TabPFN", "Control")
        lgb_eval_summary = aggregate_cv_metrics_and_print(lgb_results_eval, "LGBM", "Control")
        mlp_eval_summary = aggregate_cv_metrics_and_print(mlp_results_eval, "MLP", "Control")

        percentage_dict[percentage] = {
        "TabPFN": {
                "results": tabpfn_summary,
                "results_eval": tabpfn_eval_summary,
                "cv_results": tabpfn_results,
                "cv_results_eval": tabpfn_results_eval
        },
        "LGBM": {
                "results": lgb_summary,
                "results_eval": lgb_eval_summary,
                "cv_results": lgb_results,
                "cv_results_eval": lgb_results_eval
        },
        "Random": {
                "results": random_summary,
                "cv_results": random_results
        },
        "MLP": {
                "results": mlp_summary,
                "results_eval": mlp_eval_summary,
                "cv_results": mlp_results,
                "cv_results_eval": mlp_results_eval
        }
    }
        



 #### TRAINING WITH 0.01 OF THE DATA ####
Training data shape: (100, 192), length of y: 100
Training data class distribution: label_age_group
2.0    29
3.0    27
4.0    21
1.0    11
0.0    10
5.0     2
Name: count, dtype: int64

Fold 1
RANDOM PERFORMANCE
Accuracy: 0.2692307692307692
Balanced Accuracy: 0.26190476190476186
Random Balanced Accuracy: 0.19444444444444445

Classification Report:
              precision    recall  f1-score   support

         0.0       0.25      0.33      0.29         3
         1.0       0.67      0.67      0.67         3
         2.0       0.33      0.29      0.31         7
         3.0       0.33      0.29      0.31         7
         4.0       0.00      0.00      0.00         5
         5.0       0.00      0.00      0.00         1

    accuracy                           0.27        26
   macro avg       0.26      0.26      0.26        26
weighted avg       0.29      0.27      0.28        26

Roc Auc: 0.40174348915767677


W0000 00:00:1737742005.629677 2490994 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
MLP PERFORMANCE
Accuracy: 0.3076923076923077
Balanced Accuracy: 0.2507936507936508
Random Balanced Accuracy: 0.09999999999999999

Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      0.33      0.40         3
         1.0       0.00      0.00      0.00         3
         2.0       0.25      0.43      0.32         7
         3.0       0.20      0.14      0.17         7
         4.0       0.50      0.60      0.55         5
         5.0       0.00      0.00      0.00         1

    accuracy                           0.31        26
   macro avg       0.24      0.25      0.24        26
weighted avg       0.28      0.31      0.28        26

Roc Auc: 0.6700802731466347
MLP PERFORMANCE FOR CONTROL
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Accuracy: 0.16163793103448276
Balanced Accuracy: 0.22136463083939356
Random Balanced Accuracy: 0.19873

In [None]:
for percentage, models in percentage_dict.items():
    print(f"\nResults for {percentage*100:.0f}% of the data:")
    for model, results in models.items():
        print(f"  {model} - Results: {results['results']}")
        if 'results_eval' in results:
            print(f"  {model} - Evaluation Results: {results['results_eval']}")

In [15]:
""" #load a model
import pickle
import os
save_dir = "../98_models/"
with open(os.path.join(save_dir, "tabpfn.pkl"), "rb") as f:
    model = pickle.load(f)
    original_classes = np.unique(y_control)
    results, balanced_accuracy = predict_and_evaluate(model, X_control, y_control, original_classes=original_classes)
    print_model_performance(results) """

Accuracy: 0.6896551724137931
Balanced Accuracy: 0.25
Random Balanced Accuracy: 0.25

Classification Report:
              precision    recall  f1-score   support

         0.0       0.69      1.00      0.82       320
         1.0       0.00      0.00      0.00        61
         2.0       0.00      0.00      0.00        76
         3.0       0.00      0.00      0.00         7

    accuracy                           0.69       464
   macro avg       0.17      0.25      0.20       464
weighted avg       0.48      0.69      0.56       464

Roc Auc: 0.7635391042080081
Accuracy: 0.6896551724137931
Balanced Accuracy: 0.25
Random Balanced Accuracy: 0.25

Classification Report:
              precision    recall  f1-score   support

         0.0       0.69      1.00      0.82       320
         1.0       0.00      0.00      0.00        61
         2.0       0.00      0.00      0.00        76
         3.0       0.00      0.00      0.00         7

    accuracy                           0.69      