In [None]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tabpfn import TabPFNClassifier
import os
import numpy as np
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import label_binarize

# Load the feature table and the age-group labels; only the ID and the label
# column of the label file are needed.
df = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/00_NAKO/00_data/deconfounded_but_age/aseg.volume_aparc.volume_aparc.thickness.csv")
label_df = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/00_NAKO/00_data/age_label/all_ages.csv")
label_df = label_df[['ID', 'label_age_group']]

# Merge features and labels on the subject ID and discard incomplete rows.
# The result is re-assigned (no inplace mutation) so re-running the cell always
# starts from a freshly merged frame.
merged_df = pd.merge(df, label_df, on='ID', how='inner')
merged_df = merged_df.dropna()

# Stratified sampling to maintain label distribution
df_sampled, _ = train_test_split(merged_df, train_size=10000, stratify=merged_df["label_age_group"], random_state=42)

# Drop age group 10 from the sample (done after sampling, so the final sample
# can be smaller than 10000 rows).
df_sampled = df_sampled[df_sampled.label_age_group != 10]

# Prepare features and target
y = df_sampled["label_age_group"]
X = df_sampled.drop(["ID", "label_age_group"], axis=1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)

# Initialize classifiers
models = {
    "TabPFN": TabPFNClassifier(),
    "LightGBM": lgb.LGBMClassifier(objective='multiclass', num_class=len(y.unique()), num_leaves=31, learning_rate=0.05, feature_fraction=0.9, random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=200, random_state=42)
}

# Training and evaluation
y_test_arr = np.asarray(y_test)
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Macro-averaged one-vs-rest ROC AUC, computed column by column so that it
    # also works when only two classes are left (a binarised two-class target
    # has a single column and cannot be paired with two probability columns).
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)
        # Probability columns follow the fitted estimator's class order; fall
        # back to the sorted training labels if the attribute is missing.
        model_classes = np.asarray(model.classes_ if hasattr(model, 'classes_') else np.unique(y_train))
        per_class_auc = [
            roc_auc_score((y_test_arr == cls).astype(int), y_pred_proba[:, col])
            for col, cls in enumerate(model_classes)
        ]
        auc = float(np.mean(per_class_auc))
    else:
        auc = "N/A (no probability predictions available)"

    acc = accuracy_score(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Print results
    print(f"{name} Results:")
    print(f"ROC AUC: {auc}")
    print(f"Accuracy: {acc}")
    print(f"Balanced Accuracy: {balanced_acc}")
    print(report)




In [None]:
# Validation on control dataset
# Relies on `df`, `models` and `y_train` created by the training cell.
print("\nValidating models on control dataset...")
df_control = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/deconfounded_but_age/aseg.volume_aparc.thickness_aparc.volume.csv")
label_df_control = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/age_label/all_ages.csv")

label_df_control = label_df_control[['ID', 'label_age_group']]
df_control = df_control[df.columns]  # Same columns, in the same order, as the training feature table

merged_df_control = pd.merge(df_control, label_df_control, on='ID', how='inner')
merged_df_control = merged_df_control.dropna()

# Age group 10 is removed from the training sample, so the models can never
# predict it; remove it from the evaluated control rows as well. The unfiltered
# frame stays available under `merged_df_control`.
control_eval_df = merged_df_control[merged_df_control.label_age_group != 10]

X_control = control_eval_df.drop(["ID", "label_age_group"], axis=1)
y_control = control_eval_df["label_age_group"]
y_control_arr = np.asarray(y_control)

for name, model in models.items():
    print(f"\nValidating {name}...")

    y_pred_control = model.predict(X_control)

    if hasattr(model, "predict_proba"):
        y_pred_proba_control = model.predict_proba(X_control)

        # Probability columns follow the class order learned during fit, so the
        # fallback must come from the training labels, not from the control set.
        model_classes = np.asarray(model.classes_ if hasattr(model, 'classes_') else np.unique(y_train))

        # Macro one-vs-rest ROC AUC over the classes that the model knows AND
        # that are usable in the control set (at least one positive and one
        # negative row). Control labels unknown to the model are simply
        # negatives for every column instead of raising an IndexError.
        per_class_auc = []
        for col, cls in enumerate(model_classes):
            is_cls = y_control_arr == cls
            if is_cls.any() and not is_cls.all():
                per_class_auc.append(roc_auc_score(is_cls.astype(int), y_pred_proba_control[:, col]))

        auc_control = float(np.mean(per_class_auc)) if per_class_auc else float("nan")
    else:
        auc_control = "N/A (no probability predictions available)"

    acc_control = accuracy_score(y_control, y_pred_control)
    balanced_acc_control = balanced_accuracy_score(y_control, y_pred_control)
    report_control = classification_report(y_control, y_pred_control)

    print(f"{name} Control Results:")
    print(f"ROC AUC: {auc_control}")
    print(f"Accuracy: {acc_control}")
    print(f"Balanced Accuracy: {balanced_acc_control}")
    print(report_control)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, classification_report, make_scorer
from sklearn.model_selection import train_test_split
from tabpfn import TabPFNClassifier
import os
import numpy as np
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping

In [None]:

def create_mlp_model(input_shape, num_classes):
    """Build and compile a fully connected softmax classifier.

    The network has four ReLU hidden layers (1024, 512, 256 and 128 units),
    each followed by a Dropout layer with rate 0.3, and a softmax output layer.
    It is compiled with the Adam optimizer, categorical cross-entropy loss and
    the accuracy metric.

    Args:
        input_shape: Number of input features; wrapped into a 1-tuple for the
            first Dense layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential()

    # Only the first hidden layer declares the input shape.
    network.add(Dense(1024, activation="relu", input_shape=(input_shape,)))
    network.add(Dropout(0.3))

    # Remaining hidden layers, each followed by the same dropout rate.
    for width in (512, 256, 128):
        network.add(Dense(width, activation="relu"))
        network.add(Dropout(0.3))

    network.add(Dense(num_classes, activation="softmax"))
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network

# Metrics evaluated by cross_validate, keyed by the name used in the report.
# ROC AUC has to be computed from class probabilities. A scorer built with
# make_scorer(roc_auc_score, ...) and no probability response hands the hard
# predict() labels to roc_auc_score, which fails for multiclass targets and
# silently shows up as NaN in the cross-validation results. scikit-learn's
# predefined 'roc_auc_ovr' scorer uses predict_proba with one-vs-rest and
# macro averaging.
scoring = {
    'accuracy': 'accuracy',
    'balanced_accuracy': 'balanced_accuracy',
    'roc_auc_ovr': 'roc_auc_ovr',
}

# Load the feature table and the age-group labels; only the ID and the label
# column of the label file are needed.
df = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/00_NAKO/00_data/deconfounded_but_age/aseg.volume_aparc.volume_aparc.thickness.csv")
label_df = pd.read_csv("/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/00_NAKO/00_data/age_label/all_ages.csv")
label_df = label_df[['ID', 'label_age_group']]

# Merge features and labels on the subject ID and discard incomplete rows.
# Re-assigned instead of mutated in place so the cell is safe to re-run.
merged_df = pd.merge(df, label_df, on='ID', how='inner')
merged_df = merged_df.dropna()

# Stratified sampling to maintain label distribution
df_sampled, _ = train_test_split(merged_df, train_size=10000, stratify=merged_df["label_age_group"], random_state=42)

# Drop age group 10 from the sample
df_sampled = df_sampled[df_sampled.label_age_group != 10]

y = df_sampled["label_age_group"]
X = df_sampled.drop(["ID", "label_age_group"], axis=1)

# Model dimensions are derived from the X / y built in this cell. They are
# deliberately not re-derived from X_train / y_train: those names only exist if
# the earlier train/test cell was run, and cross-validation below uses X and y.
num_classes = len(y.unique())
input_shape = X.shape[1]

print(f"Number of classes: {num_classes}")
print(f"Number of features: {input_shape}")



In [None]:
from tensorflow.keras.utils import to_categorical
# Create pipelines for each model that include normalization
def create_keras_mlp_wrapper(input_shape, num_classes):
    """Return a zero-argument factory that builds the MLP with fixed dimensions.

    Args:
        input_shape: Forwarded unchanged to create_mlp_model.
        num_classes: Forwarded unchanged to create_mlp_model.

    Returns:
        A callable taking no arguments; every call returns the result of a new
        create_mlp_model(input_shape, num_classes) call.
    """
    def wrapper_model():
        # Both arguments are captured from the enclosing call.
        network = create_mlp_model(input_shape, num_classes)
        return network

    return wrapper_model
def _scaled_pipeline(estimator):
    """Wrap an estimator in a Pipeline that standardises the features first."""
    return Pipeline([('scaler', StandardScaler()), ('model', estimator)])


# One scaler + model pipeline per classifier, keyed by the name used in reports.
pipelines = {
    "LightGBM": _scaled_pipeline(
        lgb.LGBMClassifier(
            objective='multiclass',
            num_class=len(y.unique()),
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            random_state=42,
        )
    ),
    "TabPFN": _scaled_pipeline(TabPFNClassifier()),
}

# Two-fold stratified cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
# One-hot encoded copy of the labels; the loop below does not use it.
y_onehot = to_categorical(y)

# Cross-validate every pipeline, report mean and spread per metric, then refit.
for name, pipeline in pipelines.items():
    print(f"\nTraining and evaluating {name} with cross-validation...")

    cv_scores = cross_validate(pipeline, X, y, cv=cv, scoring=scoring)

    print(f"{name} Cross-validation Results:")
    for metric in scoring:
        fold_scores = cv_scores[f'test_{metric}']
        mean_score = fold_scores.mean()
        std_score = fold_scores.std()
        # The reported interval is two standard deviations of the fold scores.
        print(f"Mean {metric}: {mean_score:.4f} (+/- {std_score * 2:.4f})")

    # Refit on the full sample so the pipeline can be applied to the control data later.
    pipeline.fit(X, y)


In [None]:
# Validation on control dataset
# Relies on `merged_df_control` built by the earlier control-validation cell and
# on `pipelines`, `y` from the cross-validation cells.
print("\nValidating models on control dataset...")

# Age group 10 is removed from the training sample, so the pipelines can never
# predict it; remove it from the evaluated control rows as well, without
# modifying `merged_df_control` itself.
control_eval_df = merged_df_control[merged_df_control.label_age_group != 10]
X_control = control_eval_df.drop(["ID", "label_age_group"], axis=1)
y_control = control_eval_df["label_age_group"]
y_control_arr = np.asarray(y_control)

for name, pipeline in pipelines.items():
    print(f"\nValidating {name}...")

    y_pred_control = pipeline.predict(X_control)

    # Get the trained model from the pipeline
    model = pipeline.named_steps['model']

    if hasattr(model, "predict_proba"):
        y_pred_proba_control = pipeline.predict_proba(X_control)

        # Probability columns follow the class order learned during fit; fall
        # back to the sorted labels the pipeline was fitted on.
        model_classes = np.asarray(model.classes_ if hasattr(model, 'classes_') else np.unique(y))

        # Macro one-vs-rest ROC AUC over the classes that the model knows AND
        # that are usable in the control set (at least one positive and one
        # negative row). Control labels unknown to the model are simply
        # negatives for every column instead of raising an IndexError.
        per_class_auc = []
        for col, cls in enumerate(model_classes):
            is_cls = y_control_arr == cls
            if is_cls.any() and not is_cls.all():
                per_class_auc.append(roc_auc_score(is_cls.astype(int), y_pred_proba_control[:, col]))

        auc_control = float(np.mean(per_class_auc)) if per_class_auc else float("nan")
    else:
        auc_control = "N/A (no probability predictions available)"

    acc_control = accuracy_score(y_control, y_pred_control)
    balanced_acc_control = balanced_accuracy_score(y_control, y_pred_control)
    report_control = classification_report(y_control, y_pred_control)

    print(f"{name} Control Results:")
    print(f"ROC AUC: {auc_control}")
    print(f"Accuracy: {acc_control}")
    print(f"Balanced Accuracy: {balanced_acc_control}")
    print(report_control)

: 