### Comparison with State-of-the-Art Multimodal Models

We compare the performance of our proposed method **SAMVAE** against several state-of-the-art multimodal survival models on breast cancer (BRCA) data, using the concordance index (C-index) across 5 folds as the evaluation metric. For each model, we report the mean and standard deviation of the C-index, together with the Bonferroni-corrected p-value of a Welch's t-test against the best-performing model, **BioFusionNet**.

The performance values for all baseline models, including **BioFusionNet**, are taken as reported in <https://doi.org/10.1109/jbhi.2024.3418341>.


In [8]:
import pickle
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind_from_stats
from statsmodels.stats.multitest import multipletests

# ---------------------------------------------------------------------------
# Compare SAMVAE with published multimodal survival baselines (C-index).
#
# Steps:
#   1. Load the stored SAMVAE results and pick the seed with the highest mean
#      central C-index across folds.
#   2. Tabulate per-fold C-index values for SAMVAE and the baselines.
#   3. Run Welch's t-test of every model against BioFusionNet from the
#      per-model mean / std, then Bonferroni-correct the p-values.
#   4. Print a "Mean ± Std" / corrected p-value table.
# ---------------------------------------------------------------------------

# Load results
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only point this at result files produced by this project.
file_path = '../results/Final_Combinations/Survival_Analysis/brca/clinical_omic_cnv_wsi_patches_15_patch/5_folds_1_batch_size/best_results.pkl'
with open(file_path, 'rb') as f:
    results = pickle.load(f)

# Extract and process central C-index values
# Presumably `best_cis` is indexed [fold][seed] and each entry's [0][1] is the
# central estimate of a confidence interval -- TODO confirm against the code
# that writes best_results.pkl.
best_cis = results['best_cis']
fold_central_values = [[ci[0][1] for ci in fold] for fold in best_cis]
# Transpose: one tuple per seed, holding that seed's value in every fold.
seeds_central = list(zip(*fold_central_values))
seed_means = [np.mean(seed_vals) for seed_vals in seeds_central]
# NOTE(review): the seed is selected by the highest mean of the very values
# that are reported below, which may be optimistic -- confirm this matches the
# intended evaluation protocol.
best_seed_idx = int(np.argmax(seed_means))
samvae_values = list(seeds_central[best_seed_idx])

# Create dict with 5-fold C-index values
# Baseline numbers are hard-coded per fold; SAMVAE is truncated to the same
# five folds so every column of the DataFrame has equal length.
c_index_data = {
    'SAMVAE': samvae_values[:5],
    "MultiSurv": [0.71, 0.59, 0.60, 0.69, 0.54],
    "MultiDeepCox-SC": [0.71, 0.68, 0.55, 0.58, 0.50],
    "HFBSurv": [0.58, 0.47, 0.56, 0.45, 0.62],
    "PathomicFusion": [0.63, 0.43, 0.56, 0.50, 0.46],
    "MCAT": [0.71, 0.69, 0.64, 0.70, 0.76],
    "TransSurv": [0.70, 0.61, 0.69, 0.69, 0.74],
    "BioFusionNet": [0.78, 0.71, 0.72, 0.81, 0.82],
}

# Create DataFrame and compute mean & std
# DataFrame.std defaults to ddof=1 (sample standard deviation).
df = pd.DataFrame(c_index_data)
mean_std = df.agg(['mean', 'std']).T
mean_std['Mean ± Std'] = mean_std.apply(lambda row: f"{row['mean']:.3f} ± {row['std']:.3f}", axis=1)

# Prepare for t-tests against BioFusionNet
# Take the sample size from the table itself (one row per fold) so it cannot
# drift from the data the means and stds were computed on.
n = len(df)
stats = {model: (row['mean'], row['std']) for model, row in mean_std.iterrows()}
bio_mean, bio_std = stats['BioFusionNet']
raw_pvals = []
models = []

for model, (mean, std) in stats.items():
    if model == 'BioFusionNet':
        # The reference model is not tested against itself.
        continue
    # equal_var=False -> Welch's t-test (no equal-variance assumption).
    # Only the p-value is used; the t statistic is discarded.
    _, p_val = ttest_ind_from_stats(mean1=bio_mean, std1=bio_std, nobs1=n,
                                    mean2=mean, std2=std, nobs2=n,
                                    equal_var=False)
    raw_pvals.append(p_val)
    models.append(model)

# Apply Bonferroni correction
_, pvals_corrected, _, _ = multipletests(raw_pvals, alpha=0.05, method='bonferroni')
bonferroni_dict = dict(zip(models, np.round(pvals_corrected, 3)))
# BioFusionNet has no p-value of its own; give it 1.000 so the mapped column
# below does not contain NaN for the reference row.
bonferroni_dict['BioFusionNet'] = 1.000

# Merge and display final result
mean_std['Bonferroni corrected'] = mean_std.index.map(bonferroni_dict)
final_df = mean_std[['Mean ± Std', 'Bonferroni corrected']]
print(final_df)


                    Mean ± Std  Bonferroni corrected
SAMVAE           0.699 ± 0.065                 0.705
MultiSurv        0.626 ± 0.072                 0.057
MultiDeepCox-SC  0.604 ± 0.088                 0.072
HFBSurv          0.536 ± 0.073                 0.004
PathomicFusion   0.516 ± 0.080                 0.005
MCAT             0.700 ± 0.043                 0.366
TransSurv        0.686 ± 0.047                 0.207
BioFusionNet     0.768 ± 0.051                 1.000
