In [7]:
# %% [code]
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.stats import norm

# --- DeLong test functions ---

def compute_midrank(x):
    """
    Computes midranks.
    """
    J = np.argsort(x)
    Z = x[J]
    T = np.empty(len(x), dtype=float)
    i = 0
    while i < len(x):
        j = i
        while j < len(x) and Z[j] == Z[i]:
            j += 1
        # assign the average rank for tied values
        T[i:j] = 0.5 * (i + j - 1)
        i = j
    T2 = np.empty(len(x), dtype=float)
    T2[J] = T
    return T2

def fastDeLong(predictions_sorted_transposed, label_1_count):
    """
    Fast DeLong method for ROC AUC variance estimation.
    predictions_sorted_transposed: 2D array (k x m) where each row corresponds to a classifier’s predictions (here k=1).
    label_1_count: number of positive samples.
    """
    m = predictions_sorted_transposed.shape[1]
    n = label_1_count
    k = predictions_sorted_transposed.shape[0]
    tx = np.empty((k, n))
    ty = np.empty((k, m - n))
    for r in range(k):
        tx[r, :] = compute_midrank(predictions_sorted_transposed[r, :n])
        ty[r, :] = compute_midrank(predictions_sorted_transposed[r, n:])
    tz = np.hstack((tx, ty))
    aucs = tx.sum(axis=1) / (n * (m - n))
    # variance estimation (per DeLong)
    v01 = (tz[:, :n] - np.expand_dims(aucs, axis=1)) ** 2
    sx = np.sum(v01, axis=1) / (n - 1)
    return aucs, sx

def delong_roc_variance(y_true, predictions):
    # Separate positive and negative predictions
    pos_preds = predictions[y_true == 1]
    neg_preds = predictions[y_true == 0]
    m = len(pos_preds) + len(neg_preds)
    n = len(pos_preds)
    # Combine them in an array where positives come first
    predictions_combined = np.concatenate([pos_preds, neg_preds])
    predictions_combined = predictions_combined.reshape(1, -1)
    aucs, sx = fastDeLong(predictions_combined, n)
    return aucs[0], sx[0]


def delong_roc_test(y_true, preds1, preds2):
    """
    Compares two sets of prediction scores using the DeLong test.
    Returns:
      p_value, auc1, auc2
    """
    auc1, var1 = delong_roc_variance(y_true, preds1)
    auc2, var2 = delong_roc_variance(y_true, preds2)
    auc_diff = auc1 - auc2
    # For a paired test the variance of the difference ideally uses the covariance;
    # here we assume independence for simplicity.
    var_diff = var1 + var2  
    z = np.abs(auc_diff) / np.sqrt(var_diff)
    p_value = 2 * (1 - norm.cdf(z))
    return p_value, auc1, auc2

# %% [code]
# Define file paths for each ensemble file
external_avg_path = r"C:\Users\alime\Dropbox (AMC)\Mon PC (DESKTOP-RG9FHVT)\Desktop\Deauville\Deauville_DeepLearning\prediction\external\ensemble_avg.csv"
scratch_avg_path  = r"C:\Users\alime\Dropbox (AMC)\Mon PC (DESKTOP-RG9FHVT)\Desktop\Deauville\Deauville_DeepLearning\prediction\scratch\Run10\ensemble_avg.csv"
transfer_avg_path = r"C:\Users\alime\Dropbox (AMC)\Mon PC (DESKTOP-RG9FHVT)\Desktop\Deauville\Deauville_DeepLearning\prediction\transfer\Run12\ensemble_avg.csv"

# Load each CSV
df_external = pd.read_csv(external_avg_path)
df_scratch  = pd.read_csv(scratch_avg_path)
df_transfer = pd.read_csv(transfer_avg_path)

# Each file has columns: scan_id, probs_model0, target, ..., ensemble_prob, pred_label.
# We assume the "target" is identical across files.
# Merge on 'scan_id'
df_merged = df_external[['scan_id', 'target', 'ensemble_prob']].copy()
df_merged = df_merged.merge(df_scratch[['scan_id', 'ensemble_prob']], on='scan_id', how='inner', suffixes=('_external', '_scratch'))
df_merged = df_merged.merge(df_transfer[['scan_id', 'ensemble_prob']], on='scan_id', how='inner')
# After the second merge, the transfer column gets a suffix automatically, e.g., 'ensemble_prob' or 'ensemble_prob_transfer'.
if 'ensemble_prob_transfer' in df_merged.columns:
    df_merged.rename(columns={'ensemble_prob_transfer': 'ensemble_prob_transfer'}, inplace=True)
else:
    df_merged.rename(columns={'ensemble_prob': 'ensemble_prob_transfer'}, inplace=True)

# Now rename columns for clarity:
df_merged.rename(columns={
    'ensemble_prob_external': 'external_prob',
    'ensemble_prob_scratch': 'scratch_prob',
    'ensemble_prob_transfer': 'transfer_prob'
}, inplace=True)

print("Merged columns:", df_merged.columns.tolist())

# %% [code]
# Compute AUCs for each model ensemble
y_true = df_merged['target'].values
auc_external = roc_auc_score(y_true, df_merged['external_prob'].values)
auc_scratch  = roc_auc_score(y_true, df_merged['scratch_prob'].values)
auc_transfer = roc_auc_score(y_true, df_merged['transfer_prob'].values)

print("AUC External:", auc_external)
print("AUC Scratch:", auc_scratch)
print("AUC Transfer:", auc_transfer)

# %% [code]
# Compare External vs. Scratch using DeLong test
p_ext_scratch, auc_ext, auc_scratch_calc = delong_roc_test(y_true, 
                                                          df_merged['external_prob'].values,
                                                          df_merged['scratch_prob'].values)
print("\nDeLong test External vs. Scratch:")
print(f"AUC External: {auc_ext:.3f}, AUC Scratch: {auc_scratch_calc:.3f}, p-value: {p_ext_scratch:.4f}")

# Compare External vs. Transfer using DeLong test
p_ext_transfer, auc_ext, auc_transfer_calc = delong_roc_test(y_true, 
                                                            df_merged['external_prob'].values,
                                                            df_merged['transfer_prob'].values)
print("\nDeLong test External vs. Transfer:")
print(f"AUC External: {auc_ext:.3f}, AUC Transfer: {auc_transfer_calc:.3f}, p-value: {p_ext_transfer:.4f}")

# Compare Scratch vs. Transfer using DeLong test
p_scratch_transfer, auc_scratch_calc, auc_transfer_calc = delong_roc_test(y_true, 
                                                                        df_merged['scratch_prob'].values,
                                                                        df_merged['transfer_prob'].values)
print("\nDeLong test Scratch vs. Transfer:")
print(f"AUC Scratch: {auc_scratch_calc:.3f}, AUC Transfer: {auc_transfer_calc:.3f}, p-value: {p_scratch_transfer:.4f}")


Merged columns: ['scan_id', 'target', 'external_prob', 'scratch_prob', 'transfer_prob']
AUC External: 0.9056140350877192
AUC Scratch: 0.8760818713450292
AUC Transfer: 0.9397076023391813

DeLong test External vs. Scratch:
AUC External: 0.104, AUC Scratch: 0.104, p-value: 1.0000

DeLong test External vs. Transfer:
AUC External: 0.104, AUC Transfer: 0.104, p-value: 1.0000

DeLong test Scratch vs. Transfer:
AUC Scratch: 0.104, AUC Transfer: 0.104, p-value: 1.0000


In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.stats import norm

def delong_auc_variance(y_true, y_scores):
    # Separate positive and negative scores
    pos_scores = y_scores[y_true == 1]
    neg_scores = y_scores[y_true == 0]
    n1 = len(pos_scores)
    n2 = len(neg_scores)
    
    # Combine scores and compute average ranks
    all_scores = np.concatenate([pos_scores, neg_scores])
    ranks = pd.Series(all_scores).rank(method='average').values
    # Ranks for positive scores
    ranks_pos = ranks[:n1]
    
    auc = (np.sum(ranks_pos) - n1*(n1+1)/2) / (n1*n2)
    Q1 = auc / (2 - auc)
    Q2 = 2 * auc**2 / (1 + auc)
    var_auc = (auc * (1 - auc) + (n1 - 1)*(Q1 - auc**2) + (n2 - 1)*(Q2 - auc**2)) / (n1*n2)
    return auc, var_auc

def delong_roc_test(y_true, preds1, preds2):
    """
    Compare two sets of prediction scores using DeLong test.
    Returns p-value, auc1, auc2.
    """
    auc1, var1 = delong_auc_variance(y_true, preds1)
    auc2, var2 = delong_auc_variance(y_true, preds2)
    auc_diff = auc1 - auc2
    var_diff = var1 + var2  # Note: this assumes independence (simplification)
    z = np.abs(auc_diff) / np.sqrt(var_diff)
    p_value = 2 * (1 - norm.cdf(z))
    return p_value, auc1, auc2

# Compute AUCs via roc_auc_score (these are our reference values)
y_true = df_merged['target'].values
auc_external = roc_auc_score(y_true, df_merged['external_prob'].values)
auc_scratch  = roc_auc_score(y_true, df_merged['scratch_prob'].values)
auc_transfer = roc_auc_score(y_true, df_merged['transfer_prob'].values)

print("AUC External:", auc_external)
print("AUC Scratch:", auc_scratch)
print("AUC Transfer:", auc_transfer)

# Compare models using the corrected DeLong test:
p_ext_scratch, auc_ext, auc_scratch_calc = delong_roc_test(y_true, 
                                                          df_merged['external_prob'].values,
                                                          df_merged['scratch_prob'].values)
print("\nDeLong test External vs. Scratch:")
print(f"AUC External: {auc_ext:.3f}, AUC Scratch: {auc_scratch_calc:.3f}, p-value: {p_ext_scratch:.4f}")

p_ext_transfer, auc_ext, auc_transfer_calc = delong_roc_test(y_true, 
                                                            df_merged['external_prob'].values,
                                                            df_merged['transfer_prob'].values)
print("\nDeLong test External vs. Transfer:")
print(f"AUC External: {auc_ext:.3f}, AUC Transfer: {auc_transfer_calc:.3f}, p-value: {p_ext_transfer:.4f}")

p_scratch_transfer, auc_scratch_calc, auc_transfer_calc = delong_roc_test(y_true, 
                                                                        df_merged['scratch_prob'].values,
                                                                        df_merged['transfer_prob'].values)
print("\nDeLong test Scratch vs. Transfer:")
print(f"AUC Scratch: {auc_scratch_calc:.3f}, AUC Transfer: {auc_transfer_calc:.3f}, p-value: {p_scratch_transfer:.4f}")


AUC External: 0.9056140350877192
AUC Scratch: 0.8760818713450292
AUC Transfer: 0.9397076023391813

DeLong test External vs. Scratch:
AUC External: 0.906, AUC Scratch: 0.876, p-value: 0.4596

DeLong test External vs. Transfer:
AUC External: 0.906, AUC Transfer: 0.940, p-value: 0.3200

DeLong test Scratch vs. Transfer:
AUC Scratch: 0.876, AUC Transfer: 0.940, p-value: 0.0845


In summary, based on the DeLong test:

None of the differences are statistically significant.

Even the largest difference (between Scratch and Transfer) has a p-value of 0.0845, which suggests that while there may be a trend (with the transfer model performing better numerically), the evidence isn’t strong enough (at the 5% level) to confidently state that one model’s AUC is truly higher than the other’s.