In [1]:
import collections
import pandas as pd
import os
from sklearn import metrics
import numpy as np

# REPLACE WITH YOUR OWN RESULTS DIRECTORY
# the source of the delongs test
RESULTS_DIRECTORY = '/home/bdanek2/multi-omics-pdd-FL-study/federated_learning_multi_modality_ancestry/multi_modality_fl/results/experimental_results_dir/repro_test'

def process_predictions(results_df: pd.DataFrame):
    groups = results_df.groupby(['val_name', 'algorithm_name', 'split_method', 'num_clients', 'fold_idx'], group_keys=True)
    reconstructed_results = []
    for (val_name, algorithm_name, split_method, num_clients, fold_idx), group in groups:
        if num_clients != 0 and num_clients != 2:
            continue

        row = {
            'algorithm_name': algorithm_name,
            'val_name': val_name,
            'fold_idx': fold_idx,
            'y_true': list(group['y_true']),
            'y_pred': list(group['y_pred'])
        }

        reconstructed_results.append(row)

    return reconstructed_results

internal_fed_reconstructed_results = []
internal_cent_reconstructed_results = []

external_fed_reconstructed_results = []
external_cent_reconstructed_results = []

best_fold_internal_cent = collections.defaultdict(dict)
best_fold_internal_fed = collections.defaultdict(dict)

best_fold_external_cent = collections.defaultdict(dict)
best_fold_external_fed = collections.defaultdict(dict)

all_results_internal = collections.defaultdict(pd.DataFrame)
all_results_external = collections.defaultdict(pd.DataFrame)

for dirpath, dirname, filenames in list(os.walk(RESULTS_DIRECTORY)):

    for filename in filenames:
        if ".csv" not in filename:
            continue
        p = os.path.join(dirpath, filename)
        df = pd.read_csv(p)
        
        # all_dfs[filenames] = df
        for (val_name, algorithm_name, split_method, num_clients), group in df.groupby(['val_name', 'algorithm_name', 'split_method', 'num_clients'], group_keys=True):
            if num_clients != 0 and num_clients != 2:
                continue

            auc = metrics.roc_auc_score(group['y_true'], group['y_pred'])
            
            # we need the fold idx of the best performing algorithm in the task
            # the fold idx can be different for internal and external test sets
            if val_name == 'internal test':
                if 'fed' in algorithm_name.lower():
                    if (algorithm_name not in best_fold_internal_fed or auc > best_fold_internal_fed[algorithm_name]['score']):
                        best_fold_internal_fed[algorithm_name]['score'] = auc
                        best_fold_internal_fed[algorithm_name]['fold'] = group['fold_idx'].iloc[0]
                        best_fold_internal_fed[algorithm_name]['algorithm_name'] = algorithm_name

                    internal_fed_reconstructed_results.append(group)

                else:
                    if (algorithm_name not in best_fold_internal_cent or auc > best_fold_internal_cent[algorithm_name]['score']):
                        best_fold_internal_cent[algorithm_name]['score'] = auc
                        best_fold_internal_cent[algorithm_name]['fold'] = group['fold_idx'].iloc[0]
                        best_fold_internal_cent[algorithm_name]['algorithm_name'] = algorithm_name
                    internal_cent_reconstructed_results.append(group)

                # aggregate results
                all_results_internal[group['fold_idx'].iloc[0]] = pd.concat([all_results_internal[group['fold_idx'].iloc[0]], group])

            elif val_name == 'external test':
                if 'fed' in algorithm_name.lower():
                    if (algorithm_name not in best_fold_external_fed or auc > best_fold_external_fed[algorithm_name]['score']):
                        best_fold_external_fed[algorithm_name]['score'] = auc
                        best_fold_external_fed[algorithm_name]['fold'] = group['fold_idx'].iloc[0]
                        best_fold_external_fed[algorithm_name]['algorithm_name'] = algorithm_name
                    
                    external_fed_reconstructed_results.append(group)
                else:
                    if (algorithm_name not in best_fold_external_cent or auc > best_fold_external_cent[algorithm_name]['score']):
                        best_fold_external_cent[algorithm_name]['score'] = auc
                        best_fold_external_cent[algorithm_name]['fold'] = group['fold_idx'].iloc[0]
                        best_fold_external_cent[algorithm_name]['algorithm_name'] = algorithm_name

                    external_cent_reconstructed_results.append(group)

                # aggregate results
                all_results_external[group['fold_idx'].iloc[0]] = pd.concat([all_results_external[group['fold_idx'].iloc[0]], group])

            else:
                raise ValueError('val_name not recognized')
            

display(best_fold_internal_cent)
display(best_fold_internal_fed)
display(best_fold_external_cent)
display(best_fold_external_fed)
all_results_external[best_fold_external_cent['MLPClassifier_0.1']['fold']]


defaultdict(dict,
            {'AdaBoostClassifier': {'score': 0.9114688128772636,
              'fold': 1,
              'algorithm_name': 'AdaBoostClassifier'},
             'BaggingClassifier': {'score': 0.9180080482897385,
              'fold': 0,
              'algorithm_name': 'BaggingClassifier'},
             'GradientBoostingClassifier': {'score': 0.9255533199195171,
              'fold': 4,
              'algorithm_name': 'GradientBoostingClassifier'},
             'KNeighborsClassifier': {'score': 0.7660965794768612,
              'fold': 4,
              'algorithm_name': 'KNeighborsClassifier'},
             'LinearDiscriminantAnalysis': {'score': 0.8209255533199196,
              'fold': 4,
              'algorithm_name': 'LinearDiscriminantAnalysis'},
             'LogisticRegression': {'score': 0.9491951710261569,
              'fold': 4,
              'algorithm_name': 'LogisticRegression'},
             'MLPClassifier_0.1': {'score': 0.8797786720321932,
              

defaultdict(dict,
            {'FedAvg LRClassifier': {'score': 0.9456740442655935,
              'fold': 4,
              'algorithm_name': 'FedAvg LRClassifier'},
             'FedAvg MLPClassifier': {'score': 0.9270623742454729,
              'fold': 4,
              'algorithm_name': 'FedAvg MLPClassifier'},
             'FedAvg SGDClassifier': {'score': 0.9502012072434608,
              'fold': 4,
              'algorithm_name': 'FedAvg SGDClassifier'},
             'FedAvg XGBRFClassifier': {'score': 0.8853118712273641,
              'fold': 0,
              'algorithm_name': 'FedAvg XGBRFClassifier'},
             'FedProx μ = 0 LRClassifier': {'score': 0.8993963782696177,
              'fold': 1,
              'algorithm_name': 'FedProx μ = 0 LRClassifier'},
             'FedProx μ = 0 MLPClassifier': {'score': 0.9124748490945673,
              'fold': 4,
              'algorithm_name': 'FedProx μ = 0 MLPClassifier'},
             'FedProx μ = 2 LRClassifier': {'score': 0.93460

defaultdict(dict,
            {'AdaBoostClassifier': {'score': 0.8672022750027812,
              'fold': 1,
              'algorithm_name': 'AdaBoostClassifier'},
             'BaggingClassifier': {'score': 0.8235986344420959,
              'fold': 0,
              'algorithm_name': 'BaggingClassifier'},
             'GradientBoostingClassifier': {'score': 0.8750451941261543,
              'fold': 0,
              'algorithm_name': 'GradientBoostingClassifier'},
             'KNeighborsClassifier': {'score': 0.6087440204694627,
              'fold': 5,
              'algorithm_name': 'KNeighborsClassifier'},
             'LinearDiscriminantAnalysis': {'score': 0.7186457058627211,
              'fold': 0,
              'algorithm_name': 'LinearDiscriminantAnalysis'},
             'LogisticRegression': {'score': 0.7929135610190233,
              'fold': 5,
              'algorithm_name': 'LogisticRegression'},
             'MLPClassifier_0.1': {'score': 0.6937767688285683,
              

defaultdict(dict,
            {'FedAvg LRClassifier': {'score': 0.7735148514851486,
              'fold': 4,
              'algorithm_name': 'FedAvg LRClassifier'},
             'FedAvg MLPClassifier': {'score': 0.7326072143731227,
              'fold': 5,
              'algorithm_name': 'FedAvg MLPClassifier'},
             'FedAvg SGDClassifier': {'score': 0.7900176604739125,
              'fold': 2,
              'algorithm_name': 'FedAvg SGDClassifier'},
             'FedAvg XGBRFClassifier': {'score': 0.8150187034152854,
              'fold': 5,
              'algorithm_name': 'FedAvg XGBRFClassifier'},
             'FedProx μ = 0 LRClassifier': {'score': 0.7775718934252975,
              'fold': 1,
              'algorithm_name': 'FedProx μ = 0 LRClassifier'},
             'FedProx μ = 0 MLPClassifier': {'score': 0.7270448603849149,
              'fold': 5,
              'algorithm_name': 'FedProx μ = 0 MLPClassifier'},
             'FedProx μ = 2 LRClassifier': {'score': 0.77126

Unnamed: 0,fold_idx,algorithm_name,num_clients,split_method,stratified,val_name,num_samples,y_true,y_pred,num_rounds,num_local_rounds,client_lr,proximal_mu
2232,2,AdaBoostClassifier,0,central,False,external test,1116,1,0.555478,-1,-1,-1.0,-1.0
2233,2,AdaBoostClassifier,0,central,False,external test,1116,0,0.522715,-1,-1,-1.0,-1.0
2234,2,AdaBoostClassifier,0,central,False,external test,1116,1,0.591123,-1,-1,-1.0,-1.0
2235,2,AdaBoostClassifier,0,central,False,external test,1116,1,0.490152,-1,-1,-1.0,-1.0
2236,2,AdaBoostClassifier,0,central,False,external test,1116,1,0.577046,-1,-1,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14503,2,XGBRFClassifier,0,central,False,external test,1116,1,0.844912,-1,-1,-1.0,-1.0
14504,2,XGBRFClassifier,0,central,False,external test,1116,1,0.842976,-1,-1,-1.0,-1.0
14505,2,XGBRFClassifier,0,central,False,external test,1116,1,0.854334,-1,-1,-1.0,-1.0
14506,2,XGBRFClassifier,0,central,False,external test,1116,1,0.313185,-1,-1,-1.0,-1.0


In [2]:
"""
Delong's test implementation copied from: https://github.com/yandexdataschool/roc_comparison/blob/master/compare_auc_delong_xu.py
"""

import pandas as pd
import numpy as np
import scipy.stats

# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=float)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log(10) of p-values.
    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    l = np.array([[1, -1]])
    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
    return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)


def compute_ground_truth_statistics(ground_truth):
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    return order, label_1_count


def delong_roc_variance(ground_truth, predictions):
    """
    Computes ROC AUC variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov


def delong_roc_test(ground_truth, predictions_one, predictions_two):
    """
    Computes log(p-value) for hypothesis that two ROC AUCs are different
    Args:
       ground_truth: np.array of 0 and 1
       predictions_one: predictions of the first model,
          np.array of floats of the probability of being class 1
       predictions_two: predictions of the second model,
          np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    return calc_pvalue(aucs, delongcov)

In [5]:
# internal_cent = all_results_internal[best_fold_internal_cent['fold']]
# internal_fed = all_results_internal[best_fold_internal_fed['fold']]
# cent_dataset = internal_cent
# fed_dataset = internal_fed

def sort_name(name):
    if "μ" in name:
        loc = name.find("Classifier")
        return "avg" if "Avg" in name else "prox" + name[loc-3:]
    else:
        return name
        
def statistical_signifcance_table(test_subset, best_folds_cent, best_folds_fed, save_path):
    """
    Compute a table of statistical significance between central and federated algorithms using Delong's test
    """

    cent_algs_to_evaluate = [
        'LogisticRegression',
        'SGDClassifier',
        'MLPClassifier_0.1',
        'XGBRFClassifier_10'
    ]
    
    fed_algs_to_evaluate = [
        'FedAvg SGDClassifier',
        'FedAvg XGBRFClassifier',
        'FedAvg LRClassifier',
        'FedAvg MLPClassifier',
        'FedProx μ = 0 LRClassifier',
        'FedProx μ = 0 MLPClassifier',
        'FedProx μ = 2 LRClassifier',
        'FedProx μ = 2 MLPClassifier'
    ]
    
    results = []
    for central_alg in sorted(cent_algs_to_evaluate, key=sort_name):

        row = { 'central_alg': central_alg }
        for fed_alg in sorted(fed_algs_to_evaluate, key=sort_name):

            
            # get the optimal model for this algorithm
            cent_dataset = test_subset[best_folds_cent[central_alg]['fold']]
            fed_dataset = test_subset[best_folds_fed[fed_alg]['fold']]

            # use the optimal model as the basis for the delong's evaluation
            cent_results = cent_dataset[cent_dataset['algorithm_name'] == central_alg]
            fed_results = fed_dataset[fed_dataset['algorithm_name'] == fed_alg]

            cent = cent_results['y_pred'].apply(lambda x: np.array(x))
            fed = fed_results['y_pred'].apply(lambda x: np.array(x))

            ground_truth = cent_results['y_true']
            
            # display(cent_results['y_true'].to_list())
            # display(fed_results['y_true'].to_list())
            print(cent_results['y_true'].astype(int).to_list())
            print(fed_results['y_true'].astype(int).to_list())

            assert cent_results['y_true'].astype(int).to_list() == fed_results['y_true'].astype(int).to_list()

            p_value_log10 = delong_roc_test(cent_results['y_true'], cent_results['y_pred'], fed_results['y_pred'])
            p_value = 10 ** p_value_log10[0][0] # unpack and convert to normal scale
            
            cent_auc = metrics.roc_auc_score(cent_results['y_true'], cent_results['y_pred'])
            fed_auc = metrics.roc_auc_score(fed_results['y_true'], fed_results['y_pred'])

            cell_str = 'greater' if cent_auc > fed_auc else 'lesser'
            
            if (p_value < 0.05):
                print(central_alg, fed_alg, p_value)
                cell_str += '*'
            else:
                print(central_alg, fed_alg, p_value, 'not significant')

            row[fed_alg] = cell_str
        results.append(row)

    table = pd.DataFrame.from_records(results)
    table.to_csv(save_path)
    return table

In [6]:
# external_cent = all_results_external[best_fold_external_cent['fold']]
# external_fed = all_results_external[best_fold_external_fed['fold']]
statistical_signifcance_table(all_results_external, best_fold_external_cent, best_fold_external_fed, '/Users/benjamindanek/Downloads/cell_patterns_fl_misc/revision/external_test_stat_sig.csv')

[1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 

AssertionError: 