File computing the baseline performance of centralized algorithms from: https://www.nature.com/articles/s41531-022-00288-w

In [None]:
from math import ceil

import pandas as pd

import numpy as np

import sys
import os
sys.path.append(os.path.abspath('..'))

In [None]:
from multi_modality_fl.utils.data_management import GlobalExperimentsConfiguration, write_json, read_json

# Configure the baseline experiment; its base path is
# <current working directory>/multi_modality_fl/experiments and the seed is fixed to 0.
current_experiment = GlobalExperimentsConfiguration(
    base_path=os.path.join(os.getcwd(), os.path.join('multi_modality_fl', 'experiments')),
    experiment_name='baselines',
    random_seed=0
)

# NOTE(review): dataset_folder is an absolute path on one developer's machine;
# it has to be edited before this cell can run anywhere else.
current_experiment.initialize_data_splits(
    dataset_folder='/Users/benjamindanek/Code/federated_learning_multi_modality_ancestry/data',
    dataset=GlobalExperimentsConfiguration.MULTIMODALITY,
    split_method=GlobalExperimentsConfiguration.SKLEARN
)

In [None]:
# utils
import time
def utils_time_fn(fun, *args, **kwargs):
    """Call ``fun(*args, **kwargs)`` and measure how long the call takes.

    Returns:
        A ``(elapsed_seconds, result)`` tuple, where ``elapsed_seconds`` is the
        ``time.perf_counter`` difference around the call and ``result`` is
        whatever ``fun`` returned.
    """
    began = time.perf_counter()
    outcome = fun(*args, **kwargs)
    elapsed = time.perf_counter() - began
    return elapsed, outcome

def utils_sk_metrics_to_str(metrics_dict):
    """Render a metrics dict as one ``key: value`` entry per line.

    The ``"algorithm"`` entry is printed verbatim, ``"runtime_s"`` with three
    decimals followed by ``seconds`` and an extra newline, and every other
    entry with four decimals. Entries appear in the dict's iteration order.
    """
    # Keys that need a non-default layout; everything else is a 4-decimal float.
    special_templates = {
        "algorithm": "{}: {}",
        "runtime_s": "{}: {:0.3f} seconds\n",
    }
    default_template = "{}: {:0.4f}"

    return "\n".join(
        special_templates.get(name, default_template).format(name, amount)
        for name, amount in metrics_dict.items()
    )

In [None]:
# extracted GenoML
import xgboost
from sklearn import discriminant_analysis, ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import neighbors
from sklearn import neural_network
from sklearn import svm

# One instance of every classifier that takes part in the baseline comparison.
candidate_algorithms = [
    linear_model.LogisticRegression(solver='lbfgs'),
    ensemble.RandomForestClassifier(n_estimators=100),
    ensemble.AdaBoostClassifier(),
    ensemble.GradientBoostingClassifier(),
    linear_model.SGDClassifier(loss='modified_huber'),
    svm.SVC(probability=True, gamma='scale'),
    neural_network.MLPClassifier(),
    neighbors.KNeighborsClassifier(),
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ensemble.BaggingClassifier(),
    xgboost.XGBClassifier(),
    xgboost.XGBRFClassifier(),
]

# Registry keyed by estimator class name; the names are echoed one per line.
algorithms = {type(estimator).__name__: estimator for estimator in candidate_algorithms}
print(*algorithms, sep="\n")

def evaluate(competing_metrics, algorithm_name, algorithm, x, y):
    """Score an already-fitted algorithm on ``(x, y)``.

    ``algorithm.predict(x)`` is timed with ``utils_time_fn`` and every function
    in ``competing_metrics`` is called as ``metric(y, predictions)``.

    Returns:
        ``(row, predictions)`` where ``row`` is
        ``[algorithm_name, predict_runtime_seconds, *metric_values]``.
    """
    elapsed, predictions = utils_time_fn(algorithm.predict, x)

    scores = []
    for score_fn in competing_metrics:
        scores.append(score_fn(y, predictions))

    return [algorithm_name, elapsed, *scores], predictions

def process_results(column_names, results):
    """Tabulate per-algorithm result rows and pick the best-scoring algorithm.

    "Best" is the row with the highest ``explained_variance_score``. The
    estimator object is looked up by name in the module-level ``algorithms``
    registry.

    Returns:
        A dict with the keys ``log_table`` (DataFrame of all rows), ``best_id``,
        ``best_algorithm_name``, ``best_algorithm`` and
        ``best_algorithm_metrics`` (the winning row as a dict).
    """
    log_table = pd.DataFrame(data=results, columns=column_names)

    # The frame has a default integer index, so the label returned by idxmax
    # is also a valid position for iloc.
    winner_id = log_table.explained_variance_score.idxmax()
    winner_row = log_table.iloc[winner_id]
    winner_name = winner_row.algorithm

    return {
        'log_table': log_table,
        'best_id': winner_id,
        'best_algorithm_name': winner_name,
        'best_algorithm': algorithms[winner_name],
        'best_algorithm_metrics': winner_row.to_dict(),
    }

def compete(fold_idx, algorithms, x_train, y_train, x_test, y_test, x_addit_test=None, y_addit_test=None):
    """Fit every algorithm on the training data and score it on the test set(s).

    Each estimator is fitted in place on ``(x_train, y_train)``, evaluated on
    ``(x_test, y_test)`` and, when an additional test set is supplied, on
    ``(x_addit_test, y_addit_test)``. The predictions are also recorded through
    ``current_experiment.add_to_kfold_table`` under the validation names
    ``'internal test'`` and ``'external test'``.

    Args:
        fold_idx: index of the current fold; accepted for call compatibility
            but not used inside this function.
        algorithms: mapping of algorithm name to an estimator exposing
            ``fit`` and ``predict``.
        x_train, y_train: training features and labels.
        x_test, y_test: internal test features and labels.
        x_addit_test, y_addit_test: optional additional (external) test
            features and labels. If either is ``None`` the external
            evaluation is skipped.

    Returns:
        ``(res, results_val)``: the ``process_results`` summaries for the
        internal and the additional test set. ``results_val`` is ``None`` when
        no additional test set was supplied.
    """
    competing_metrics = [metrics.explained_variance_score, metrics.mean_squared_error,
                         metrics.median_absolute_error, metrics.r2_score, metrics.roc_auc_score,
                         metrics.average_precision_score]

    column_names = ["algorithm", "runtime_s"] + [metric.__name__ for metric in competing_metrics]

    # The additional test set is optional (both parameters default to None);
    # evaluating on None would fail inside predict / the metric functions.
    has_addit_test = x_addit_test is not None and y_addit_test is not None

    results = []
    results_val = []
    for algorithm_name, algorithm in algorithms.items():

        algorithm.fit(x_train, y_train)

        row, y_pred = evaluate(competing_metrics, algorithm_name, algorithm, x_test, y_test)
        results.append(row)
        current_experiment.add_to_kfold_table(algorithm_name=algorithm_name, num_clients=0, split_method='central', val_name='internal test', y_true=y_test, y_pred=y_pred)

        if not has_addit_test:
            continue

        row, y_addit_pred = evaluate(competing_metrics, algorithm_name, algorithm, x_addit_test, y_addit_test)
        results_val.append(row)
        current_experiment.add_to_kfold_table(algorithm_name=algorithm_name, num_clients=0, split_method='central', val_name='external test', y_true=y_addit_test, y_pred=y_addit_pred)

    res = process_results(column_names, results)
    # Without external rows there is nothing to rank, so report None instead.
    results_val = process_results(column_names, results_val) if has_addit_test else None

    return res, results_val

def get_split(dataset, splits):
    """Randomly partition the rows of ``dataset`` into non-overlapping subsets.

    The row positions are shuffled with ``np.random.shuffle`` and then cut
    into consecutive chunks, one per entry of ``splits``.

    Args:
        dataset: pandas object supporting ``len()``, ``.iloc`` and
            ``.reset_index``.
        splits: iterable of fractions of ``len(dataset)``. Each subset gets
            ``ceil(len(dataset) * fraction)`` rows, truncated once the rows
            run out (so trailing subsets may be shorter or empty).

    Returns:
        A list with one subset per fraction, each re-indexed from 0.
    """
    total = len(dataset)
    indeces = list(range(total))
    np.random.shuffle(indeces)

    subsets = []
    offset = 0  # running start position; must accumulate across ALL earlier subsets
    for portion in splits:
        stop = min(offset + ceil(total * portion), total)
        # Select through the shuffled positions so the split is actually random.
        subset = dataset.iloc[indeces[offset:stop]].reset_index(drop=True)
        subsets.append(subset)
        offset = stop

    return subsets

In [None]:
import collections
# NOTE(review): kfold_results is created here but never written to or read in
# the code visible in this notebook.
kfold_results = collections.defaultdict(lambda: collections.defaultdict(list))
# Run the full algorithm competition once per fold of the experiment.
for fold_idx in range(current_experiment.K):
    current_experiment.set_fold(fold_idx=fold_idx)

    # get processed datasets
    train = current_experiment.training_dataset
    internal, external = current_experiment.get_combined_test_dataset()
    # NOTE(review): only element [1] of each returned pair is used; what
    # element [0] holds is not visible here - confirm against the helper.
    test = internal[1]
    addit_test = external[1]
    
    # separate predictors
    x_train, y_train = current_experiment.as_features_labels(train, current_experiment.LABEL_COL)
    x_test, y_test = current_experiment.as_features_labels(test, current_experiment.LABEL_COL)
    x_external, y_external = current_experiment.as_features_labels(addit_test, current_experiment.LABEL_COL)
    

    # result / result_val are overwritten on every fold; the per-fold
    # predictions are recorded inside compete via add_to_kfold_table.
    result, result_val = compete(fold_idx, algorithms, x_train, y_train, x_test, y_test, x_external, y_external)

In [None]:
# Show how many rows of the k-fold table were recorded for each algorithm name.
display(current_experiment.kfold_table['algorithm_name'].value_counts())

In [None]:
# Mean of the recorded columns over the 'internal test' rows, grouped by the
# experiment's metadata columns.
internal_only = current_experiment.kfold_table[current_experiment.kfold_table['val_name'] == 'internal test'].groupby(current_experiment.metadata_column_names)
display(internal_only.mean())
# Same aggregation for the 'external test' rows.
exteral_only = current_experiment.kfold_table[current_experiment.kfold_table['val_name'] == 'external test'].groupby(current_experiment.metadata_column_names)
display(exteral_only.mean())
# NOTE(review): the output directory is an absolute path on one developer's
# machine; adjust it before running elsewhere.
current_experiment.write_results('/Users/benjamindanek/Code/federated_learning_multi_modality_ancestry/multi_modality_fl/results/dataframes')

In [None]:
# Reload the persisted baseline results and show the grouped means of the
# external-test rows. The label must be 'external test', the val_name used
# when the rows are recorded; the previous 'external_val' never matched
# anything, so the selection was always empty.
# NOTE(review): the boolean mask is taken from the in-memory kfold_table, so it
# relies on the CSV rows lining up with that table by index - confirm.
(
    pd.read_csv('/Users/benjamindanek/Code/federated_learning_multi_modality_ancestry/multi_modality_fl/results/dataframes/baselines.csv')
    [current_experiment.kfold_table['val_name'] == 'external test']
    .groupby(current_experiment.metadata_column_names)
    .mean()
)