In [None]:
!pip install --quiet --upgrade tensorflow-federated

In [None]:
# Load the `tensorboard` IPython extension into this kernel.
%load_ext tensorboard

In [None]:
import collections

import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
from tensorflow_federated.python.simulation.datasets import from_tensor_slices_client_data

# Seed NumPy's global random number generator.
np.random.seed(0)

# Build a trivial federated computation and invoke it right away
# (presumably a smoke test that the TFF installation works).
tff.federated_computation(lambda: 'Hello, World!')()

In [None]:
from functools import lru_cache
import json
import os
import random
from typing import Any, Callable, Dict, List, Tuple, Union
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import average_precision_score
from sklearn.model_selection import KFold, train_test_split
import torch
import collections
import h5py

# utility functions
def build_full_path(base_path, subset=None):
    """Join `base_path` and `subset` into a single path.

    Args:
        base_path: leading path component.
        subset: trailing path component; required despite the `None` default.

    Returns:
        `os.path.join(base_path, subset)`.

    Raises:
        AssertionError: if `subset` is None.
    """
    # identity test: `!= None` would defer to a custom `__ne__` of the argument
    assert subset is not None, "Must provide subset"
    return os.path.join(base_path, subset)

def write_json(data: Any, path: str):
    """Serialize `data` as JSON (indent of 4) into the file at `path`, overwriting it."""
    with open(path, "w") as out_file:
        out_file.write(json.dumps(data, indent=4))

def read_json(path: str):
    """Parse and return the JSON document stored at `path`.

    Raises:
        ValueError: if `path` is not an existing file.
    """
    if os.path.isfile(path):
        with open(path, "r") as in_file:
            return json.loads(in_file.read())
    raise ValueError(f"{path} does not exist!")

def get_h5_data_keys(file_path):
    """Return the top-level keys of the HDF5 file at `file_path` as a list.

    The file is opened read-only inside a context manager so the handle is
    closed even when listing the keys raises.
    """
    with h5py.File(file_path, 'r') as h5_file:
        return list(h5_file.keys())

# used for safe displaying of data in ipynb cells
# used in exploration and baselines file
def drop_id(dataset: pd.DataFrame):
    """Return a copy of `dataset` without its 'ID' column (the input is left untouched)."""
    without_id = dataset.drop(labels=['ID'], axis='columns')
    return without_id

class GlobalExperimentsConfiguration:
    """Configuration, data splitting and result bookkeeping for one experiment run.

    An instance seeds the random number generators, creates the experiment
    output folders, loads the internal and external datasets, builds
    stratified folds / client subsets and collects evaluation metrics.
    """

    # number of federated communication rounds (the notebook copies it into NUM_ROUNDS)
    num_rounds = 20

    # default seed, used when no `random_seed` is passed to the constructor
    RANDOM_SEED = 1

    # number of folds built by `_generate_stratified_k_folds`
    K = 6

    # values accepted by `create_experiment(split_method=...)`
    STANDARD = 'standard'
    SKLEARN = 'sklearn'

    MULTIMODALITY = 'multi_modality'
    MULTIMODALITY_LABEL_COL = 'PHENO'
    # dataset files, relative to the `dataset_folder` given to `create_experiment`
    MULTIMODALITY_DATASET_FILES = {
        'clinicodemogrpahic_ppmi': 'Clinicodemographic/PPMI_Only_clinical.dataForML.h5',
        'transcriptomics_ppmi': 'TRANSCRIPTOMICS_p1E2/PPMI_Only_transcriptomics_only-p1E2.dataForML.h5',
        'genetics_ppmi': 'GENETICS_p1E5/PPMI_Only_genetics_p1E5_with_PRS-MAF05.dataForML.h5',

        'combined_ppmi': 'Combined_G1E5_O1E2/PPMI-genetic_p1E5_omic_p1E2.dataForML.h5',

        'validation_pdbp': 'Validation/validate-PDBP-genetic_p1E5_omic_p1E2.dataForML.h5'
    }

    MULTIANCESTRY = 'multi_ancestry'
    MULTIANCESTRY_DATASET_FOLDER = ''
    MULTIANCESTRY_DATASET_FILES = {}

    # used for outputting tables
    metadata_column_names = ['algorithm_name', 'num_clients', 'split_method', 'val_name']

    def __init__(self, base_path: str, experiment_name: str, random_seed=None):
        """Seed the RNGs and create the output folders of the experiment.

        Args:
            base_path: folder under which the experiment folder is created.
            experiment_name: name of the experiment folder (also used for the results csv).
            random_seed: seed for numpy, `random` and torch; when None the class
                default `RANDOM_SEED` is used.
        """
        # `is not None` (instead of truthiness) so that an explicit seed of 0 is honoured
        if random_seed is not None:
            self.RANDOM_SEED = random_seed

        # seed every library with the seed that is actually in effect, so the value
        # stored on the instance and the global RNG state always agree
        np.random.seed(self.RANDOM_SEED)
        random.seed(self.RANDOM_SEED)
        torch.manual_seed(self.RANDOM_SEED)

        self.experiment_results = {}
        self.experiment_name = experiment_name

        # the path which will contain all intermediate experiment files
        self.experiment_path = os.path.join(base_path, experiment_name)
        os.makedirs(self.experiment_path, exist_ok=True)

        self.plots_path = os.path.join(self.experiment_path, 'plots')
        os.makedirs(self.plots_path, exist_ok=True)

        # avoid duplicating external_val dataset
        # NOTE(review): nothing in this class ever sets this flag to True
        self.external_val_recorded = False

    def _get_raw_dataset(self, path: str, drop: str):
        """Load the first key of the HDF5 file at `path` and drop the column `drop`.

        Args:
            path: path to a `.h5` file.
            drop: name of the column to remove from the loaded frame.

        Returns:
            The loaded DataFrame without the `drop` column.
        """
        assert path.endswith('.h5'), 'Dataset path does not have a .h5 extension.'
        keys = get_h5_data_keys(path)
        loaded_dataset = pd.read_hdf(path, key=keys[0])
        # honour the `drop` argument instead of a hard-coded column name
        return loaded_dataset.drop(columns=[drop])

    def _standardize_for_validation(self, dataset_1: pd.DataFrame, dataset_2: pd.DataFrame):
        """Restrict both datasets to the columns they have in common.

        The shared columns keep the order they have in `dataset_1`, so the
        resulting column layout does not depend on set iteration order.

        Returns:
            (dataset_1, dataset_2) restricted to the shared columns, same column order.
        """
        columns_2 = set(dataset_2.columns)
        shared_dataset_columns = [col for col in dataset_1.columns if col in columns_2]
        print("shared columns", shared_dataset_columns)

        # symmetric difference: columns present in exactly one of the datasets
        not_included = sorted(set(dataset_1.columns) ^ columns_2, key=str)
        print("non-shared columns", len(not_included), not_included)

        print(f"shape BEFORE standardization \n{dataset_1.shape} \n {dataset_2.shape}")
        dataset_1 = dataset_1[shared_dataset_columns]
        dataset_2 = dataset_2[shared_dataset_columns]
        print(f"shape AFTER standardization \n{dataset_1.shape} \n {dataset_2.shape}")

        return dataset_1, dataset_2

    def as_features_labels(self, dataset: pd.DataFrame, label_col: str):
        """Split `dataset` into a feature matrix and a flat label vector.

        Args:
            dataset: frame holding the features and the label column.
            label_col: name of the label column; every other column is a feature.

        Returns:
            (features, labels) as numpy arrays; `labels` is reshaped to one dimension.
        """
        features = dataset.drop(columns=[label_col]).copy().to_numpy()
        labels = dataset[label_col].copy().to_numpy().reshape((-1, ))
        return features, labels

    def create_experiment(self, dataset_folder: str, dataset: str, split_method: str = STANDARD):
        """Load the datasets of the experiment and build the k folds.

        Args:
            dataset_folder: folder containing the dataset files.
            dataset: `MULTIMODALITY` (`MULTIANCESTRY` is not implemented yet).
            split_method: `STANDARD` or `SKLEARN`; only validated here, not used otherwise.

        Returns:
            self, to allow chaining.

        Raises:
            AssertionError: on an unsupported `split_method` or `dataset`.
        """
        assert split_method == self.STANDARD or split_method == self.SKLEARN, f'Unsupported split_method provided. Recieved {split_method}'

        if dataset == self.MULTIMODALITY:

            self.LABEL_COL = self.MULTIMODALITY_LABEL_COL

            # 0. get the dataset sources
            self.INTERNAL_DATASET = os.path.join(dataset_folder, self.MULTIMODALITY_DATASET_FILES['combined_ppmi'])

            # a secondary dataset only containing validation data for "external validation"
            # (on a dataset from a different distribution than the internal dataset)
            self.EXTERNAL_DATASET = os.path.join(dataset_folder, self.MULTIMODALITY_DATASET_FILES['validation_pdbp'])

            # 1. load the dataset from raw file format
            full_internal_dataset = self._get_raw_dataset(self.INTERNAL_DATASET, drop='ID')
            full_external_dataset = self._get_raw_dataset(self.EXTERNAL_DATASET, drop='ID')
            print("internal: ", full_internal_dataset.shape)
            print("external: ", full_external_dataset.shape)

            # 2. normalize the feature space of the datasets
            # Use the subset of features which are shared between the internal and external dataset
            # ppmi has 675 columns, but the combined pdbp dataset has 715. Drop the 40 extra columns from pdbp
            self.full_internal_dataset, self.full_external_dataset = self._standardize_for_validation(full_internal_dataset, full_external_dataset)
            # `DataFrame.info()` prints by itself and returns None, so it is not wrapped in print()
            print("full internal")
            self.full_internal_dataset.info()
            print("full external")
            self.full_external_dataset.info()

            # 3. compute k folds for the internal dataset
            self._generate_stratified_k_folds(self.full_internal_dataset)
            # explicit random_state: the shuffle no longer depends on how much of the
            # global numpy RNG stream has been consumed before this call
            self.full_external_dataset = self.full_external_dataset.sample(
                frac=1, random_state=self.RANDOM_SEED
            ).reset_index(drop=True)

        elif dataset == self.MULTIANCESTRY:
            # raised explicitly (same exception type as before) so it survives `python -O`
            raise AssertionError("multi ancestry not implemented yet")

        else:
            raise AssertionError(f"Unsupported dataset type provided; received {dataset}")

        return self

    def _generate_stratified_k_folds(self, df: pd.DataFrame):
        """Generate `K` stratified folds of `df` and store them in `self.k_fold_indeces`.

        Each label group is cut into `K` consecutive slices of equal length (the last
        slice takes the remainder); fold `i` is the concatenation of the i-th slice of
        every group.

        Raises:
            AssertionError: if two folds share an index value or the label ratio
                differs too much between folds.
        """
        label_col = self.MULTIMODALITY_LABEL_COL
        fold_parts: Dict[int, List[pd.DataFrame]] = {fold: [] for fold in range(self.K)}

        # shuffle the dataframe (should be the only time in the whole experiment we do this.)
        df = df.sample(frac=1, replace=False, random_state=self.RANDOM_SEED)

        for _, group in df.groupby(label_col):
            fold_len = len(group) // self.K
            start = 0
            for fold in range(self.K):
                end = start + fold_len if fold != self.K - 1 else len(group)
                fold_parts[fold].append(group.iloc[start:end])
                start = end

        k_fold_indeces: Dict[int, pd.DataFrame] = {
            fold: pd.concat(parts) for fold, parts in fold_parts.items()
        }

        # sanity check, since this is such a crucial part of the experimental design
        index_sets = {fold: set(data.index) for fold, data in k_fold_indeces.items()}
        fold_ids = list(index_sets)
        for position, i in enumerate(fold_ids):
            for j in fold_ids[position + 1:]:
                assert index_sets[i].isdisjoint(index_sets[j]), "folds have overlapping indeces"

        # all partitions must have approximately similar stratification
        stratifications_across_folds = []
        for fold_values in k_fold_indeces.values():
            counts = fold_values[label_col].value_counts()
            stratifications_across_folds.append(counts.loc[0] / counts.loc[1])
        assert np.std(stratifications_across_folds) < 0.03, f"folds do not have balanced startification: {stratifications_across_folds}; std: {np.std(stratifications_across_folds)}"

        self.k_fold_indeces = k_fold_indeces

    def set_fold(self, fold_idx: int):
        """Use the fold `fold_idx` as the holdout dataset and all other folds for training.

        Args:
            fold_idx: the fold which is the holdout dataset

        Sets:
            self.internal_test_dataset: the holdout fold.
            self.training_dataset: the concatenation of the remaining folds.
        """
        training_folds = [
            fold for idx, fold in self.k_fold_indeces.items() if idx != fold_idx
        ]

        self.training_dataset = pd.concat(training_folds)
        self.internal_test_dataset = self.k_fold_indeces[fold_idx]

    def set_validation_dataset(self, ratios=(0.8, 0.2)):
        """Split the current training dataset by `ratios`.

        `self.training_dataset` becomes the first split and `self.validation_dataset`
        the second one. `ratios` must contain exactly two entries.
        """
        new_datasets = self.stratified_split(df=self.training_dataset, column=self.MULTIMODALITY_LABEL_COL, ratios=ratios)
        assert len(new_datasets) == 2, f"Validaiton splitting failed; expected 2 new datasets, got {len(new_datasets)}"

        self.training_dataset, self.validation_dataset = new_datasets[0], new_datasets[1]

    def get_combined_test_dataset(self):
        """Return [(name, dataset)] for the internal holdout fold and the external dataset, both re-indexed."""
        return [
            ("internal test", self.internal_test_dataset.reset_index(drop=True)),
            ("external test", self.full_external_dataset.reset_index(drop=True))
        ]

    def get_stratified_client_subsets(self, dataset: pd.DataFrame, num_clients: int, method: str):
        """Split `dataset` into `num_clients` stratified subsets sized according to `method`."""
        ratios = self.method_to_ratios(method=method, num_clients=num_clients)
        return self.stratified_split(df=dataset, column=self.MULTIMODALITY_LABEL_COL, ratios=ratios)

    def stratified_split(self, df: pd.DataFrame, column: str, ratios: List[float]) -> List[pd.DataFrame]:
        """Split `df` into `len(ratios)` subsets that each keep the label balance of `column`.

        Every label group is cut, in its existing row order, into consecutive slices
        whose lengths follow `ratios` (the last slice takes the remainder). Subset `i`
        is the concatenation of the i-th slice of every group. No shuffling happens
        here; the rows are shuffled once when the folds are generated.

        Args:
            df: dataset to split.
            column: label column to stratify on; its labels are looked up as 0 and 1.
            ratios: relative sizes of the subsets (any sized sequence, e.g. a numpy array).

        Returns:
            One DataFrame per ratio.

        Raises:
            AssertionError: on empty input or if the 0/1 label ratio differs too much
                between the subsets.
        """
        num_splits = len(ratios)
        assert num_splits > 0 and len(df) > 0, "Cannot split: empty ratios or empty dataset"

        subset_parts: List[List[pd.DataFrame]] = [[] for _ in range(num_splits)]
        for label, group in df.groupby(column):
            group_len = len(group)

            intervals = []
            start = 0
            for ri, ratio in enumerate(ratios):
                end = start + int(group_len * ratio) if ri != num_splits - 1 else group_len
                intervals.append((start, end))
                start = end

            print(label, intervals)

            for i, (lo, hi) in enumerate(intervals):
                subset_parts[i].append(group.iloc[lo:hi])

        flattened = [pd.concat(parts) for parts in subset_parts]

        # ratio of label 0 to label 1 inside every subset; measured on `column`
        value_proportions = []
        for subset in flattened:
            counts = subset[column].value_counts()
            value_proportions.append(counts.loc[0] / counts.loc[1])
        print(value_proportions)
        assert np.std(value_proportions) < 0.03, f"Value counts of stratified dataset inconsistnet. {value_proportions}"

        return flattened

    def method_to_ratios(self, method: str, num_clients: int):
        """Return the normalized relative dataset sizes of `num_clients` clients.

        Args:
            method: 'uniform' (1), 'linear' (n), 'polynomial' (n^2) or 'exponential' (e^n).
            num_clients: number of clients.

        Returns:
            numpy array of length `num_clients` that sums to 1.
        """
        assert method in ['uniform', 'linear', 'polynomial', 'exponential'], f'Unsupported method specified for client splits. Recieved {method}'

        if method == 'uniform': # 1
            ratio_vec = np.ones(num_clients)
        elif method == 'linear': # n
            ratio_vec = np.linspace(1, num_clients, num=num_clients)
        elif method == 'polynomial': # n^2
            ratio_vec = np.square(np.linspace(1, num_clients, num=num_clients))
        elif method == 'exponential': # e^n
            ratio_vec = np.exp(np.linspace(1, num_clients, num=num_clients))

        total = sum(ratio_vec)
        ratios = ratio_vec / total
        return ratios

    def nvflare_multi_site_split_json(
        self,
        data_source_path: List[str],
        validation_data_source_path: List[str],
        client_splits: List[Tuple[int]],
        site_naming_fn: Callable[..., str],
        site_config_naming_fn: Callable[..., str],
    ) -> Tuple[List[str], List[dict]]:
        """Build the per-client split configs for a single nvflare simulation job.

        Args:
            data_source_path: one data path per client.
            validation_data_source_path: one validation path per client.
            client_splits: one (start, end) pair per client.
            site_naming_fn: maps the client index to the site name used as "data_index" key.
            site_config_naming_fn: maps the client index to the config file name.

        Returns:
            (file names, json-serializable configs), index-aligned.
        """
        assert len(data_source_path) == len(client_splits), "Each client doesnt have its own data path."
        # fail early with a clear message instead of an IndexError inside the loop
        assert len(validation_data_source_path) >= len(client_splits), "Each client doesnt have its own validation data path."

        result_files, result_json = [], []
        for index, client_split in enumerate(client_splits):

            assert len(client_split) == 2, f'Malformed client split. Received len {len(client_split)}'
            start, end = client_split

            json_data = {
                "data_path": data_source_path[index],
                "data_index": {
                    site_naming_fn(index): {
                        "start": start,
                        "end": end
                    }
                },
                "valid_path": validation_data_source_path[index]
            }

            result_files.append(site_config_naming_fn(index))
            result_json.append(json_data)

        print("resulting files configured", result_files)
        return result_files, result_json

    def compute_metrics(self, y_true, y_pred):
        """Compute the reported classification metrics as a dict keyed by metric name.

        The same `y_pred` is passed both where sklearn expects predicted labels
        (accuracy, f-beta, matthews_corrcoef) and where it expects scores
        (roc_auc_score, average_precision_score, log_loss).
        """
        return {
            'accuracy': metrics.accuracy_score(y_true=y_true, y_pred=y_pred),
            'roc_auc_score': metrics.roc_auc_score(y_true=y_true, y_score=y_pred),
            'average_precision_score': metrics.average_precision_score(y_true=y_true, y_score=y_pred),
            'f0.5': metrics.fbeta_score(y_true=y_true, y_pred=y_pred, beta=0.5),
            'f1': metrics.fbeta_score(y_true=y_true, y_pred=y_pred, beta=1),
            'f2': metrics.fbeta_score(y_true=y_true, y_pred=y_pred, beta=2),
            'log_loss': metrics.log_loss(y_true=y_true, y_pred=y_pred),
            'matthews_corrcoef': metrics.matthews_corrcoef(y_true=y_true, y_pred=y_pred)
        }

    def add_val_result(self, fold_idx: int, algorithm_name: str, num_clients: str, split_method: str, name: str, y_true, y_pred):
        """Store the metrics of one evaluation in `self.experiment_results`.

        The result is keyed by (algorithm_name, fold_idx, split_method, num_clients, name).

        Raises:
            AssertionError: if a result with the same key was already logged.
        """
        key = (algorithm_name, fold_idx, split_method, num_clients, name)
        assert key not in self.experiment_results, f'This result has already been logged. Current results are {list(self.experiment_results.keys())}, received {key}'

        self.experiment_results[key] = {
            'validation_dataset_name': name,
            'metrics': self.compute_metrics(y_true, y_pred),
            'size': len(y_true),
        }

    def k_fold_results_to_stats(self):
        """Aggregate the logged `roc_auc_score` values over the folds.

        Returns:
            {dataset name: {"<algorithm>-<method>-<num_clients>-<name>": {'mean': ..., 'std': ...}}}
        """
        k_avgs = collections.defaultdict(list)
        for key, val in self.experiment_results.items():
            algorithm_name, _, method, num_clients, name = key
            newKey = (f"{algorithm_name}-{method}-{num_clients}-{name}", name)
            k_avgs[newKey].append(val['metrics']['roc_auc_score'])

        results = collections.defaultdict(dict)
        for (json_id, dataset), scores in k_avgs.items():
            results[dataset][json_id] = {
                'mean': np.mean(scores),
                'std': np.std(scores)
            }

        return results

    def add_to_kfold_table(self, algorithm_name: str, num_clients: str, split_method: str, val_name: str, y_true, y_pred):
        """Append one row (metadata columns followed by all metrics) to `self.kfold_table`.

        The table is created on the first call.

        Raises:
            AssertionError: if `val_name` is one of the names 'internal_validation'
                / 'external_validation', which this method rejects.
        """
        # the check is about the validation name (as its message says), not the split method
        assert val_name not in ('internal_validation', 'external_validation'), f'incorrect val name, received {val_name}'

        # record validation only once
        # NOTE(review): `external_val_recorded` is never set to True inside this class, so
        # this guard only triggers if a caller flips the flag - confirm the intent.
        if val_name == 'external test' and self.external_val_recorded:
            return

        data = self.compute_metrics(y_true, y_pred)
        row_data = [algorithm_name, num_clients, split_method, val_name]
        row_data.extend(data.values())

        if not hasattr(self, 'kfold_table'):
            all_cols = self.metadata_column_names.copy() + list(data.keys())
            self.kfold_table = pd.DataFrame([row_data], columns=all_cols)
        else:
            self.kfold_table.loc[len(self.kfold_table.index)] = row_data

    def write_results(self, path: str):
        """Write `self.kfold_table` to `<path>/<experiment_name>.csv` and return that path."""
        os.makedirs(path, exist_ok=True)
        write_path = os.path.join(path, f"{self.experiment_name}.csv")
        self.kfold_table.to_csv(write_path, index=False)
        return write_path




## Define experiment folder in Colab

In [None]:
# Mount Google Drive at /content/drive; the experiment and data paths used below
# live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder on the mounted Drive; the experiment outputs and the input data both
# live underneath it, so the path is spelled out only once.
DRIVE_ROOT = '/content/drive/MyDrive/collab_nih_fl'
DATA_FOLDER = os.path.join(DRIVE_ROOT, 'data')
# seed handed to the experiment configuration
EXPERIMENT_SEED = 0

# Create the experiment (this also creates its output folders under DRIVE_ROOT).
current_experiment = GlobalExperimentsConfiguration(
    base_path=DRIVE_ROOT,
    experiment_name='federated_MLP_regression_tf',
    random_seed=EXPERIMENT_SEED,
)

# Load the multi-modality datasets and build the k folds.
current_experiment.create_experiment(
    dataset_folder=DATA_FOLDER,
    dataset=GlobalExperimentsConfiguration.MULTIMODALITY,
    split_method=GlobalExperimentsConfiguration.SKLEARN,
)

## Load and store the experiment datasets

## Strategy to make dataset split configs (must have parity with the local version of the experiments)

In [None]:
import tensorflow_datasets as tfds
from tensorflow_federated.python.simulation.datasets import from_tensor_slices_client_data
from tensorflow_federated.python.learning.models import variable
import pickle
import functools

In [None]:
# # tf training data
# NUM_EPOCHS = 10
# BATCH_SIZE = 60
# SHUFFLE_BUFFER = 100
# PREFETCH_BUFFER = 10

# # FL communication rounds:
# NUM_ROUNDS = 10

# # normalize feature ranges since this is will affect neural networks
# def min_max_normalized(data):
#     return data
#     # normalized already
#     # col_max = np.max(data, axis=0)
#     # col_min = np.min(data, axis=0)
#     # return np.divide(data - col_min, col_max - col_min)

# for fold_idx in range(current_experiment.K):
#     current_experiment.set_train_dataset(fold_idx=fold_idx)
#     current_experiment.set_combined_validation_dataset(fold_idx=fold_idx)

#     current_experiment.training_dataset = min_max_normalized(current_experiment.training_dataset)
#     current_experiment.combined_validation_dataset = [(n, min_max_normalized(d)) for n, d in current_experiment.combined_validation_dataset]

#     # generate data for several site configurations
#     # each configuration is a json, so there is no duplication of underlying data
#     site_configs = [1, 2, 3, 4, 5, 6, 7, 8]
#     site_prefixes = ["site-"] * len(site_configs)
#     split_methods = ["uniform"] * len(site_configs)

#     experiment_results = []

#     for i in range(len(site_configs)):
#         num_clients, site_prefix, split_method = site_configs[i], site_prefixes[i], split_methods[i]

#         # training splits
#         client_splits = current_experiment.get_client_splits(
#             dataset=current_experiment.training_dataset,
#             num_clients=num_clients,
#             method=split_method,
#             as_intervals=False
#         )

#         def site_naming_fn(site_index):
#             """Used for naming files in the client data split json"""
#             return f"{site_prefix}{site_index + 1}"

#         # columns per sample includes label
#         samples_per_client, columns_per_sample = client_splits[0].shape
#         client_dict = {}
#         for site_idx, client_split in enumerate(client_splits):

#           # take the PHENO column and make it the last column in
#           # preparation for converting this to a python list
#           features = client_split.drop(columns=['PHENO']).to_numpy()
#           labels = pd.DataFrame(client_split['PHENO']).to_numpy()

#           client_dict[site_naming_fn(site_idx)] = collections.OrderedDict(
#               x = features.tolist(),
#               y = labels.tolist(),
#           )

#         client_splits = from_tensor_slices_client_data.TestClientData(client_dict)

#         def preprocess(dataset):

#           def batch_format_fn(element):
#             """Flatten a batch `pixels` and return the features as an `OrderedDict`."""
#             new = collections.OrderedDict(
#                 x=tf.cast(tf.reshape(element['x'], [-1, columns_per_sample - 1]), tf.float32),
#                 y=tf.cast(tf.reshape(element['y'], [-1, 1]), tf.float32))

#             return new

#           return dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER, seed=current_experiment.RANDOM_SEED).batch(
#               BATCH_SIZE).map(batch_format_fn).prefetch(PREFETCH_BUFFER)

#         # used to compute elment structure after processing
#         # Used to compute the element spec in the TestClientData class, just replicated
#         # for batch_format_fn data
#         example_dataset = client_splits.create_tf_dataset_for_client(client_splits.client_ids[0])
#         preprocessed_example_dataset = preprocess(example_dataset)

#         def make_federated_data(client_data, client_ids):
#           return [
#               preprocess(client_data.create_tf_dataset_for_client(x))
#               for x in client_ids
#           ]

#         # sample_clients = client_splits.client_ids
#         federated_train_data = make_federated_data(client_splits, client_splits.client_ids)

#         # multi-layer perceptron regression
#         def create_keras_model():
#           return tf.keras.models.Sequential([
#               tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(columns_per_sample - 1,)),
#               tf.keras.layers.Dense(100, activation='relu', input_dim=columns_per_sample - 1,),
#               tf.keras.layers.Dense(1, activation='sigmoid')
#           ])

#         loss_fn = lambda: tf.keras.losses.BinaryCrossentropy()

#         metrics_list = lambda: [tf.keras.metrics.BinaryCrossentropy(), tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()]

#         def model_fn(): #-> variable.VariableModel:
#           # We _must_ create a new model here, and _not_ capture it from an external
#           # scope. TFF will call this within different graph contexts.
#           keras_model = create_keras_model()

#           return tff.learning.models.from_keras_model(
#               keras_model,
#               input_spec=preprocessed_example_dataset.element_spec,
#               loss=loss_fn(),
#               metrics=metrics_list()
#             )

#         training_process = tff.learning.algorithms.build_weighted_fed_avg(
#           model_fn,
#           client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02), # 0.02
#           server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0))

#         train_state = training_process.initialize()
#         best_model_weights = None
#         best_loss = float('inf')
#         for round_num in range(0, NUM_ROUNDS):
#           result = training_process.next(train_state, federated_train_data)
#           train_state = result.state
#           train_metrics = result.metrics
#           print('round {:2d}, metrics={}'.format(round_num, train_metrics))

#           # get test data
#           test_dataset = current_experiment.test_dataset
#           X, y = current_experiment.as_features_labels(test_dataset, current_experiment.LABEL_COL)

#           # copy over model weights from trainer
#           model = create_keras_model()
#           model.set_weights(training_process.get_model_weights(train_state)[0])

#           y_pred = model.predict(X)
#           # y_pred = y_pred.astype(int)

#           print(y.dtype)
#           print(y_pred.dtype)

#           bce = loss_fn()
#           loss = bce(y, y_pred)
#           auc =  metrics.roc_auc_score(y, y_pred)
#           print('test round {:2d}, loss={:.3f}, auc={:.3f}'.format(round_num, loss, auc))

#           if loss < best_loss:
#             best_loss = loss
#             best_model_weights = training_process.get_model_weights(train_state)
#             pickle.dump(best_model_weights, open("current_best_model_weights.pkl", "wb"))
#             pickle.dump(current_experiment, open("current_experiment.pkl", "wb"))

#             checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#               filepath=checkpoint_prefix,
#               save_weights_only=True)


#         # evaluation_process = tff.learning.algorithms.build_fed_eval(model_fn)
#         # evaluation_state = evaluation_process.initialize()

#         # take trained weights & use them for evaluation
#         model_weights = best_model_weights
#         # Save the entire model, including architecture and weights
#         model.save("model.h5")

#         # Load the model with its architecture and weights
#         model = tf.keras.models.load_model("model.h5")

#         # get the evaluation datasets
#         validation_results = []
#         val_dataset = current_experiment.combined_validation_dataset
#         val_dataset.append(('test', current_experiment.test_dataset))
#         for data in val_dataset:
#             name, dataset = data

#             X, y = current_experiment.as_features_labels(dataset, current_experiment.LABEL_COL)

#             y_pred = model.predict(X)
#             y_pred = y_pred.reshape(-1)
#             y_pred = (y_pred > 0.5).astype("int32")

#             # client_splits = current_experiment.get_client_splits(
#             #     dataset=dataset,
#             #     num_clients=1,
#             #     method=split_method,
#             #     as_intervals=False
#             # )

#             # client_dict = {}
#             # for site_idx, client_split in enumerate(client_splits):

#             #   # take the PHENO column and make it the last column in
#             #   # preparation for converting this to a python list
#             #   features = client_split.drop(columns=['PHENO']).to_numpy()
#             #   labels = pd.DataFrame(client_split['PHENO']).to_numpy()

#             #   client_dict[site_naming_fn(site_idx)] = collections.OrderedDict(
#             #       x = features.tolist(),
#             #       y = labels.tolist(),
#             #   )

#             # client_splits = from_tensor_slices_client_data.TestClientData(client_dict)

#             # federated_val_data = make_federated_data(client_splits, client_splits.client_ids)
#             # # sample_clients = client_splits.client_ids

#             # state, metrics = evaluation_process.next(evaluation_state, federated_val_data)

#             # current_experiment.add_val_result(
#             #     fold_idx=fold_idx,
#             #     num_clients=num_clients,
#             #     split_method=split_method,
#             #     name=name,
#             #     auc=float(metrics['client_work']['eval']['total_rounds_metrics']['auc']),
#             #     size=float(metrics['client_work']['eval']['total_rounds_metrics']['num_examples'])
#             # )
#             current_experiment.add_to_kfold_table(
#                 algorithm_name='Federated MLP',
#                 num_clients=num_clients,
#                 split_method=split_method,
#                 val_name=name,
#                 y_true=y,
#                 y_pred=y_pred
#             )
#     break
# print(current_experiment.experiment_results)


In [None]:
# tf training data
# Hyper-parameters of the tf.data input pipeline; presumably consumed by the
# `preprocess` helper defined further down in this cell - TODO confirm.
NUM_EPOCHS = 10
BATCH_SIZE = 500
SHUFFLE_BUFFER = 100
PREFETCH_BUFFER = 10

# FL communication rounds:
# taken from the experiment configuration rather than hard-coded in this cell
NUM_ROUNDS = current_experiment.num_rounds

# normalize feature ranges since this will affect neural networks
def min_max_normalized(data):
    return data
    # normalized already
    # col_max = np.max(data, axis=0)
    # col_min = np.min(data, axis=0)
    # return np.divide(data - col_min, col_max - col_min)

# pickle is used for checkpointing inside the round loop, so it has to be in
# scope before the loop runs (it was previously imported only after the loop).
import pickle

# For every fold and every client-count configuration: split the training data
# across simulated clients, train with weighted FedAvg for NUM_ROUNDS rounds,
# keep the weights with the lowest validation loss, and record predictions on
# each combined test dataset in the experiment's k-fold table.
for fold_idx in range(current_experiment.K):
    current_experiment.set_fold(fold_idx=fold_idx)
    current_experiment.set_validation_dataset()

    # client-count configurations evaluated for this fold; the site prefix and
    # split method are the same for every configuration
    site_configs = [1, 2, 3, 4, 5, 6, 7, 8]
    site_prefixes = ["site-"] * len(site_configs)
    split_methods = ["uniform"] * len(site_configs)

    experiment_results = []

    for num_clients, site_prefix, split_method in zip(site_configs, site_prefixes, split_methods):

        # training splits
        client_folds = current_experiment.get_stratified_client_subsets(
            current_experiment.training_dataset,
            num_clients=num_clients,
            method=split_method
        )

        def site_naming_fn(site_index):
            """Return the 1-based client id for a site, e.g. index 0 -> "site-1"."""
            return f"{site_prefix}{site_index + 1}"

        # columns per sample includes label
        _, columns_per_sample = client_folds[0].shape
        client_dict = {}
        for site_idx, fold in enumerate(client_folds):

            # separate the PHENO label column from the feature columns and
            # convert both to plain python lists for TestClientData
            features = fold.drop(columns=['PHENO']).to_numpy()
            labels = pd.DataFrame(fold['PHENO']).to_numpy()

            client_dict[site_naming_fn(site_idx)] = collections.OrderedDict(
                x=features.tolist(),
                y=labels.tolist(),
            )

        client_splits = from_tensor_slices_client_data.TestClientData(client_dict)

        def preprocess(dataset):
            """Repeat, shuffle, batch, reshape/cast and prefetch one client's dataset."""

            def batch_format_fn(element):
                """Reshape a batch into float32 `x` ([-1, n_features]) and `y` ([-1, 1])."""
                new = collections.OrderedDict(
                    x=tf.cast(tf.reshape(element['x'], [-1, columns_per_sample - 1]), tf.float32),
                    y=tf.cast(tf.reshape(element['y'], [-1, 1]), tf.float32))

                return new

            return dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER, seed=current_experiment.RANDOM_SEED).batch(
                BATCH_SIZE).map(batch_format_fn).prefetch(PREFETCH_BUFFER)

        # A preprocessed example dataset is built only so that its element_spec
        # can be handed to TFF as the model's input_spec.
        example_dataset = client_splits.create_tf_dataset_for_client(client_splits.client_ids[0])
        preprocessed_example_dataset = preprocess(example_dataset)

        def make_federated_data(client_data, client_ids):
            """Return one preprocessed tf.data.Dataset per client id."""
            return [
                preprocess(client_data.create_tf_dataset_for_client(x))
                for x in client_ids
            ]

        # sample_clients = client_splits.client_ids
        federated_train_data = make_federated_data(client_splits, client_splits.client_ids)

        # MLP regression
        # NOTE(review): the first layer is a single sigmoid unit, so the 100-unit
        # hidden layer only ever sees one input value, and `input_dim` on the
        # second layer has no effect on its actual input size. Confirm this
        # architecture is intended; it is left unchanged here.
        def create_keras_model():
            """Build a fresh, uncompiled Keras model."""
            return tf.keras.models.Sequential([
                tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(columns_per_sample - 1,)),
                tf.keras.layers.Dense(100, activation='relu', input_dim=columns_per_sample - 1,),
                tf.keras.layers.Dense(1, activation='sigmoid')
            ])

        loss_fn = lambda: tf.keras.losses.BinaryCrossentropy()

        metrics_list = lambda: [tf.keras.metrics.BinaryCrossentropy(), tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()]

        def model_fn(): #-> variable.VariableModel:
            """Wrap a newly created Keras model for TFF."""
            # We _must_ create a new model here, and _not_ capture it from an external
            # scope. TFF will call this within different graph contexts.
            keras_model = create_keras_model()

            return tff.learning.models.from_keras_model(
                keras_model,
                input_spec=preprocessed_example_dataset.element_spec,
                loss=loss_fn(),
                metrics=metrics_list()
            )

        training_process = tff.learning.algorithms.build_weighted_fed_avg(
            model_fn,
            client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02), # 0.02
            server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0))

        # The validation data and the evaluation model do not change between
        # rounds, so they are prepared once; only the weights are refreshed.
        validation_dataset = current_experiment.validation_dataset
        X, y = current_experiment.as_features_labels(validation_dataset, current_experiment.LABEL_COL)
        model = create_keras_model()
        bce = loss_fn()

        train_state = training_process.initialize()
        best_model_weights = None
        best_loss = float('inf')
        for round_num in range(0, NUM_ROUNDS):
            result = training_process.next(train_state, federated_train_data)
            train_state = result.state
            train_metrics = result.metrics
            print('round {:2d}, metrics={}'.format(round_num, train_metrics))

            # copy over model weights from trainer
            round_weights = training_process.get_model_weights(train_state)
            model.set_weights(round_weights[0])

            y_pred = model.predict(X)
            # y_pred = y_pred.astype(int)

            print(y.dtype)
            print(y_pred.dtype)

            loss = bce(y, y_pred)
            auc = metrics.roc_auc_score(y, y_pred)
            print('test round {:2d}, loss={:.3f}, auc={:.3f}'.format(round_num, loss, auc))

            # checkpoint whenever the validation loss improves
            if loss < best_loss:
                best_loss = loss
                best_model_weights = round_weights
                with open("current_best_model_weights.pkl", "wb") as weights_file:
                    pickle.dump(best_model_weights, weights_file)
                with open("current_experiment.pkl", "wb") as experiment_file:
                    pickle.dump(current_experiment, experiment_file)

        # evaluation_process = tff.learning.algorithms.build_fed_eval(model_fn)
        # evaluation_state = evaluation_process.initialize()

        # No round improved on the initial best_loss (e.g. NUM_ROUNDS == 0 or the
        # loss was NaN every round): fail with a clear message instead of
        # pickling None and crashing on len(None).
        if best_model_weights is None:
            raise RuntimeError(
                "No training round produced a validation loss below the initial "
                f"best_loss (fold_idx={fold_idx}, num_clients={num_clients})."
            )

        # take trained weights & use them for evaluation
        model_weights = best_model_weights
        with open("model_weights.pkl", "wb") as weights_file:
            pickle.dump(model_weights, weights_file)
        print(len(model_weights))
        # evaluation_state = evaluation_process.set_model_weights(evaluation_state, model_weights)
        model = create_keras_model()
        model.set_weights(model_weights[0])

        # get the evaluation datasets
        validation_results = []
        for name, dataset in current_experiment.get_combined_test_dataset():

            X, y = current_experiment.as_features_labels(dataset, current_experiment.LABEL_COL)

            # threshold the sigmoid outputs at 0.5 to obtain hard class labels
            y_pred = model.predict(X)
            y_pred = y_pred.reshape(-1)
            y_pred = (y_pred > 0.5).astype("int32")

            current_experiment.add_to_kfold_table(
                algorithm_name='Federated MLP Regression',
                num_clients=num_clients,
                split_method=split_method,
                val_name=name,
                y_true=y,
                y_pred=y_pred
            )

print(current_experiment.experiment_results)


In [None]:
def _grouped_by_val_name(val_name):
    """Select the kfold_table rows for one `val_name`, grouped by the metadata columns."""
    return current_experiment.kfold_table[
        current_experiment.kfold_table['val_name'] == val_name
    ].groupby(current_experiment.metadata_column_names)


# Show the per-group mean for each validation set in turn.
internal_only = _grouped_by_val_name('internal test')
display(internal_only.mean())

exteral_only = _grouped_by_val_name('external test')
display(exteral_only.mean())

# the same variable is reused for the 'test' subset
exteral_only = _grouped_by_val_name('test')
display(exteral_only.mean())

# NOTE(review): '/' is passed as the output location; confirm this is the intended path.
current_experiment.write_results('/')

In [None]:
# tf training data
# These override the values set in the k-fold training cell above (smaller
# batch size, fixed number of rounds) for a single-fold, single-configuration run.
NUM_EPOCHS = 10
BATCH_SIZE = 60
SHUFFLE_BUFFER = 100
PREFETCH_BUFFER = 10

NUM_ROUNDS = 150


# Select fold 1 for both the training and the combined validation data.
# NOTE(review): the k-fold cell above uses set_fold / set_validation_dataset
# instead; confirm these setters still exist on current_experiment.
current_experiment.set_train_dataset(fold_idx=1)
current_experiment.set_combined_validation_dataset(fold_idx=1)

# NOTE(review): this self-assignment looks like a no-op (normalization removed);
# confirm training_dataset is a plain attribute.
current_experiment.training_dataset = current_experiment.training_dataset
# Materialize the (name, dataset) pairs into a list so they can be indexed later.
current_experiment.combined_validation_dataset = [(n, d) for n, d in current_experiment.combined_validation_dataset]

# Single configuration: 4 clients, ids prefixed with "site-", uniform split.
num_clients, site_prefix, split_method = 4, "site-", "uniform"

In [None]:
# tf training data
NUM_EPOCHS = 10
BATCH_SIZE = 60
SHUFFLE_BUFFER = 100
PREFETCH_BUFFER = 10

NUM_ROUNDS = 150


# NOTE(review): `fold_idx` is not assigned in this cell; it relies on the value
# left behind by the k-fold loop in an earlier cell (hidden state).
current_experiment.set_train_dataset(fold_idx=fold_idx)
current_experiment.set_combined_validation_dataset(fold_idx=fold_idx)

# `min_max_normalized` (defined in an earlier cell) currently returns its input
# unchanged, so these two lines only materialize the validation pairs as a list.
current_experiment.training_dataset = min_max_normalized(current_experiment.training_dataset)
current_experiment.combined_validation_dataset = [(n, min_max_normalized(d)) for n, d in current_experiment.combined_validation_dataset]

# generate data for several site configurations
# each configuration is a json, so there is no duplication of underlying data
site_configs = [1, 2, 3, 4, 5, 6, 7, 8]
site_prefixes = ["site-"] * len(site_configs)
split_methods = ["uniform"] * len(site_configs)

experiment_results = []

# Pick the second configuration (index 1 -> 2 clients).
i = 1
num_clients, site_prefix, split_method = site_configs[i], site_prefixes[i], split_methods[i]

# training splits
# NOTE(review): the k-fold cell uses get_stratified_client_subsets instead;
# confirm get_client_splits is still the intended API.
client_splits = current_experiment.get_client_splits(
    dataset=current_experiment.training_dataset,
    num_clients=num_clients,
    method=split_method,
    as_intervals=False
)

def site_naming_fn(site_index):
    """Build a client id from the global `site_prefix` and the 1-based site number."""
    site_number = site_index + 1
    return "{}{}".format(site_prefix, site_number)

# The first split's shape gives (rows, columns); the column count still
# includes the PHENO label column.
samples_per_client, columns_per_sample = client_splits[0].shape

# Map each client id to an OrderedDict of python lists: 'x' holds the feature
# rows, 'y' holds the single-column PHENO labels.
client_dict = dict()
for site_idx, client_split in enumerate(client_splits):
    features = client_split.drop(columns=['PHENO']).to_numpy()
    labels = pd.DataFrame(client_split['PHENO']).to_numpy()

    per_client_data = collections.OrderedDict([
        ('x', features.tolist()),
        ('y', labels.tolist()),
    ])
    client_dict[site_naming_fn(site_idx)] = per_client_data

# Replace the list of per-client frames with a TFF client-data object.
client_splits = from_tensor_slices_client_data.TestClientData(client_dict)

def preprocess(dataset):
    """Repeat, shuffle, batch, reshape/cast and prefetch one client's dataset.

    Uses the global NUM_EPOCHS, SHUFFLE_BUFFER, BATCH_SIZE and PREFETCH_BUFFER
    settings and seeds the shuffle with `current_experiment.RANDOM_SEED`.
    """

    def batch_format_fn(element):
        """Reshape a batch into float32 `x` ([-1, n_features]) and `y` ([-1, 1])."""
        n_features = columns_per_sample - 1  # the label column is not a feature
        formatted = collections.OrderedDict()
        formatted['x'] = tf.cast(tf.reshape(element['x'], [-1, n_features]), tf.float32)
        formatted['y'] = tf.cast(tf.reshape(element['y'], [-1, 1]), tf.float32)
        return formatted

    repeated = dataset.repeat(NUM_EPOCHS)
    shuffled = repeated.shuffle(SHUFFLE_BUFFER, seed=current_experiment.RANDOM_SEED)
    batched = shuffled.batch(BATCH_SIZE)
    return batched.map(batch_format_fn).prefetch(PREFETCH_BUFFER)

# Build one preprocessed example dataset (from the first client) so that its
# element_spec, i.e. the element structure after batch_format_fn has been
# applied, can be passed to TFF as the model's input_spec in model_fn.
example_dataset = client_splits.create_tf_dataset_for_client(client_splits.client_ids[0])
preprocessed_example_dataset = preprocess(example_dataset)

def make_federated_data(client_data, client_ids):
    """Return a list with one preprocessed dataset per id in `client_ids`."""
    federated = []
    for client_id in client_ids:
        raw_dataset = client_data.create_tf_dataset_for_client(client_id)
        federated.append(preprocess(raw_dataset))
    return federated

# sample_clients = client_splits.client_ids
# every client takes part in training
federated_train_data = make_federated_data(client_splits, client_splits.client_ids)

# multi-layer perceptron regression
# NOTE(review): the first layer is a single sigmoid unit, so the 100-unit hidden
# layer only ever sees one input value; confirm this architecture is intended.
def create_keras_model():
    """Build a fresh, uncompiled three-layer Keras Sequential model."""
    n_features = columns_per_sample - 1  # the label column is not a feature
    layer_stack = [
        tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(n_features,)),
        tf.keras.layers.Dense(100, activation='relu', input_dim=n_features),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    return tf.keras.models.Sequential(layer_stack)

def loss_fn():
    """Return a new binary cross-entropy loss object."""
    return tf.keras.losses.BinaryCrossentropy()


def metrics_list():
    """Return new metric objects: binary cross-entropy, binary accuracy and AUC."""
    return [
        tf.keras.metrics.BinaryCrossentropy(),
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.AUC(),
    ]

def model_fn(): #-> variable.VariableModel:
    """Wrap a newly created Keras model, with fresh loss and metrics, for TFF."""
    # The Keras model has to be constructed inside this function rather than
    # captured from an outer scope: TFF calls model_fn in different graph contexts.
    fresh_model = create_keras_model()
    element_spec = preprocessed_example_dataset.element_spec

    return tff.learning.models.from_keras_model(
        fresh_model,
        input_spec=element_spec,
        loss=loss_fn(),
        metrics=metrics_list(),
    )

# Build the weighted federated-averaging process: clients use SGD with a 0.02
# learning rate, the server uses SGD with a 1.0 learning rate.
training_process = tff.learning.algorithms.build_weighted_fed_avg(
  model_fn,
  client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02), # 0.02
  server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0))

# Initial server state plus best-so-far trackers. No training round is run in
# this cell, so train_state holds the initial state until a later cell calls
# training_process.next(...).
train_state = training_process.initialize()
best_model_weights = None
best_loss = float('inf')

In [None]:
# Debug inspection: display the attributes stored on the training process object.
training_process.__dict__

In [None]:
# Debug inspection: display the result of the training process's get_hparams().
training_process.get_hparams()

In [None]:
# model_weights = best_model_weights
# print(len(model_weights))
# # evaluation_state = evaluation_process.set_model_weights(evaluation_state, model_weights)
# model = create_keras_model()
# model.set_weights(model_weights.trainable)

In [None]:

# evaluation_process = tff.learning.algorithms.build_fed_eval(model_fn)
# evaluation_state = evaluation_process.initialize()

# take trained weights & use them for evaluation
import pickle

# NOTE: pickle.load can execute arbitrary code; only load a file that was
# written by this notebook. The context manager closes the file handle.
with open("model_weights.pkl", "rb") as weights_file:
    model_weights = pickle.load(weights_file)
print(len(model_weights))
# evaluation_state = evaluation_process.set_model_weights(evaluation_state, model_weights)
model = create_keras_model()
# model.set_weights(model_weights[0])
# NOTE(review): the weights applied here come from the in-memory train_state,
# not from the pickle loaded above; confirm which source is intended.
model.set_weights(training_process.get_model_weights(train_state)[0])


# get the evaluation datasets
validation_results = []
# evaluate on the second (name, dataset) pair of the combined validation data
name, dataset = current_experiment.combined_validation_dataset[1]

X, y = current_experiment.as_features_labels(dataset, current_experiment.LABEL_COL)

y_pred = model.predict(X)
y_pred = y_pred.reshape(-1)
# Print label / raw score pairs. Pairing by position with zip avoids
# label-based `y[i]` lookups, which fail if `y` carries a non-default index.
for true_label, predicted_score in zip(y, y_pred):
  print(true_label, predicted_score)
# threshold the scores at 0.5 and print label / hard prediction pairs
y_pred = (y_pred > 0.5).astype("int32")
for true_label, predicted_label in zip(y, y_pred):
  print(true_label, predicted_label)
    # client_splits = current_experiment.get_client_splits(
    #     dataset=dataset,
    #     num_clients=1,
    #     method=split_method,
    #     as_intervals=False
    # )

    # client_dict = {}
    # for site_idx, client_split in enumerate(client_splits):

    #   # take the PHENO column and make it the last column in
    #   # preparation for converting this to a python list
    #   features = client_split.drop(columns=['PHENO']).to_numpy()
    #   labels = pd.DataFrame(client_split['PHENO']).to_numpy()

    #   client_dict[site_naming_fn(site_idx)] = collections.OrderedDict(
    #       x = features.tolist(),
    #       y = labels.tolist(),
    #   )

    # client_splits = from_tensor_slices_client_data.TestClientData(client_dict)

    # federated_val_data = make_federated_data(client_splits, client_splits.client_ids)
    # # sample_clients = client_splits.client_ids

    # state, metrics = evaluation_process.next(evaluation_state, federated_val_data)

    # current_experiment.add_val_result(
    #     fold_idx=fold_idx,
    #     num_clients=num_clients,
    #     split_method=split_method,
    #     name=name,
    #     auc=float(metrics['client_work']['eval']['total_rounds_metrics']['auc']),
    #     size=float(metrics['client_work']['eval']['total_rounds_metrics']['num_examples'])
    # )
    # current_experiment.add_to_kfold_table(
    #     algorithm_name='Federated Logistic Regression',
    #     num_clients=num_clients,
    #     split_method=split_method,
    #     val_name=name,
    #     y_true=y,
    #     y_pred=y_pred
    # )

In [None]:
# current_experiment.kfold_table