# Vertical Federated Learning with the Adult-income dataset

In this notebook we show how to use FLEXible to simulate a Vertical Federated Learning (VFL) scenario with a neural network using PyTorch. We implement the VFL process described in the paper [Vertical Federated Learning: Challenges, Methodologies and Experiments](https://arxiv.org/abs/2202.04309).

First, we download the raw dataset using the `ucimlrepo` package.

In [None]:
from flex.model import FlexModel
from ucimlrepo import fetch_ucirepo

# Fetch the Adult-income dataset from the UCI repository (dataset id 2) via `ucimlrepo`
adult = fetch_ucirepo(id=2)

We preprocess it, making the targets integers and removing rows with NaN

In [None]:
import pandas as pd

# Feature and target tables of the fetched dataset (pandas dataframes)
features_df = adult.data.features
targets_df = adult.data.targets
x_cols = list(features_df.columns)
y_cols = list(targets_df.columns)

# Put features and targets side by side, then discard every row holding a NaN
adult = pd.concat([features_df, targets_df], axis=1).dropna()
x_data = adult[x_cols]

# Turn the string labels ">50K" / "<=50K" into the integer labels 1 / 0:
# a label belongs to the positive class exactly when it contains ">"
y_data = adult[y_cols]["income"].map(lambda label: ">" in label).astype(int)

We create a `Dataset` object using the preprocessed dataset and a `FedDatasetConfig` object showing how we want to simulate the vertical split: 

There are three nodes, one `host` and two `guest`s; only the host keeps the labels. The `host` and `guest_1` hold 4 features each, while `guest_2` holds the remaining 6. As we are performing a vertical split, we need to provide weights equal to one, `replacement=True` and `shuffle=False`, to ensure that each node gets the same data points but different features.

In [None]:
from flex.data import Dataset, FedDatasetConfig, FedDataDistribution

# Wrap the preprocessed features and integer labels in a FLEXible Dataset
dataset = Dataset.from_numpy(x_data.to_numpy(), y_data.to_numpy())

node_ids = ["host", "guest_1", "guest_2"]

# Column indexes handed to each node, in the same order as `node_ids`
features_per_node = [
    # "host": ['age', 'workclass', 'fnlwgt', 'education']
    list(range(0, 4)),
    # "guest_1": ['education-num', 'marital-status', 'occupation', 'relationship']
    list(range(4, 8)),
    # "guest_2": ['race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
    list(range(8, 14)),
]

config = FedDatasetConfig(
    seed=0,
    n_nodes=len(node_ids),
    node_ids=node_ids,
    shuffle=False,
    # NOTE(review): presumably required so that all nodes can hold the same
    # rows -- confirm against the FedDatasetConfig documentation
    replacement=True,
    # Only the host keeps the labels
    keep_labels=[True, False, False],
    # A weight of 1.0 per node: each node gets the entire dataset, not a subset
    weights=[1.0] * len(node_ids),
    features_per_node=features_per_node,
)

fed_dataset = FedDataDistribution.from_config(dataset, config)

Once we have split the data among nodes, we one hot encode each local dataset. Beware, only the "host" node has labels.

In [None]:
from sklearn import preprocessing
import numpy as np


def preprocess_x_data(local_dataset: Dataset):
    """One-hot encode the features of a single node's local dataset.

    A new ``OneHotEncoder`` is fitted on the features of ``local_dataset``
    only, and every feature column is passed through it. The labels are
    forwarded untouched.

    Args:
        local_dataset: The node's dataset; its ``X_data`` is encoded and its
            ``y_data`` is reused as-is.

    Returns:
        A new ``Dataset`` built from the dense one-hot encoded features and
        the original labels.
    """
    encoder = preprocessing.OneHotEncoder(
        handle_unknown="ignore", sparse_output=False
    )
    raw_features = local_dataset.X_data.to_numpy()
    labels = local_dataset.y_data
    return Dataset.from_numpy(encoder.fit_transform(raw_features), labels)


# One-hot encode each node's dataset individually; the result of `apply`
# replaces the previous `fed_dataset`
fed_dataset = fed_dataset.apply(preprocess_x_data)

Once data is federated, we assign a `FlexRole` to each `node_id`, to create a `FlexPool` which simulates the Vertical Federated Learning flow.

In [None]:
from flex.actors import FlexRole, FlexActors

# Map every node id to the role it plays in the simulation
actors = FlexActors()
# The host gets the combined server/aggregator/client role
actors["host"] = FlexRole.server_aggregator_client
# Both guests are plain clients
actors["guest_1"] = FlexRole.client
actors["guest_2"] = FlexRole.client

from flex.pool import FlexPool

# Build the pool from the federated dataset and the role assignment above
pool = FlexPool(fed_dataset, actors)

The VFL model in this simulation is a simple fully-connected network split into bottom and top parts, with layer widths of 48, 96 and 196. Each bottom model has 48 hidden units and an output size of 32. The top model takes the concatenated outputs of the three bottom models as input (32*3=96), has 196 hidden units and outputs the final predictions of the model.

In [None]:
import torch


class MLP(torch.nn.Module):
    """Two stacked linear layers: ``input_dim -> hidden_units -> out_dim``.

    NOTE: no activation is applied between (or after) the two layers, so the
    forward pass is the plain composition of both linear maps.

    Args:
        input_dim: Number of input features.
        hidden_units: Width of the intermediate layer.
        out_dim: Number of output features.
    """

    def __init__(self, input_dim, hidden_units, out_dim):
        super().__init__()
        # Creation order matters for reproducible random initialisation
        self.input_layer = torch.nn.Linear(
            in_features=input_dim, out_features=hidden_units
        )
        self.output_layer = torch.nn.Linear(
            in_features=hidden_units, out_features=out_dim
        )

    def forward(self, x):
        """Return ``output_layer(input_layer(x))``."""
        return self.output_layer(self.input_layer(x))

Each node owns a bottom model, which has 48 hidden units and an output dimension of 32.

In [None]:
def initialize_bottom_model(local_model: FlexModel, local_dataset: Dataset):
    """Create the bottom model of a node together with its optimizer.

    The input width is taken from the last dimension of the node's features;
    the hidden width (48) and output width (32) are fixed.

    Args:
        local_model: Per-node storage; receives the ``"bottom_model"`` and
            ``"bottom_optimizer"`` entries.
        local_dataset: The node's dataset, used only to read the number of
            feature columns.
    """
    n_features = local_dataset.X_data.to_numpy().shape[-1]
    local_model["bottom_model"] = MLP(
        input_dim=n_features, hidden_units=48, out_dim=32
    )
    # The optimizer tracks the parameters of the model stored just above
    bottom_parameters = local_model["bottom_model"].parameters()
    local_model["bottom_optimizer"] = torch.optim.Adam(
        bottom_parameters, lr=0.0002, weight_decay=0.01
    )


# Run the bottom-model initializer over the whole pool (host and guests)
pool.map(initialize_bottom_model)

The host node also has the top part of the model, which receives as input the concatenated outputs of the three bottom models; thus its input dimension is 32*3=96.

In [None]:
def initialize_top_model(local_model: FlexModel, local_dataset: Dataset):
    """Create the top model, its loss and its optimizer.

    The top model consumes three bottom outputs of width 32 each (96 inputs),
    has 196 hidden units and a single output.

    Args:
        local_model: Per-node storage; receives the ``"loss"``,
            ``"top_model"`` and ``"top_optimizer"`` entries.
        local_dataset: Unused; present to match the signature used with
            ``map``.
    """
    bottom_out_dim, n_bottom_models = 32, 3
    local_model["loss"] = torch.nn.BCELoss()
    local_model["top_model"] = MLP(
        input_dim=bottom_out_dim * n_bottom_models, hidden_units=196, out_dim=1
    )
    # The optimizer tracks the parameters of the model stored just above
    top_parameters = local_model["top_model"].parameters()
    local_model["top_optimizer"] = torch.optim.Adam(
        top_parameters, lr=0.0002, weight_decay=0.01
    )


# The host has both the bottom and the top model: select the sub-pool whose
# only member is the node with id "host" and initialize the top model there
host_pool = pool.select(lambda node_id, role: node_id == "host")
host_pool.map(initialize_top_model)

The following functions implement the logic of a FL round:

In [None]:
def setup_batch_sampler(local_model: FlexModel, local_dataset: Dataset, seed=0):
    """Split the node's data into train/test parts and store batch iterators.

    Every node uses the same ``test_size`` and ``random_state`` for the split
    and the same ``seed`` for the training sampler, which is how the nodes
    agree on a common iteration order over their data.

    Args:
        local_model: Per-node storage; receives the ``"train_batch_sampler"``
            and ``"test_batch_sampler"`` iterators.
        local_dataset: The node's dataset. Labels are only read on the node
            whose ``actor_id`` is ``"host"``.
        seed: Seed of the generator driving the random training sampler.
    """
    from sklearn.model_selection import train_test_split
    from torch.utils.data import DataLoader, RandomSampler

    # Split settings shared by all nodes
    split_options = {"test_size": 0.2, "random_state": 1001}
    batch_size = 500

    # Perform the train-test split here, to generate the train and test
    # partitions that the two batch iterators below walk over
    if local_model.actor_id == "host":
        # The host also splits the labels and keeps them next to the features
        x_train, x_test, y_train, y_test = train_test_split(
            local_dataset.X_data, local_dataset.y_data, **split_options
        )
        train_dataset = Dataset.from_list(x_train, y_train)
        test_dataset = Dataset.from_list(x_test, y_test)
    else:
        # Guests only have features to split
        train_dataset, test_dataset = train_test_split(
            local_dataset.X_data, **split_options
        )

    # Same seed for all RandomSamplers
    shared_generator = torch.Generator().manual_seed(seed)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=RandomSampler(
            train_dataset, replacement=False, generator=shared_generator
        ),
    )
    # The test partition is served as one single batch
    test_loader = DataLoader(test_dataset, batch_size=len(test_dataset))

    local_model["train_batch_sampler"] = iter(train_loader)
    local_model["test_batch_sampler"] = iter(test_loader)


def generate_bottom_outputs(local_model: FlexModel, local_dataset: Dataset):
    """Run the node's bottom model on its next training batch.

    On the node whose ``actor_id`` is ``"host"`` a batch is a
    ``(features, labels)`` pair and the labels are stored under ``"y_batch"``;
    on any other node the batch is the features alone. The bottom optimizer's
    gradients are reset before the forward pass and the result is stored
    under ``"bottom_output"``.

    Args:
        local_model: Per-node storage holding ``"bottom_model"``,
            ``"bottom_optimizer"`` and ``"train_batch_sampler"``.
        local_dataset: Unused; present to match the signature used with
            ``map``.

    Returns:
        ``True`` if a batch was consumed and ``"bottom_output"`` was
        refreshed, ``False`` if the training iterator is exhausted.
    """
    bottom_model = local_model["bottom_model"]

    # Only the batch fetch may signal "no more data". Keeping the rest out of
    # the try block prevents a StopIteration raised elsewhere (e.g. inside the
    # forward pass) from being mistaken for the end of the epoch.
    try:
        data = next(local_model["train_batch_sampler"])
    except StopIteration:
        return False

    if local_model.actor_id == "host":
        batch_data, local_model["y_batch"] = data
    else:
        batch_data = data

    local_model["bottom_optimizer"].zero_grad()
    local_model["bottom_output"] = bottom_model(batch_data.float())
    return True


def forward_bottom_outputs(host_model: FlexModel, models):
    """Feed the concatenated bottom outputs of all nodes to the top model.

    The ``"bottom_output"`` entries of ``models`` are concatenated along
    dimension 1 in the iteration order of ``models``. The top optimizer's
    gradients are reset, the top model is evaluated and a sigmoid is applied.

    Args:
        host_model: Host storage holding ``"top_model"`` and
            ``"top_optimizer"``.
        models: Mapping from node id to that node's model storage.

    Side effects:
        Stores the result in ``host_model["logits"]``. Despite the key name,
        the stored values are sigmoid outputs, with dimension 1 squeezed away.
    """
    joined_outputs = torch.cat(
        [models[node_id]["bottom_output"] for node_id in models], dim=1
    )
    host_model["top_optimizer"].zero_grad()
    probabilities = torch.sigmoid(host_model["top_model"](joined_outputs))
    host_model["logits"] = probabilities.squeeze(dim=1)


def backward_pass_top(local_model: FlexModel, local_dataset: Dataset):
    """Compute the loss on the host, backpropagate and update the top model.

    Args:
        local_model: Host storage holding ``"logits"`` (the stored top-model
            outputs), ``"y_batch"`` (labels of the current batch), ``"loss"``
            and ``"top_optimizer"``.
        local_dataset: Unused; present to match the signature used with
            ``map``.
    """
    predictions = local_model["logits"]
    criterion = local_model["loss"]
    # The loss expects floating point targets
    targets = local_model["y_batch"].float()
    batch_loss = criterion(predictions, targets)
    batch_loss.backward()
    local_model["top_optimizer"].step()


def backward_pass_bottom(local_model: FlexModel, local_dataset: Dataset):
    """Apply one optimizer step to the node's bottom model.

    Args:
        local_model: Per-node storage holding ``"bottom_optimizer"``.
        local_dataset: Unused; present to match the signature used with
            ``map``.
    """
    bottom_optimizer = local_model["bottom_optimizer"]
    bottom_optimizer.step()

With the above-defined functions an FL round is as follows:

- First, all nodes agree to iterate their datasets in such a way that their features align using `setup_batch_sampler`
- Then, while possible each node generates the bottom output of its data using `generate_bottom_outputs`
- The bottom outputs are gathered by the host and fed to the top model using `forward_bottom_outputs`
- Now, the backward pass begins, first the top model is updated using `backward_pass_top`, then the bottom models are updated using `backward_pass_bottom`.
- This process is repeated until the nodes have iterated their full dataset.

In [None]:
# Create the train/test batch iterators on every node (default seed)
pool.map(setup_batch_sampler)

# One pass over the training data: continue while every node produced a batch
while all(pool.map(generate_bottom_outputs)):
    # The host gathers the bottom outputs and runs the top model on them
    host_pool.map(forward_bottom_outputs, pool)
    # The host computes the loss, backpropagates and updates the top model
    host_pool.map(backward_pass_top)
    # Every node applies its bottom-model optimizer step
    pool.map(backward_pass_bottom)

Still, we are missing the code to evaluate the performance of the whole model, which is as follows:

1. Generate the bottom outputs of the test set
1. The host gathers the bottom outputs, feeds them to the top model and computes the scores. Note that, the host node is the only node with labels.

In [None]:
def generate_test_bottom_outputs(local_model: FlexModel, local_dataset: Dataset):
    """Run the node's bottom model on its next test batch without gradients.

    On the node whose ``actor_id`` is ``"host"`` a batch is a
    ``(features, labels)`` pair and the labels are stored under ``"test_y"``;
    on any other node the batch is the features alone.

    Args:
        local_model: Per-node storage holding ``"bottom_model"`` and
            ``"test_batch_sampler"``; receives ``"test_bottom_output"``.
        local_dataset: Unused; present to match the signature used with
            ``map``.
    """
    with torch.no_grad():
        model = local_model["bottom_model"]
        batch = next(local_model["test_batch_sampler"])
        is_host = local_model.actor_id == "host"
        if is_host:
            features, local_model["test_y"] = batch
        else:
            features = batch
        local_model["test_bottom_output"] = model(features.float())


def evalute_test_bottom_outputs(host_model: FlexModel, models):
    """Score the whole model on the test bottom outputs and print the result.

    The ``"test_bottom_output"`` entries of ``models`` are concatenated along
    dimension 1 in the iteration order of ``models`` and passed through the
    top model followed by a sigmoid. Accuracy uses a 0.5 threshold on the
    sigmoid outputs; AUC uses the sigmoid outputs directly.

    Args:
        host_model: Host storage holding ``"top_model"`` and ``"test_y"``.
        models: Mapping from node id to that node's model storage.
    """
    from sklearn.metrics import accuracy_score, roc_auc_score

    with torch.no_grad():
        joined_outputs = torch.cat(
            [models[node_id]["test_bottom_output"] for node_id in models], dim=1
        )
        probabilities = torch.sigmoid(host_model["top_model"](joined_outputs))
        probabilities = probabilities.squeeze(dim=1)
        predicted_labels = (probabilities >= 0.5).long()
        acc = accuracy_score(host_model["test_y"], predicted_labels)
        auc = roc_auc_score(host_model["test_y"], probabilities)
        print(f"test auc: {auc:.4f}, test acc: {acc:.4f}")

Putting it all together, a full VFL simulation in which the whole model is evaluated every 5 rounds looks as follows:

In [None]:
fl_rounds = 100
eval_every = 5  # the whole model is evaluated once every `eval_every` rounds

for round_number in range(1, fl_rounds + 1):
    # Draw a fresh random seed and hand the same value to every node, so that
    # all training samplers of this round are seeded identically
    global_seed = int(torch.empty((), dtype=torch.int32).random_().item())
    pool.map(setup_batch_sampler, seed=global_seed)

    # Train: iterate while every node still produced a batch
    while all(pool.map(generate_bottom_outputs)):
        host_pool.map(forward_bottom_outputs, pool)
        host_pool.map(backward_pass_top)
        pool.map(backward_pass_bottom)

    # Evaluate only on every `eval_every`-th round
    if round_number % eval_every != 0:
        continue
    print(f"FL Round {round_number}", end=" ")
    pool.map(generate_test_bottom_outputs)
    host_pool.map(evalute_test_bottom_outputs, pool)