⚠️ First, verify that the Datasites are already running. If they are not, start them by running the following command in a new terminal session:

$ python launch_datasites.py

In [7]:
import syft as sy

In [8]:
from datasites import DATASITE_URLS

# Log in to every configured datasite with the researcher account and keep
# one client per datasite, keyed by the datasite's name.
datasites = {
    name: sy.login(url=url, email="researcher@ieeta.pt", password="****")
    for name, url in DATASITE_URLS.items()
}

Logged into <silo1: High side Datasite> as <researcher@ieeta.pt>
Logged into <silo2: High side Datasite> as <researcher@ieeta.pt>
Logged into <silo3: High side Datasite> as <researcher@ieeta.pt>


In [9]:
# Take the first asset of silo1's first dataset and work locally on its
# `.mock` counterpart.
silo1_first_asset = datasites["silo1"].datasets[0].assets[0]
mock_data = silo1_first_asset.mock


In [10]:
# Display the mock table (rich repr of the cell's last expression) to inspect
# the available columns before writing the experiment.
mock_data

Unnamed: 0,patient_id,sex,diagnostic_delay,age_at_diagnosis,FVC,weight,bmi,measure_days_from_diagnosis,active_calories,basal_calories,...,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12
0,b83c1c5e,0,2.000000,65,85.0,,26.196187,287.0,395.3,1665.6,...,4,3,3,0,2,2,1,4,3,4
1,0b59fa1e,1,0.000000,44,44.0,83.0,27.660096,1109.0,478.7,1723.5,...,4,0,2,2,4,2,2,4,2,4
2,2d348f42,1,0.580822,28,101.0,66.0,18.424036,312.5,828.3,1765.2,...,0,0,1,3,4,2,1,2,4,2
3,6212bcc2,1,2.416438,47,88.0,80.0,27.120316,228.0,662.7,1640.9,...,4,2,1,4,2,0,3,3,4,4
4,5c65b4d1,1,0.413699,15,107.0,79.0,28.040378,208.9,103.9,1702.2,...,4,2,1,1,0,0,0,1,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2d348f42,1,0.375342,43,105.0,54.0,21.100705,53.0,1544.3,1731.0,...,1,4,2,2,0,2,1,4,3,1
496,0615d2f8,1,1.334247,64,105.0,64.0,28.068918,326.0,403.9,1614.7,...,4,2,0,1,1,1,1,1,4,4
497,0d42590b,0,0.747945,51,66.0,54.0,32.724617,233.5,503.0,1726.0,...,4,0,4,0,1,2,0,3,3,4
498,c0d49276,1,1.408219,65,101.0,57.0,33.346481,154.9,82.8,1430.0,...,4,0,0,2,2,2,4,4,3,3


In [11]:
import numpy as np
import numpy.typing as npt
from typing import Union, TypeVar, Any, TypedDict, TypeVar
import pandas as pd

# --- Type aliases used in the experiment's annotations -----------------------

# The TypeVar name must match the variable it is assigned to; bound to the
# pandas type it stands for.
DataFrame = TypeVar("DataFrame", bound=pd.DataFrame)
NDArray = npt.NDArray[Any]
NDArrayInt = npt.NDArray[np.int_]
# `np.float_` was removed in NumPy 2.0; `np.float64` is the type it aliased.
NDArrayFloat = npt.NDArray[np.float64]
Dataset = TypeVar("Dataset", bound=tuple[NDArrayFloat, NDArrayInt])


class DataParamsDict(TypedDict):
    """Describes which columns of the input table are used by the experiment."""

    # Name of the column used as the prediction target.
    target: str
    # Columns removed from the table before building the feature matrix.
    ignored_columns: list[Any]


class ModelParamsDict(TypedDict):
    """Model state and hyper-parameters exchanged between training rounds."""

    # Pickled classifier from a previous round; None when no model exists yet.
    model: Union[bytes, None]
    # Number of trees used when a new forest is created.
    n_base_estimators: int
    # Number of trees added when training continues from an existing model.
    n_incremental_estimators: int
    # Forwarded to `train_test_split(train_size=...)`.
    train_size: float
    # Number of rows used for training in the last round; None before any round.
    sample_size: Union[int, None]


DataParams = TypeVar("DataParams", bound=DataParamsDict)
ModelParams = TypeVar("ModelParams", bound=ModelParamsDict)

def ml_experiment(data: DataFrame, dataParams: DataParams, modelParams: ModelParams) -> dict:
    """Fit a random forest on `data`, or add trees to a previously trained one.

    Args:
        data: Table containing the target column and the feature columns.
        dataParams: Mapping with ``"target"`` (name of the label column) and
            ``"ignored_columns"`` (columns removed from the features).
        modelParams: Mapping with ``"model"`` (pickled classifier bytes, or a
            falsy value to start a new forest), ``"n_base_estimators"`` (trees
            in a new forest), ``"n_incremental_estimators"`` (trees added when
            continuing from an existing model) and ``"train_size"`` (forwarded
            to ``train_test_split``).

    Returns:
        A ``ModelParamsDict`` holding the pickled fitted classifier in
        ``"model"``, the classifier's current tree count in
        ``"n_base_estimators"``, the unchanged ``"n_incremental_estimators"``
        and ``"train_size"``, and the number of training rows in
        ``"sample_size"``.
    """
    print("ml_experiment")
    # Imports are kept inside the function; presumably so the body stays
    # self-contained when it is submitted through `sy.syft_function`.
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    import cloudpickle
    import pickle

    def preprocess(data: DataFrame) -> tuple[pd.DataFrame, pd.Series]:
        """Return the training features and labels drawn from `data`."""
        target = dataParams["target"]

        # Rows without a label cannot be used for supervised training.
        data = data.dropna(subset=[target])

        # Separate the labels from the features. The target is always removed
        # from the features, even if the caller did not list it in
        # "ignored_columns"; otherwise the label would leak into the model
        # inputs. A copy is used so the caller's list is never mutated.
        excluded_columns = list(dataParams["ignored_columns"])
        if target not in excluded_columns:
            excluded_columns.append(target)
        y = data[target]
        X = data.drop(excluded_columns, axis=1)

        # Keep only the stratified training share; the held-out part is unused.
        X_train, _, y_train, _ = train_test_split(
            X, y, train_size=modelParams["train_size"], stratify=y, random_state=42
        )

        return X_train, y_train

    def train(model, training_data: tuple[pd.DataFrame, pd.Series]) -> RandomForestClassifier:
        """Fit `model` on the (features, labels) pair and return it."""
        X_train, y_train = training_data
        model.fit(X_train, y_train)
        return model

    training_data = preprocess(data)
    print("training_data", len(training_data[0]))

    if modelParams["model"]:
        # NOTE: unpickling executes arbitrary code; only pass model bytes that
        # come from a trusted source.
        clf = pickle.loads(modelParams["model"])
        # New forests are created with warm_start=True below, so raising
        # n_estimators makes the next fit() add trees instead of refitting.
        clf.n_estimators += modelParams["n_incremental_estimators"]
    else:
        clf = RandomForestClassifier(
            random_state=42,
            n_estimators=modelParams["n_base_estimators"],
            warm_start=True,
        )

    clf = train(clf, training_data)

    return ModelParamsDict(
        model=cloudpickle.dumps(clf),
        # Current total number of trees, including any added in this round.
        n_base_estimators=clf.n_estimators,
        n_incremental_estimators=modelParams["n_incremental_estimators"],
        train_size=modelParams["train_size"],
        sample_size=len(training_data[0]),
    )


In [12]:
# Questionnaire columns Q1..Q12. Q1 is the prediction target; all of them,
# together with the identifier columns, are excluded from the features.
questions = [f"Q{number}" for number in range(1, 13)]
dataParams = dict(
    target="Q1",
    ignored_columns=["patient_id", "source", *questions],
)

# Initial round: no model yet, so a new forest of `n_base_estimators` trees is
# created; `sample_size` is filled in by the experiment's return value.
modelParams = dict(
    model=None,
    n_base_estimators=100,
    n_incremental_estimators=1,
    train_size=0.2,
    sample_size=None,
)

# Dry run on the mock data: the first round trains a new forest.
model_data1 = ml_experiment(
    data=mock_data,
    dataParams=dataParams,
    modelParams=modelParams,
)



ml_experiment
training_data 100


In [13]:
import pickle
# Chain two more local rounds: each call receives the parameters (including
# the pickled model) returned by the previous one.
model_data2 = ml_experiment(mock_data, dataParams, model_data1)
model_data3 = ml_experiment(mock_data, dataParams, model_data2)

# Show how many trees each forest holds after its round.
for round_params in (model_data2, model_data3):
    print(pickle.loads(round_params["model"]).n_estimators)

ml_experiment
training_data 100
ml_experiment
training_data 100
101
102


In [14]:
from syft.service.policy.policy import MixedInputPolicy

# Submit `ml_experiment` as a code request to every datasite, one project per
# datasite.
for site, client in datasites.items():
    data_asset = client.datasets[0].assets[0]

    # The policy ties the function to this datasite's asset and declares the
    # two parameter mappings as dict inputs.
    fl_input_policy = MixedInputPolicy(
        client=client,
        data=data_asset,
        dataParams=dict,
        modelParams=dict,
    )
    syft_fl_experiment = sy.syft_function(input_policy=fl_input_policy)(ml_experiment)

    ml_training_project = sy.Project(
        name="ML Experiment for FL",
        description="""Test project to run a ML experiment""",
        members=[client],
    )
    ml_training_project.create_code_request(syft_fl_experiment, client)
    project = ml_training_project.send()

In [15]:
from utils import check_status_last_code_requests
# Report the status of the latest code request on each datasite (helper from
# the local `utils` module); the training cell below presumably requires every
# datasite to show "approved" — confirm before running it.
check_status_last_code_requests(datasites)

Datasite: silo1


"<class 'syft.service.code.user_code.UserCodeStatusCollection'> approved"

Datasite: silo2


"<class 'syft.service.code.user_code.UserCodeStatusCollection'> approved"

Datasite: silo3


"<class 'syft.service.code.user_code.UserCodeStatusCollection'> approved"

In [16]:
fl_epochs = 3

# Parameters carried from one training call to the next. Working on a copy
# leaves `modelParams` untouched, so this cell can be re-run from the same
# starting point.
current_model_params = dict(modelParams)

for epoch in range(fl_epochs):
    print(f"\nEpoch {epoch + 1}/{fl_epochs}")

    for name, datasite in datasites.items():
        print(f"Training on {name}...")

        # Use each datasite's own first dataset, matching the cell that
        # created the code requests (a hard-coded "silo1" lookup only fits
        # one datasite).
        data_asset = datasite.datasets[0].assets[0]
        print(data_asset)
        print(dataParams)

        # Print the parameters without dumping the pickled model bytes.
        serialized_model = current_model_params["model"]
        model_summary = (
            None if serialized_model is None else f"<{len(serialized_model)} bytes>"
        )
        print({**current_model_params, "model": model_summary})

        modelParamsDict = datasite.code.ml_experiment(
            data=data_asset, dataParams=dataParams, modelParams=current_model_params
        ).get_from(datasite)

        # Feed the returned model into the next call so each site/epoch extends
        # the same forest instead of training a new one from scratch.
        # NOTE(review): assumes `.get_from()` returns the plain dict produced
        # by `ml_experiment` — confirm against the Syft version in use.
        current_model_params = modelParamsDict
    


Epoch 1/3
Training on silo1...
Asset(name='Asset', server_uid='a9d0d2982bac4b239c96cd0ba81b799c', action_id='cd4f1bf4105749a698a8f05f950940dd')
{'target': 'Q1', 'ignored_columns': ['patient_id', 'source', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12']}
{'model': None, 'n_base_estimators': 100, 'n_incremental_estimators': 1, 'train_size': 0.2, 'sample_size': None}
