# Testing federated learning
The purpose of this notebook is to run a data analysis pipeline on vantage6.

The outcome of the pipeline is irrelevant for now. This notebook is mainly a proof of concept (POC) for defining a data analysis pipeline client-side and using it to train a model on a federated dataset.

In [2]:
import os

import numpy as np
import vantage6.client as vtgclient
from sklearn import pipeline
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

from run_task import print_result, print_logs

In [3]:
# --- Server connection ------------------------------------------------------
HOST = 'http://localhost'
PORT = 5001
IMAGE = 'localhost:5000/v6-carrier-py'

# Credentials are taken from the environment when available so that real
# secrets never have to be committed with the notebook. The fallbacks keep the
# previous behaviour for a local development server.
USERNAME = os.environ.get('VANTAGE6_USERNAME', 'admin')
PASSWORD = os.environ.get('VANTAGE6_PASSWORD', 'admin')

# --- Collaboration layout ---------------------------------------------------
COLLABORATION_ID = 1
MASTER_ORGANIZATION = [1]      # organization(s) that receive master tasks
RPC_ORGANIZATIONS = [2, 3, 6]  # organizations that receive non-master tasks

# --- Dataset columns --------------------------------------------------------
# Forwarded as the `keys` argument of the `fit_pipeline` task.
# NOTE(review): presumably the identifying columns used to link records
# between organizations -- confirm against the algorithm implementation.
KEYS = ['GBAGeboorteJaar', 'GBAGeboorteMaand', 'GBAGeboorteDag', 'GBAGeslacht',
        'GBAPostcode', 'GBAHuisnummer', 'GBAToev']

# Columns used as model inputs (forwarded as `features`).
FEATURES = [
    'height', 'weight', 'bmi', 'Age', 'n_smokingcat4', 'WOZ', 'N_ALCOHOL_CAT',
    'ZVWKOPHOOGFACTOR_2010', 'ZVWKHUISARTS_2010', 'ZVWKFARMACIE_2010',
    'ZVWKZIEKENHUIS_2010', 'ZVWKPARAMEDISCH_2010', 'ZVWKZIEKENVERVOER_2010',
    'ZVWKBUITENLAND_2010', 'ZVWKOVERIG_2010', 'ZVWKEERSTELIJNSPSYCHO_2010',
    'ZVWKGGZ_2010', 'ZVWKHULPMIDDEL_2010', 'ZVWKOPHOOGFACTOR_2011',
    'ZVWKHUISARTS_2011',
]

# Column the model is trained to predict (forwarded as `target`).
TARGET = 'N_CVD'

# --- Authenticated client ---------------------------------------------------
client = vtgclient.Client(HOST, PORT)
client.authenticate(USERNAME, PASSWORD)
# NOTE(review): passing None presumably runs the client without end-to-end
# encryption -- confirm this is only used against a development server.
client.setup_encryption(None)

In [4]:
# Smoke test: request the column names from every RPC organization to check
# that a simple algorithm call succeeds before running the real pipeline.
column_names_input = {'method': 'column_names', 'master': False}
task = client.post_task(
    'column_names',
    image=IMAGE,
    collaboration_id=COLLABORATION_ID,
    organization_ids=RPC_ORGANIZATIONS,
    input_=column_names_input,
)

In [6]:
# Results are not available immediately after the task is posted; allow a few
# seconds before running this cell.
posted_task_id = task['id']
result = client.get_results(task_id=posted_task_id)

# Show what each organization returned, followed by its log output.
print_result(result)
print_logs(result)

0: ['GBAGeboorteJaar', 'GBAGeboorteMaand', 'GBAGeboorteDag', 'GBAGeslacht', 'GBAPostcode', 'GBAHuisnummer', 'GBAToev', 'ZVWKOPHOOGFACTOR_2010', 'ZVWKHUISARTS_2010', 'ZVWKFARMACIE_2010', 'ZVWKZIEKENHUIS_2010', 'ZVWKPARAMEDISCH_2010', 'ZVWKZIEKENVERVOER_2010', 'ZVWKBUITENLAND_2010', 'ZVWKOVERIG_2010', 'ZVWKEERSTELIJNSPSYCHO_2010', 'ZVWKGGZ_2010', 'ZVWKHULPMIDDEL_2010', 'ZVWKOPHOOGFACTOR_2011', 'ZVWKHUISARTS_2011', 'ZVWKFARMACIE_2011', 'ZVWKZIEKENHUIS_2011', 'ZVWKPARAMEDISCH_2011', 'ZVWKZIEKENVERVOER_2011', 'ZVWKBUITENLAND_2011', 'ZVWKOVERIG_2011', 'ZVWKEERSTELIJNSPSYCHO_2011', 'ZVWKGGZ_2011', 'ZVWKHULPMIDDEL_2011', 'ZVWKOPHOOGFACTOR_2012', 'ZVWKHUISARTS_2012', 'ZVWKFARMACIE_2012', 'ZVWKZIEKENHUIS_2012', 'ZVWKPARAMEDISCH_2012', 'ZVWKZIEKENVERVOER_2012', 'ZVWKBUITENLAND_2012', 'ZVWKOVERIG_2012', 'ZVWKEERSTELIJNSPSYCHO_2012', 'ZVWKGGZ_2012', 'ZVWKHULPMIDDEL_2012', 'ZVWKOPHOOGFACTOR_2013', 'ZVWKHUISARTS_2013', 'ZVWKFARMACIE_2013', 'ZVWKZIEKENHUIS_2013', 'ZVWKPARAMEDISCH_2013', 'ZVWKZIEKENVER

In [29]:
# Pipeline that is shipped to the nodes: impute -> standardize -> regress.
# NOTE(review): the imputer is configured to treat 0 as the missing-value
# marker, but the recorded output of this cell (and the task log further down)
# shows a default `SimpleImputer()` with `missing_values=nan`. Confirm which
# setting is intended and re-run so the stored output matches the code.
imputer = SimpleImputer(missing_values=0)
scaler = StandardScaler()
regressor = LinearRegression()
pipe = pipeline.make_pipeline(imputer, scaler, regressor)

# Display the full parameter set of the assembled pipeline.
pipe.get_params()

{'memory': None,
 'steps': [('simpleimputer', SimpleImputer()),
  ('standardscaler', StandardScaler()),
  ('linearregression', LinearRegression())],
 'verbose': False,
 'simpleimputer': SimpleImputer(),
 'standardscaler': StandardScaler(),
 'linearregression': LinearRegression(),
 'simpleimputer__add_indicator': False,
 'simpleimputer__copy': True,
 'simpleimputer__fill_value': None,
 'simpleimputer__missing_values': nan,
 'simpleimputer__strategy': 'mean',
 'simpleimputer__verbose': 0,
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'linearregression__copy_X': True,
 'linearregression__fit_intercept': True,
 'linearregression__n_jobs': None,
 'linearregression__normalize': False}

In [27]:
# List the estimator class behind each pipeline step, in pipeline order.
for _step_name, estimator in pipe.steps:
    print(type(estimator).__name__)

SimpleImputer
StandardScaler
LinearRegression


In [49]:
# Arguments forwarded to the algorithm's `fit_pipeline` method.
fit_kwargs = {
    'pipe': pipe,
    'features': FEATURES,
    'target': TARGET,
    'keys': KEYS,
    # NOTE(review): organization 3 is excluded in addition to the master
    # organization; the reason is not visible here -- document or move the
    # literal to the configuration cell.
    'exclude_orgs': MASTER_ORGANIZATION + [3],
}
fit_input = {'method': 'fit_pipeline', 'master': True, 'kwargs': fit_kwargs}

# Post the task as a master task to the master organization only.
task = client.post_task(
    'fit_pipeline',
    image=IMAGE,
    collaboration_id=COLLABORATION_ID,
    organization_ids=MASTER_ORGANIZATION,
    input_=fit_input,
)

In [56]:
# Fetch whatever the most recently posted task has produced so far, then show
# the returned values and the per-organization logs.
posted_task_id = task['id']
result = client.get_results(task_id=posted_task_id)

print_result(result)
print_logs(result)

0: None
Log for organization 1
info > wrapper for v6_carrier_py
info > Reading input file /mnt/data/task-000000233/input
info > Reading token file '/mnt/data/task-000000233/token'
info > Using '/mnt/data/database.csv' as database
info > Dispatching ...
info > Module 'v6_carrier_py' imported!
info > Running a master-container
info > Working with collaboration_id <1>
info > Training pipeline with the following steps: {'simpleimputer': SimpleImputer(), 'standardscaler': StandardScaler(), 'linearregression': LinearRegression()}
info > Organizations in my collaboration: [{'_public_key': '', 'address1': 'my address 1, Amsterdam', 'address2': None, 'country': 'the Netherlands', 'domain': None, 'id': 1, 'name': 'NLEsC', 'zipcode': '1234ab'}, {'_public_key': '', 'address1': 'my address 1, Amsterdam', 'address2': None, 'country': 'the Netherlands', 'domain': None, 'id': 2, 'name': 'organization for node 0', 'zipcode': '1234ab'}, {'_public_key': '', 'address1': 'my address 1, Amsterdam', 'address

In [11]:
# Value returned by the algorithm, taken from the first entry of the fetched
# results.
first_entry = result[0]
first_entry['result']

24.943656525556797