# Testing federated learning
The purpose of this notebook is to run a data analysis pipeline on vantage6.

The outcome of the pipeline is irrelevant for now.  This notebook is mainly a proof of concept (POC) for defining a data analysis pipeline client-side and using it to train a model on a federated dataset.

In [1]:
import os
from pathlib import Path

import numpy as np
from sklearn import pipeline
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, GammaRegressor
from sklearn.preprocessing import StandardScaler
import vantage6.client as vtgclient

from run_task import print_result, print_logs

In [2]:
# Setup remote host
#
# Connection settings for the remote vantage6 server. The credentials and the
# private-key location can be overridden through environment variables so that
# real secrets never have to be written into the notebook; the fallbacks are
# the placeholder values this notebook has always used.
#
# NOTE: the "Setup local host" cell further down reassigns HOST, PORT, IMAGE,
# USERNAME, PASSWORD, COLLABORATION_ID, ORGANIZATION_IDS and `client`, so on a
# top-to-bottom run the client created here is replaced by the local one.
HOST = 'http://host'
PORT = 5000
IMAGE = 'carrier-harbor2.carrier-mu.surf-hosted.nl/carrier/v6-test-py'
USERNAME = os.environ.get('V6_REMOTE_USERNAME', 'dummy')
PASSWORD = os.environ.get('V6_REMOTE_PASSWORD', 'fake')
PRIVATE_KEY_PATH = Path(os.environ.get('V6_REMOTE_PRIVATE_KEY', '/home/jovyan/app/node_rsa'))
COLLABORATION_ID = 1
ORGANIZATION_IDS = [1]

# Organization the master `fit_pipeline` task is posted to (see the
# `fit_pipeline` cell, which also lists it in `exclude_orgs`).
MASTER_ORGANIZATION = [1]
# Presumably the organizations that run the RPC sub-tasks; not referenced by
# any cell visible in this notebook -- TODO confirm.
RPC_ORGANIZATIONS = [2, 3, 6]

# Name passed as `target` to the `fit_pipeline` method.
TARGET = 'N_CVD'

client = vtgclient.Client(HOST, PORT)
client.authenticate(USERNAME, PASSWORD)

# The private key is handed to the client for encryption; the local setup
# passes None here instead.
client.setup_encryption(PRIVATE_KEY_PATH)

In [3]:
# Setup local host
#
# Connection settings for the local vantage6 server. Running this cell
# replaces HOST, PORT, IMAGE, USERNAME, PASSWORD, COLLABORATION_ID,
# ORGANIZATION_IDS and `client` from the remote-host setup cell above.
HOST = 'http://vserver'
PORT = 5000
IMAGE = 'harbor2.vantage6.ai/testing/v6-test-py'

# Credentials can be overridden through environment variables; the fallbacks
# are the values this notebook has always used for the local server.
USERNAME = os.environ.get('V6_LOCAL_USERNAME', 'admin')
PASSWORD = os.environ.get('V6_LOCAL_PASSWORD', 'admin')

COLLABORATION_ID = 1
ORGANIZATION_IDS = [2, 3, 6]

client = vtgclient.Client(HOST, PORT)
client.authenticate(USERNAME, PASSWORD)

# No private key is supplied for the local server (presumably this runs the
# collaboration unencrypted -- TODO confirm against the server configuration).
client.setup_encryption(None)

In [14]:
# Executing a task
#
# Post a master `column_names` task to organization 2, asking it to leave
# organization 3 out. The image literal is the same value the local-host setup
# cell assigns to IMAGE.
column_names_input = {
    'method': 'column_names',
    'master': True,
    'kwargs': {'output_format': 'json', 'exclude_orgs': [3]},
}
task = client.post_task(
    'mytask',
    image='harbor2.vantage6.ai/testing/v6-test-py',
    collaboration_id=COLLABORATION_ID,
    organization_ids=[2],
    input_=column_names_input,
)

In [15]:
# Display the task record that `client.post_task` returned.
task

{'parent': None,
 'children': None,
 'run_id': 4,
 'image': 'harbor2.vantage6.ai/testing/v6-test-py',
 'id': 7,
 'database': '',
 'results': [{'id': 7, 'link': '/api/result/7', 'methods': ['GET', 'PATCH']}],
 'name': 'mytask',
 'description': '',
 'initiator': 2,
 'complete': False,
 'collaboration': {'id': 1,
  'link': '/api/collaboration/1',
  'methods': ['DELETE', 'GET', 'PATCH']}}

In [18]:
# Results only become available a few seconds after the task was posted.
task_id = task['id']
result = client.get_results(task_id=task_id)

# Print the algorithm output first, then the logs.
for report in (print_result, print_logs):
    report(result)

0: {'s2', 's4', 'bmi', 's3', 's1', 's5', 'bp', 's6', 'sex', 'age'}
Log for organization 2
info > wrapper for v6_test_py
info > Reading input file /mnt/data/task-000000007/input
info > No data format specified. Assuming input data is pickle format
info > Reading token file '/mnt/data/task-000000007/token'
info > Using '/mnt/data/database.csv' as database
info > Dispatching ...
info > Module 'v6_test_py' imported!
info > Running a master-container
info > Working with collaboration_id <1>
info > Calling column names tasks on all organizations within collaboration except [3]
info > Organizations in my collaboration: [{'address1': 'my address 1, Amsterdam', 'address2': None, 'collaborations': [{'id': 1, 'link': '/api/collaboration/1', 'methods': ['DELETE', 'GET', 'PATCH']}], 'country': 'the Netherlands', 'domain': '', 'id': 2, 'name': 'dummy', 'nodes': [{'id': 3, 'link': '/api/node/3', 'methods': ['DELETE', 'GET', 'PATCH']}], 'public_key': 'LS0tLS1CRUdJTiBQVUJMSUMgS0VZLS0tLS0KTUlJQ0lqQU5CZ2

In [40]:
# Pipeline that is sent to the nodes: the imputer treats 0 as the
# missing-value marker, then features are standardised and a linear
# regression is fitted.
pipeline_steps = [
    SimpleImputer(missing_values=0),
    StandardScaler(),
    LinearRegression(),
]
pipe = pipeline.make_pipeline(*pipeline_steps)

In [41]:
# Post the master `fit_pipeline` task to the master organization. The master
# organization and organization 3 are listed in `exclude_orgs`.
# NOTE(review): FEATURES and KEYS are not defined in any cell visible in this
# notebook -- confirm they are set before this cell runs.
fit_pipeline_input = {
    'method': 'fit_pipeline',
    'master': True,
    'kwargs': {
        'pipe': pipe,
        'features': FEATURES,
        'target': TARGET,
        'keys': KEYS,
        'exclude_orgs': MASTER_ORGANIZATION + [3],
    },
}
task = client.post_task(
    'fit_pipeline',
    image=IMAGE,
    collaboration_id=COLLABORATION_ID,
    organization_ids=MASTER_ORGANIZATION,
    input_=fit_pipeline_input,
)

In [43]:
# Fetch the results of the task posted above, then print the algorithm
# output followed by the logs.
task_id = task['id']
result = client.get_results(task_id=task_id)

for report in (print_result, print_logs):
    report(result)

0: 25.91429672367083
Log for organization 1
info > wrapper for v6_carrier_py
info > Reading input file /mnt/data/task-000000671/input
info > No data format specified. Assuming input data is pickle format
info > Reading token file '/mnt/data/task-000000671/token'
info > Using '/mnt/data/database.csv' as database
info > Dispatching ...
info > Module 'v6_carrier_py' imported!
info > Running a master-container
info > Working with collaboration_id <1>
info > Training pipeline with the following steps: {'simpleimputer': SimpleImputer(), 'standardscaler': StandardScaler(), 'linearregression': LinearRegression()}
info > Organizations in my collaboration: [{'_public_key': '-----BEGIN PUBLIC KEY-----\nMIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIICCgKCAgEApKyNJLQXvV4wj2XrTZrA\nl4l4vBjBJZJIMo6A2RVFhdnMIyJ/wJ2haWJY6nZKJMj94L9Q2FUzlvQvRAHbwQR/\nAAbjk1yhFaAeQbUXWnwnEZ0Le4eZKXlcZ/ryRKvCo+VP6et5WonWOSjInAVEwxB1\nZ98Nd6RW97zXDAfENLX2sN6+NDA2x+O1H38VljDfu/hu5YjBzdpRi9w21F9vopgd\n86zYV5NOYwYBI198RJD7HP8bNaBol48hoeB

In [28]:
# The actual result of the algorithm, taken from the first entry of `result`.
first_entry = result[0]
first_entry['result']