# Fed-BioMed Researcher base example

Use for developing (autoreloads changes made across packages)

In [None]:
# Development helper: reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [1]:
from fedbiomed.researcher.requests import Requests 
# Researcher-side entry point used by the following cells to query the nodes.
req = Requests()
# List the datasets of the connected nodes. In the recorded run this logs a table
# per node and returns a dict mapping each node id to its dataset entries.
req.list(verbose=True)


2023-11-17 11:55:06,398 fedbiomed INFO - Starting researcher service...
2023-11-17 11:55:06,399 fedbiomed INFO - Waiting 3s for nodes to connect...
2023-11-17 11:55:07,241 fedbiomed DEBUG - Node: node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4 polling for the tasks
2023-11-17 11:55:09,409 fedbiomed DEBUG - Node: node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4 polling for the tasks
2023-11-17 11:55:09,418 fedbiomed INFO - 
 Node: node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4 | Number of Datasets: 1 
+--------+-------------+------------------------+----------------+--------------------+----------------------------------------------+----------------------+
| name   | data_type   | tags                   | description    | shape              | dataset_id                                   | dataset_parameters   |
| MNIST  | default     | ['#MNIST', '#dataset'] | MNIST database | [60000, 1, 28, 28] | dataset_837496a7-dddc-4265-8f9b-5f2e2d4674b5 |                      |
+--------+-------------+--------------

{'node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4': [{'name': 'MNIST',
   'data_type': 'default',
   'tags': ['#MNIST', '#dataset'],
   'description': 'MNIST database',
   'shape': [60000, 1, 28, 28],
   'dataset_id': 'dataset_837496a7-dddc-4265-8f9b-5f2e2d4674b5',
   'dataset_parameters': None}]}

In [2]:
# Search with an empty tag list. In the recorded run this selected the connected
# node and returned its MNIST dataset entry (node id -> list of datasets).
req.search(tags=[])

2023-11-17 11:55:09,451 fedbiomed DEBUG - Node: node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4 polling for the tasks
2023-11-17 11:55:09,452 fedbiomed INFO - Node selected for training -> node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4


{'node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4': [{'name': 'MNIST',
   'data_type': 'default',
   'tags': ['#MNIST', '#dataset'],
   'description': 'MNIST database',
   'shape': [60000, 1, 28, 28],
   'dataset_id': 'dataset_837496a7-dddc-4265-8f9b-5f2e2d4674b5',
   'dtypes': [],
   'dataset_parameters': None}]}

In [11]:
# NOTE(review): `MyTrainingPlan` is defined in a later cell ("Define an experiment
# model and parameters"). On a fresh kernel that cell must be run first, otherwise
# this call fails with a NameError.
# Submit the training plan to the nodes for approval; "testTP" is presumably the
# description attached to the request - TODO confirm. In the recorded run the
# result maps each node id to a boolean.
req.training_plan_approve(MyTrainingPlan, "testTP")

2023-11-17 11:57:33,791 fedbiomed DEBUG - Model file has been saved: /home/mvesin/GIT/fedbiomed/fedbiomed/var/tmp/tmp0wxrhlvk/model_c4c249fd-bb75-41ec-8b51-4f1e460acf1a.py
2023-11-17 11:57:33,800 fedbiomed INFO - Node node_aea3f6a8-4b68-4791-96a2-0a956c181ba6 is in WAITING status. Server is waiting for receiving a request from this node to convert it as ACTIVE. Node will be updated as DISCONNECTED soon if no request received.
2023-11-17 11:57:33,824 fedbiomed DEBUG - Node: node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4 polling for the tasks
2023-11-17 11:57:35,837 fedbiomed DEBUG - Node: node_aea3f6a8-4b68-4791-96a2-0a956c181ba6 polling for the tasks
2023-11-17 11:57:35,856 fedbiomed DEBUG - Node: node_aea3f6a8-4b68-4791-96a2-0a956c181ba6 polling for the tasks


{'node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4': True,
 'node_aea3f6a8-4b68-4791-96a2-0a956c181ba6': True}

## Setting the node up
Before running this notebook, a node must be configured:
1. `./scripts/fedbiomed_run node add`
  * Select option 2 (default) to add MNIST to the node
  * Confirm default tags by hitting "y" and ENTER
  * Pick the folder where MNIST is downloaded (this is due to a pytorch issue https://github.com/pytorch/vision/issues/3549)
  * Data must have been added (if you get a warning saying that data must be unique, it is because the data has already been added)
  
2. Check that your data has been added by executing `./scripts/fedbiomed_run node list`
3. Run the node using `./scripts/fedbiomed_run node start`. Wait until you get `Starting task manager`. It means the node is online.

## Define an experiment model and parameters

Declare a torch training plan MyTrainingPlan class to send for training on the node

In [3]:
import torch
import torch.nn as nn
from fedbiomed.common.training_plans import TorchTrainingPlan
from fedbiomed.common.data import DataManager
from torchvision import datasets, transforms


# Here we define the training plan to be used.
# You can use any class name (here 'MyTrainingPlan')
class MyTrainingPlan(TorchTrainingPlan):
    """Torch training plan training a small convolutional network on MNIST.

    The plan bundles the model (`Net`), the optimizer, the extra import
    statements it needs, the data loading logic and the training step.
    """

    def init_model(self, model_args):
        """Define and return the model.

        Args:
            model_args: dictionary of model arguments, forwarded to `Net`.

        Returns:
            A new `Net` instance.
        """
        return self.Net(model_args=model_args)

    def init_optimizer(self, optimizer_args):
        """Define and return the optimizer.

        Args:
            optimizer_args: dictionary of optimizer arguments; must contain
                the learning rate under the key "lr".

        Returns:
            An Adam optimizer over the parameters of `self.model()`.
        """
        return torch.optim.Adam(self.model().parameters(), lr=optimizer_args["lr"])

    def init_dependencies(self):
        """Declare and return the import statements this plan relies on."""
        deps = ["from torchvision import datasets, transforms"]
        return deps

    class Net(nn.Module):
        """Two convolutional layers followed by two fully connected layers."""

        def __init__(self, model_args):
            """Create the layers.

            Args:
                model_args: dictionary of model arguments; not read by this
                    example network.
            """
            super().__init__()
            self.conv1 = nn.Conv2d(1, 32, 3, 1)
            self.conv2 = nn.Conv2d(32, 64, 3, 1)
            self.dropout1 = nn.Dropout(0.25)
            self.dropout2 = nn.Dropout(0.5)
            # 9216 = 64 channels * 12 * 12, the flattened size obtained from
            # 1x28x28 inputs after the two 3x3 convolutions and the 2x2 pooling.
            self.fc1 = nn.Linear(9216, 128)
            self.fc2 = nn.Linear(128, 10)

        def forward(self, x):
            """Return the log-probabilities of the 10 classes for batch `x`."""
            # `nn.functional` is spelled out instead of the alias `F`: nothing
            # in the notebook imports `F`, while `nn` is already required by
            # the class statement itself.
            x = self.conv1(x)
            x = nn.functional.relu(x)
            x = self.conv2(x)
            x = nn.functional.relu(x)
            x = nn.functional.max_pool2d(x, 2)
            x = self.dropout1(x)
            x = torch.flatten(x, 1)
            x = self.fc1(x)
            x = nn.functional.relu(x)
            x = self.dropout2(x)
            x = self.fc2(x)

            output = nn.functional.log_softmax(x, dim=1)
            return output

    def training_data(self):
        """Build the data manager wrapping the MNIST training set.

        Returns:
            A `DataManager` over the MNIST training split, with shuffling
            enabled.
        """
        # Normalization constants are presumably the MNIST mean / std - TODO confirm.
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ])
        # download=False: the data is read from `self.dataset_path` and never
        # fetched from here.
        dataset1 = datasets.MNIST(self.dataset_path, train=True, download=False, transform=transform)
        train_kwargs = {'shuffle': True}
        return DataManager(dataset=dataset1, **train_kwargs)

    def training_step(self, data, target):
        """Compute the loss of the model on one batch.

        Args:
            data: batch of inputs fed to the model.
            target: expected class indices for `data`.

        Returns:
            The negative log-likelihood loss between the model output and
            `target`.
        """
        output = self.model().forward(data)
        loss = nn.functional.nll_loss(output, target)
        return loss


These groups of arguments are, respectively:
* `model_args`: a dictionary with the arguments related to the model (e.g. number of layers, features, etc.). This will be passed to the model class on the node side.
* `training_args`: a dictionary containing the arguments for the training routine (e.g. batch size, learning rate, epochs, etc.). This will be passed to the routine on the node side.

**NOTE:** typos and/or missing positional (required) arguments will raise an error. 🤓

In [12]:
# The example network takes no model hyper-parameters.
model_args = dict()

# Settings of the training routine.
training_args = dict(
    loader_args=dict(batch_size=48),
    optimizer_args=dict(lr=1e-3),
    epochs=1,
    dry_run=False,
    # Fast pass for development : only use ( batch_maxnum * batch_size ) samples
    batch_maxnum=100,
)

## Declare and run the experiment

- search nodes serving data for these `tags`, optionally filter on a list of node ID with `nodes`
- run a round of local training on nodes with the training plan defined in `training_plan_class` + federation with `aggregator`
- run for `round_limit` rounds, applying the `node_selection_strategy` between the rounds

In [13]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage

# Tags used to look up the datasets on the nodes.
tags = ['#MNIST', '#dataset']
# Number of training rounds; also passed as `round_limit` below.
rounds = 2

# Assemble the experiment from the training plan, its arguments and the
# aggregation settings.
exp = Experiment(
    tags=tags,
    model_args=model_args,
    training_plan_class=MyTrainingPlan,
    training_args=training_args,
    round_limit=rounds,
    aggregator=FedAverage(),
    node_selection_strategy=None,
)

2023-11-17 11:57:50,567 fedbiomed DEBUG - Node: node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4 polling for the tasks
2023-11-17 11:57:50,569 fedbiomed DEBUG - Node: node_aea3f6a8-4b68-4791-96a2-0a956c181ba6 polling for the tasks
2023-11-17 11:57:50,570 fedbiomed INFO - Node selected for training -> node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4
2023-11-17 11:57:50,571 fedbiomed INFO - Node selected for training -> node_aea3f6a8-4b68-4791-96a2-0a956c181ba6
2023-11-17 11:57:50,573 fedbiomed DEBUG - Model file has been saved: /home/mvesin/GIT/fedbiomed/fedbiomed/var/experiments/Experiment_0003/model_abb1574d-dc2c-443a-b06e-fdc0bc1fe27a.py
Secure RNG turned off. This is perfectly fine for experimentation as it allows for much faster training performance, but remember to turn it on and retrain one last time before production with ``secure_mode`` turned on.
2023-11-17 11:57:50,586 fedbiomed DEBUG - using native torch optimizer


Let's start the experiment.

By default, this function doesn't stop until all the `round_limit` rounds are done for all the nodes

In [14]:
# Display a summary table of the experiment's current arguments and their values.
exp.info()

Arguments             Values
--------------------  ------------------------------------------------------------
Tags                  ['#MNIST', '#dataset']
Nodes filter          None
Training Data         <fedbiomed.researcher.datasets.FederatedDataSet object at 0x
                      7f4b8861ffd0>
Aggregator            FedAverage
Strategy              <fedbiomed.researcher.strategies.default_strategy.DefaultStr
                      ategy object at 0x7f4b88648160>
Job                   <fedbiomed.researcher.job.Job object at 0x7f4b8861ffa0>
Aggregator Optimizer  None
Training Plan Class   <class '__main__.MyTrainingPlan'>
Model Arguments       {}
Training Arguments    {'loader_args': {'batch_size': 48}, 'optimizer_args': {'lr':
                       0.001}, 'epochs': 1, 'dry_run': False, 'batch_maxnum': 100,
                       'num_updates': None, 'test_ratio': 0.0, 'test_on_local_upda
                      tes': False, 'test_on_global_updates': False, 'test_metric':
         

In [17]:
# Submit the training plan for approval through the experiment; "TEST" is
# presumably the description attached to the request - TODO confirm.
# In the recorded run the result maps each node id to a boolean
# (one node answered True, the other False).
exp.training_plan_approve(MyTrainingPlan, "TEST")

2023-11-17 11:58:36,963 fedbiomed DEBUG - Model file has been saved: /home/mvesin/GIT/fedbiomed/fedbiomed/var/tmp/tmpdxaw792f/model_84981f45-1f0c-4b1a-b716-0508b981d310.py
2023-11-17 11:58:36,975 fedbiomed INFO - Node node_aea3f6a8-4b68-4791-96a2-0a956c181ba6 is in WAITING status. Server is waiting for receiving a request from this node to convert it as ACTIVE. Node will be updated as DISCONNECTED soon if no request received.
2023-11-17 11:58:36,996 fedbiomed DEBUG - Node: node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4 polling for the tasks


{'node_d8872b90-10fc-4fe9-9e0e-250778d4d6c4': True,
 'node_aea3f6a8-4b68-4791-96a2-0a956c181ba6': False}

In [None]:
# Start the training. Presumably this is the call that, by default, does not
# return until all `round_limit` rounds are done - TODO confirm.
exp.run()

Local training results for each round and each node are available via `exp.training_replies()` (index 0 to (`rounds` - 1) ).

For example you can view the training results for the last round below.

Different timings (in seconds) are reported for each dataset of a node participating in a round :
- `rtime_training` real time (clock time) spent in the training function on the node
- `ptime_training` process time (user and system CPU) spent in the training function on the node
- `rtime_total` real time (clock time) spent in the researcher between sending the request and handling the response, at the `Job()` layer

In [None]:
# Keys of the replies container, one per stored training round.
print("\nList the training rounds : ", exp.training_replies().keys())

print("\nList the nodes for the last training round and their timings : ")
# Replies of the last round (index `rounds - 1`).
round_data = exp.training_replies()[rounds - 1]
for r in round_data.values():
    # The four spaces before each "\n" are kept so the printed text matches
    # the previous output exactly.
    print(
        "\t- {id} :    "
        "\n\t\trtime_training={rtraining:.2f} seconds    "
        "\n\t\tptime_training={ptraining:.2f} seconds    "
        "\n\t\trtime_total={rtotal:.2f} seconds".format(
            id=r['node_id'],
            rtraining=r['timing']['rtime_training'],
            ptraining=r['timing']['ptime_training'],
            rtotal=r['timing']['rtime_total'],
        )
    )
print('\n')

Federated parameters for each round are available via `exp.aggregated_params()` (index 0 to (`rounds` - 1) ).

For example you can view the federated parameters for the last round of the experiment :

In [None]:
# Keys of the aggregated parameters container, one per stored round.
print(f"\nList the training rounds :  {exp.aggregated_params().keys()}")

print("\nAccess the federated params for the last training round :")
# Entry of the last round (index `rounds - 1`): path of the saved parameters,
# then the names of the parameters it holds.
print(f"\t- params_path:  {exp.aggregated_params()[rounds - 1]['params_path']}")
print(f"\t- parameter data:  {exp.aggregated_params()[rounds - 1]['params'].keys()}")


Feel free to run other sample notebooks or try your own models :D