# Fedbiomed Researcher

Use for developing (autoreloads changes made across packages)

In [1]:
%load_ext autoreload
%autoreload 2

## Setting the client up

Install the celeba dataset with the help of the readme.md file inside `notebooks/data_for_examples/Celeba`  
For the sake of testing the resulting model, this file uses the data from node 1 and 2 for training and the data from node 3 to test.
You can create multiple nodes by adding a `config` parameter to the command controlling the nodes, for example:  
creating 2 nodes for training :  
 - `./scripts/fedbiomed_run node start config node1.ini`
 - `./scripts/fedbiomed_run node start config node2.ini`  
 
adding data for each node :  
 - `./scripts/fedbiomed_run node add config node1.ini`
 - `./scripts/fedbiomed_run node add config node2.ini`

At least one node must be configured beforehand:
1. `./scripts/fedbiomed_run node add config (ini file)`
  * Select option 3 (images) to add an image dataset to the node
  * Add a name and a tag for the dataset (the tag should contain '#celeba', as it is the tag used for this training) and finally add a description
  * Pick a data folder from the 3 generated inside data/Celeba/celeba_preprocessed
  * Data must have been added (if you get a warning saying that data must be unique, it is because it has already been added)
  
2. Check that your data has been added by executing `./scripts/fedbiomed_run node list`
3. Run the node using `./scripts/fedbiomed_run node start`. Wait until you get `Connected with result code 0`, which means you are online.

For the sake of testing the resulting model, only nodes 1 and 2 were started during training; the data from node 3 is used to test the model.

## Create an experiment to train a model on the data found

Declare a torch.nn Net class to send for training on the node

In [2]:
from fedbiomed.researcher.environ import TMP_DIR
import tempfile
# Create a scratch directory under TMP_DIR to hold the exported model file.
# The TemporaryDirectory object must stay referenced (here via `tmp_dir_model`):
# the directory is deleted when that object is cleaned up.
tmp_dir_model = tempfile.TemporaryDirectory(dir=f"{TMP_DIR}/")
# Path of the Python file that the following %%writefile cell fills in.
model_file = f"{tmp_dir_model.name}/CelebaClass.py"

Note : write **only** the code to export in the following cell

In [3]:
%%writefile "$model_file"

import torch
import torch.nn as nn
from   fedbiomed.common.torchnn import TorchTrainingPlan
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from PIL import Image
import os


class Net(TorchTrainingPlan):
    """Training plan: a small CNN with two output classes, trained on the 'Smiling' label.

    The class bundles everything that is exported for training:
    the network itself (`__init__` / `forward`), the dataset wrapper
    (`CelebaDataset`), the data loader factory (`training_data`) and the
    loss computation (`training_step`).
    """

    def __init__(self):
        super(Net, self).__init__()
        # Convolution layers: four 3x3 convolutions with stride 1.
        # Each one is followed by a 2x2 max-pool in forward().
        self.conv1 = nn.Conv2d(3, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 32, 3, 1)
        self.conv3 = nn.Conv2d(32, 32, 3, 1)
        self.conv4 = nn.Conv2d(32, 32, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # Classifier.
        # 3168 = 32 channels * 11 * 9: the flattened feature-map size obtained
        # for 218x178 input images after the four conv + pool stages.
        self.fc1 = nn.Linear(3168, 128)
        self.fc2 = nn.Linear(128, 2)

        # Here we define the custom dependencies that will be needed by our custom Dataloader
        deps = ["from torch.utils.data import Dataset, DataLoader",
                "from torchvision import transforms",
                "import pandas as pd",
                "from PIL import Image",
                "import os",
                "import numpy as np"]
        self.add_dependency(deps)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for a batch `x`."""
        # Four identical feature-extraction stages: conv -> 2x2 max-pool -> ReLU
        x = self.conv1(x)
        x = F.max_pool2d(x, 2)
        x = F.relu(x)

        x = self.conv2(x)
        x = F.max_pool2d(x, 2)
        x = F.relu(x)

        x = self.conv3(x)
        x = F.max_pool2d(x, 2)
        x = F.relu(x)

        x = self.conv4(x)
        x = F.max_pool2d(x, 2)
        x = F.relu(x)

        # Classifier head: flatten everything but the batch dimension
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)

        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

    class CelebaDataset(Dataset):
        """Custom Dataset for loading CelebA face images.

        We don't load the full image data up front; each image is read from disk
        in `__getitem__`. In our case each image is 218*178 with 3 colors and
        there are 67533 images, which would take at least 7G of RAM. Loading
        images on demand takes more time during training but keeps RAM usage low.

        Args:
            txt_path: path to a tab-separated file whose first column (used as
                index) holds the image file names and which has a 'Smiling'
                column used as the label.
            img_dir: directory containing the image files.
            transform: optional callable applied to the tensor produced by
                `transforms.ToTensor()`. `None` (default) leaves it unchanged.
        """

        def __init__(self, txt_path, img_dir, transform=None):
            df = pd.read_csv(txt_path, sep="\t", index_col=0)
            self.img_dir = img_dir
            self.txt_path = txt_path
            self.img_names = df.index.values
            self.y = df['Smiling'].values
            self.transform = transform
            print("celeba dataset finished")

        def __getitem__(self, index):
            """Return `(image_tensor, label)` for the sample at position `index`."""
            img_path = os.path.join(self.img_dir, self.img_names[index])
            # Open inside a context manager so the file handle is released
            # deterministically once the pixel data has been converted.
            with Image.open(img_path) as pil_img:
                img = transforms.ToTensor()(np.asarray(pil_img))
            # Previously `transform` was stored but silently ignored.
            if self.transform is not None:
                img = self.transform(img)
            label = self.y[index]
            return img, label

        def __len__(self):
            """Number of samples, i.e. number of labels read from the target file."""
            return self.y.shape[0]

    def training_data(self,  batch_size = 48):
        """Create the shuffled DataLoader used for training.

        Reads `target.csv` and the `data/` folder under `self.dataset_path`.
        NOTE(review): `dataset_path` is not set in this class — presumably it is
        provided by the training-plan base class / the node; confirm.
        """
        dataset = self.CelebaDataset(self.dataset_path + "/target.csv", self.dataset_path + "/data/")
        train_kwargs = {'batch_size': batch_size, 'shuffle': True}
        data_loader = DataLoader(dataset, **train_kwargs)
        return data_loader

    def training_step(self, data, target):
        """Return the loss for one batch; the caller is expected to backpropagate it.

        `forward` outputs log-probabilities, hence the negative log-likelihood loss.
        """
        output = self.forward(data)
        loss   = torch.nn.functional.nll_loss(output, target)
        return loss


Writing /home/administrator/fedbiomed/fedbiomed/var/tmp/tmp6oqnzgqv/CelebaClass.py


These arguments correspond, respectively, to:
* `model_args`: a dictionary with the arguments related to the model (e.g. number of layers, features, etc.). This will be passed to the model class on the client side.
* `training_args`: a dictionary containing the arguments for the training routine (e.g. batch size, learning rate, epochs, etc.). This will be passed to the routine on the client side.

**NOTE:** typos and/or missing positional (required) arguments will raise an error. 🤓

In [4]:
# Arguments for the training routine that runs on each node.
training_args = dict(
    batch_size=32,
    lr=1e-3,
    epochs=1,
    dry_run=False,
    # Fast pass for development: only use (batch_maxnum * batch_size) samples
    batch_maxnum=200,
)

# Train the federated model

In [5]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage

# Federation settings: number of training rounds, and the dataset tags
# used to search for nodes serving matching data.
rounds = 5
tags = ['#celeba']

# Each round runs local training of `model_class` (read from the file at
# `model_path`) on the nodes, then federates the results with `aggregator`.
# NOTE(review): the effect of client_selection_strategy=None is not visible
# here — presumably a default strategy is used; confirm.
exp = Experiment(
    tags=tags,
    model_path=model_file,
    model_class='Net',
    training_args=training_args,
    rounds=rounds,
    aggregator=FedAverage(),
    client_selection_strategy=None,
)

Messaging researcher_b85a3870-ab99-43f0-b35f-40af022c3525 connected with result code 0
Searching for clients with data tags: ['#celeba'] ...
2021-08-27 14:03:51.018583 [ RESEARCHER ] message received. {'researcher_id': 'researcher_b85a3870-ab99-43f0-b35f-40af022c3525', 'success': True, 'databases': [{'name': 'celeba', 'data_type': 'images', 'tags': ['#celeba'], 'description': 'celeba node 2', 'shape': [67533, 3, 218, 178], 'dataset_id': 'dataset_63034967-a24e-41d3-b3f6-52d066dd679b'}], 'count': 1, 'client_id': 'client_9434ffdf-1bd5-49bf-8d47-5bc86602ee2a', 'command': 'search'}
2021-08-27 14:03:51.019528 [ RESEARCHER ] message received. {'researcher_id': 'researcher_b85a3870-ab99-43f0-b35f-40af022c3525', 'success': True, 'databases': [{'name': 'celeba', 'data_type': 'images', 'tags': ['#celeba'], 'description': 'celeba node 1', 'shape': [67533, 3, 218, 178], 'dataset_id': 'dataset_1f2d7ae1-4e17-4e23-be80-58d60d0c6aa6'}], 'count': 1, 'client_id': 'client_cb657f6d-86d5-46b3-8c0b-aa9055263

Define an experiment
- search nodes serving data for these `tags`, optionally filter on a list of client ID with `clients`
- run a round of local training on nodes with model defined in `model_path` + federation with `aggregator`
- run for `rounds` rounds, applying the `client_selection_strategy` between the rounds

Let's start the experiment.

By default, this function doesn't stop until all the `rounds` are done for all the clients

In [6]:
# Blocking call: by default it does not return until all `rounds` are done for all the clients.
exp.run()

Sampled clients in round  0   ['client_9434ffdf-1bd5-49bf-8d47-5bc86602ee2a', 'client_cb657f6d-86d5-46b3-8c0b-aa90552631ae']
[ RESEARCHER ] Send message to client  client_9434ffdf-1bd5-49bf-8d47-5bc86602ee2a {'researcher_id': 'researcher_b85a3870-ab99-43f0-b35f-40af022c3525', 'job_id': 'a9378749-ac37-4923-a8b5-1a625c04bd0a', 'training_args': {'batch_size': 32, 'lr': 0.001, 'epochs': 1, 'dry_run': False, 'batch_maxnum': 200}, 'model_args': {}, 'command': 'train', 'model_url': 'http://localhost:8844/media/uploads/2021/08/27/my_model_ZZL7kcZ.py', 'params_url': 'http://localhost:8844/media/uploads/2021/08/27/my_model_zglhY27.pt', 'model_class': 'Net', 'training_data': {'client_9434ffdf-1bd5-49bf-8d47-5bc86602ee2a': ['dataset_63034967-a24e-41d3-b3f6-52d066dd679b']}}
researcher_b85a3870-ab99-43f0-b35f-40af022c3525
[ RESEARCHER ] Send message to client  client_cb657f6d-86d5-46b3-8c0b-aa90552631ae {'researcher_id': 'researcher_b85a3870-ab99-43f0-b35f-40af022c3525', 'job_id': 'a9378749-ac37-492

2021-08-27 14:10:21.760170 [ RESEARCHER ] message received. {'researcher_id': 'researcher_b85a3870-ab99-43f0-b35f-40af022c3525', 'job_id': 'a9378749-ac37-4923-a8b5-1a625c04bd0a', 'success': True, 'client_id': 'client_cb657f6d-86d5-46b3-8c0b-aa90552631ae', 'dataset_id': 'dataset_1f2d7ae1-4e17-4e23-be80-58d60d0c6aa6', 'params_url': 'http://localhost:8844/media/uploads/2021/08/27/node_params_c9bbe5a7-fe86-4121-8717-2156965a6f4d.pt', 'timing': {'rtime_training': 99.65200744199683, 'ptime_training': 573.8139453859999}, 'msg': '', 'command': 'train'}
Downloading model params after training on  client_9434ffdf-1bd5-49bf-8d47-5bc86602ee2a 
	- from http://localhost:8844/media/uploads/2021/08/27/node_params_df46511c-5fc1-4579-a04c-8d6f183a39ea.pt
Downloading model params after training on  client_cb657f6d-86d5-46b3-8c0b-aa90552631ae 
	- from http://localhost:8844/media/uploads/2021/08/27/node_params_c9bbe5a7-fe86-4121-8717-2156965a6f4d.pt
Clients that successfully reply in round  2   ['client_94

Retrieve the federated model parameters

In [9]:
# Rounds are indexed from 0, so the last aggregation is stored at `rounds - 1`.
last_round = rounds - 1
fed_model = exp.model_instance
# Load the aggregated parameters of the final round into the model instance.
fed_model.load_state_dict(exp.aggregated_params[last_round]['params'])

<All keys matched successfully>

In [10]:
# Display the federated model's layers (the cell's last expression is rendered as output).
fed_model

Net(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv4): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (dropout1): Dropout(p=0.25, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=3168, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)

# Test Model

We define a little testing routine to extract the accuracy metrics on the testing dataset
## Important
This is done to test the model because this is a development environment.  
In production, the data won't be accessible.

In [11]:

import torch
import torch.nn as nn

import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from PIL import Image
import os

def testing_Accuracy(model, data_loader):
    model.eval()
    test_loss = 0
    correct = 0
    device = 'cpu'

    correct = 0
    
    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    
        pred = output.argmax(dim=1, keepdim=True)

    test_loss /= len(data_loader.dataset)
    accuracy = 100* correct/len(data_loader.dataset)

    return(test_loss, accuracy)

The test dataset is the data from the third node.

In [12]:

# Folder holding the third node's data, used only for testing.
# The "./" prefix makes the path relative to the current working directory.
test_dataset_path = "./data_for_examples/Celeba/celeba_preprocessed/data_node_3"

class CelebaDataset(Dataset):
    """Custom Dataset for loading CelebA face images.

    Only the file names and labels are read up front; each image is opened
    from disk when it is requested through `__getitem__`.

    Args:
        txt_path: path to a tab-separated file whose first column (used as
            index) holds the image file names and which has a 'Smiling'
            column used as the label.
        img_dir: directory containing the image files.
        transform: optional callable applied to the tensor produced by
            `transforms.ToTensor()`. `None` (default) leaves it unchanged.
    """

    def __init__(self, txt_path, img_dir, transform=None):
        df = pd.read_csv(txt_path, sep="\t", index_col=0)
        self.img_dir = img_dir
        self.txt_path = txt_path
        self.img_names = df.index.values
        self.y = df['Smiling'].values
        self.transform = transform
        print("celeba dataset finished")

    def __getitem__(self, index):
        """Return `(image_tensor, label)` for the sample at position `index`."""
        img_path = os.path.join(self.img_dir, self.img_names[index])
        # Open inside a context manager so the file handle is released
        # deterministically once the pixel data has been converted.
        with Image.open(img_path) as pil_img:
            img = transforms.ToTensor()(np.asarray(pil_img))
        # Previously `transform` was stored but silently ignored.
        if self.transform is not None:
            img = self.transform(img)
        label = self.y[index]
        return img, label

    def __len__(self):
        """Number of samples, i.e. number of labels read from the target file."""
        return self.y.shape[0]
    

# Build the dataset from the test folder's label file and image directory,
# then wrap it in a loader that serves shuffled batches of 64 samples.
dataset = CelebaDataset(f"{test_dataset_path}/target.csv", f"{test_dataset_path}/data/")
train_kwargs = dict(batch_size=64, shuffle=True)
data_loader = DataLoader(dataset, **train_kwargs)

celeba dataset finished


Computing the loss and accuracy metrics of the federated model on the testing dataset

In [13]:
# testing_Accuracy returns a (test_loss, accuracy) tuple, so acc_federated holds both values.
acc_federated = testing_Accuracy(fed_model, data_loader)

In [14]:
# Index 1 of the tuple is the accuracy, expressed as a percentage.
acc_federated[1]

89.6050819599307