In [None]:
from copy import deepcopy
import numpy as np

from datasets.load import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset
import torchtext

In [None]:
# Pick the compute backend by priority: CUDA GPU first, then Apple's MPS
# backend, and finally the CPU when no accelerator is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
# Alternative source: the Hugging Face `datasets` library.
# imdb_dataset = load_dataset('imdb', split=['train', 'test'])
train_imdb_dataset, test_imdb_dataset = torchtext.datasets.IMDB() # Load the IMDB train and test splits from torchtext

Preparation steps: the embeddings, the vocabulary, etc.

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, FastText, vocab

In [None]:
embeddings_dim = 100 # Dimension of the embeddings
glove = GloVe(name='6B', dim=embeddings_dim) # Load GloVe embeddings with 100 dimensions.
# fasttext = FastText(language='en') # To use FastText instead of GloVe
# `glove.stoi` maps token -> row index, but `vocab()` reads the dict values as
# frequencies and drops every token whose value is below `min_freq` (default 1).
# The first GloVe token has index 0, so with the default it would be discarded
# and every vocabulary index would be shifted by one relative to the rows of
# `glove.vectors`. `min_freq=0` keeps all tokens, in their original order.
vocabulary = vocab(glove.stoi, min_freq=0)
# vocabulary_fasttext = vocab(fasttext.stoi, min_freq=0) # To use FastText instead of GloVe
vocab_size = len(vocabulary) # Get the vocabulary size
print(f"Total vocabulary size: {vocab_size}")
print(f"Shape of embeddings: {glove.vectors.shape}")

In [None]:
# Sample sentence used to compare the two tokenizers.
example = "This is an example sentence to test the tokenizer."
# torchtext's built-in "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english")
# spaCy-backed tokenizer; it is configured with the `en_core_web_sm` model.
spacy_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
# Tokenize the same sentence with both tokenizers for a side-by-side comparison.
example_tokens = tokenizer(example)
example_tokens_spacy = spacy_tokenizer(example)

In [None]:
# Sanity check: display the first 10 tokens of the vocabulary, in index order.
vocabulary.get_itos()[:10] # Get the first 10 words of the vocabulary

In [None]:
# The vocabulary is built from the GloVe tokens only, so the ordinary words
# 'pad' and 'unk' are used as the padding and unknown-token placeholders.
print(f"Padding token idx, pad: {vocabulary.get_stoi()['pad']}") # Get the index of the word 'pad' for padding
print(f"Unknown token idx, unk: {vocabulary.get_stoi()['unk']}") # Get the index of the word 'unk' for unknown words

We can use the basic English tokenizer from torchtext, or the spaCy tokenizer if spaCy is installed. Here we try both tokenizers on the same example sentence.

In [None]:
# Show the two tokenizations of the example sentence one after the other.
print(f"Basic English Tokenizer: {example_tokens}")
print(f"Spacy Tokenizer: {example_tokens_spacy}")

Clients will probably want to remove the stopwords. This step is optional, as the embeddings may have vectors for most of the stopwords. Here we show several options, so the user must decide which one they prefer. In this notebook we are going to use the first case, as it keeps the most information. Otherwise, we would use the last one, so that we at least keep as much information as we can.

Later we will have to tokenize the clients' data, pad the sequences, and convert the tokens to their indices in the embedding matrix (ids).

In [None]:
# Remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# The stopword list is lowercase while the spaCy tokens keep their original
# case, so every stopword check below compares the lowercased token
# (otherwise a capitalised stopword such as "This" would not be filtered out).

# 1) Every token, lowercased.
print(f"Example tokens tokenized: {[word.lower() for word in example_tokens_spacy]}")

# 2) Stopwords removed.
print(f"Example tokens without stopwords: {[word.lower() for word in example_tokens_spacy if word.lower() not in stop_words]}")

# 3) Stopwords removed and only tokens known to the vocabulary kept.
print(f"Example tokens without stopwords and word in vocabulary: {[word.lower() for word in example_tokens_spacy if word.lower() not in stop_words and word.lower() in vocabulary]}")

# 4) Stopwords kept, only tokens known to the vocabulary kept.
print(f"Example tokens without quitting stopwords and word in vocabulary: {[word.lower() for word in example_tokens_spacy if word.lower() in vocabulary]}")

# From centralized data to federated data

First we're going to federate the dataset using the `FedDataDistribution` class, which has functions to load datasets from deep learning libraries such as PyTorch or TensorFlow. In this notebook we are using PyTorch, so we need to use the functions from the PyTorch ecosystem, and for text datasets we need to use the function `from_config_with_torchtext_dataset`.

In [None]:
from flex.data import FedDatasetConfig, FedDataDistribution

# Federation settings. The seed is fixed, presumably so the split is reproducible.
config = FedDatasetConfig(seed=0)
config.n_clients = 2
config.replacement = False # ensure that clients do not share any data
config.client_names = ['client1', 'client2'] # Optional
# Distribute the IMDB training split among the clients according to `config`.
flex_dataset = FedDataDistribution.from_config_with_torchtext_dataset(data=train_imdb_dataset, config=config)

We may also want to use a FLEXible dataset for the test data, so we just use the function `from_torchtext_dataset` of the `Dataset` class.

In [None]:
from flex.data import Dataset

# Wrap the IMDB test split in a FLEXible Dataset; it is not federated and is
# used later to evaluate the global model.
test_dataset = Dataset.from_torchtext_dataset(test_imdb_dataset)

# 2) Federate a model with FLEXible.

Once we've federated the dataset, it's time to create the FlexPool. The FlexPool class is the one that simulates the real-time scenario for federated learning, so it is in charge of the communications across actors. 

In [None]:
from flex.model import FlexModel
from flex.pool import FlexPool

from flex.pool.decorators import init_server_model
from flex.pool.decorators import deploy_server_model

In this notebook we are going to simulate a client-server architecture, which we can easily build using the FlexPool class, using the function `client_server_architecture`. This function needs a FlexDataset, which we already have prepared, and a function to initialize the server model, which we have to create.

The model we are going to use is a simple LSTM, which will have the embeddings, the LSTM, a Linear layer and the output layer.

In [None]:
class LSTMNet(nn.Module):
    """Text classifier: frozen pretrained embeddings -> one LSTM layer -> two linear layers.

    Args:
        hidden_size: Size of the LSTM hidden state; the first linear layer
            reduces it to ``hidden_size // 2``.
        num_classes: Number of output classes.
        emb_vectors: 2-D tensor of pretrained embeddings; its second dimension
            is used as the LSTM input size.
    """

    def __init__(self, hidden_size, num_classes, emb_vectors) -> None:
        super().__init__()
        # Build the embedding table directly from the pretrained vectors and
        # freeze it so it is not updated during training.
        self.emb = nn.Embedding.from_pretrained(embeddings=emb_vectors, freeze=True)
        self.lstm = nn.LSTM(
            input_size=emb_vectors.shape[1],
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        half_hidden = hidden_size // 2
        self.fc1 = nn.Linear(hidden_size, half_hidden)
        self.fc2 = nn.Linear(half_hidden, num_classes)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of token-index sequences.

        Args:
            x: Batch-first tensor of token indices, ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with log-probabilities.
        """
        embedded = self.emb(x)
        lstm_out, _ = self.lstm(embedded)
        # Only the LSTM output at the last position of each sequence is used.
        last_step = F.relu(lstm_out[:, -1, :])
        hidden = F.relu(self.fc1(last_step))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)


@init_server_model
def build_server_model():
    """Create the server's FlexModel holding the network and its training configuration.

    Returns:
        FlexModel with the keys "model" (an LSTMNet built on the GloVe
        vectors), "criterion", "optimizer_func" and "optimizer_kwargs".
    """
    network = LSTMNet(hidden_size=128, num_classes=2, emb_vectors=glove.vectors)

    flex_model = FlexModel()
    flex_model['model'] = network
    # The entries below are read again by the training and evaluation functions.
    flex_model["criterion"] = torch.nn.CrossEntropyLoss()
    flex_model["optimizer_func"] = torch.optim.Adam
    flex_model["optimizer_kwargs"] = {}
    return flex_model

Once we've defined the function to initialize the server model, we can create the FlexPool using the function `client_server_architecture`.

In [None]:
# Build a client-server pool from the federated dataset; `build_server_model`
# is passed as the initializer for the server model.
flex_pool = FlexPool.client_server_architecture(fed_dataset=flex_dataset, init_func=build_server_model)

# Role-based views of the pool, used to address each group of nodes below.
clients = flex_pool.clients
servers = flex_pool.servers
aggregators = flex_pool.aggregators

print(f"Number of nodes in the pool {len(flex_pool)}: {len(servers)} server plus {len(clients)} clients. The server is also an aggregator")

We can use the decorator `deploy_server_model` to create a custom function that deploys our server model, or we can use the primitive `deploy_server_model_pt` to deploy the server model to the clients.

In [None]:
from flex.pool import deploy_server_model, deploy_server_model_pt

@deploy_server_model
def copy_server_model_to_clients(server_flex_model: FlexModel):
    """Return an independent deep copy of the server's FlexModel.

    Args:
        server_flex_model: The FlexModel held by the server.

    Returns:
        A deep copy, so the receiver does not share state with the server.
    """
    client_flex_model = deepcopy(server_flex_model)
    return client_flex_model

In [None]:
# Deploy the server model to the clients.
servers.map(copy_server_model_to_clients, clients) # Using the function created with the decorator
# servers.map(deploy_server_model_pt, clients) # Using the primitive function

As text needs to be preprocessed and batched on the clients, we can do it on the train function. 

In [None]:
import re
import random

from tqdm import tqdm

from torch.nn.utils.rnn import pad_sequence

# Mini-batch size of each client's training DataLoader.
BATCH_SIZE = 64
# Number of local epochs each client runs per call to `train`.
NUM_EPOCHS = 10

def clean_str(string):
    """
    Tokenization/string cleaning.
    Original from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Replaces every character outside ``A-Z a-z 0-9 ( ) , ! ? ' ` `` with a
    space, separates common English contractions and punctuation marks from
    the neighbouring words with spaces, collapses runs of whitespace, strips
    the ends and lowercases the result.

    Args:
        string: Raw text.

    Returns:
        The cleaned, lowercased text with space-separated tokens.
    """
    # (pattern, replacement) pairs, applied in order. The replacements are
    # plain text without backslashes: the former " \( ", " \) " and " \? "
    # were invalid escape sequences and put a literal backslash into the output.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),  # must run last: collapses the spaces added above
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)

    return string.strip().lower()

def preprocess_text(text):
    """Clean raw text with `clean_str` and return its spaCy tokens as a list."""
    cleaned = clean_str(text)
    tokens = spacy_tokenizer(cleaned)
    return list(tokens)

def convert_token_to_idx(text_tokenized):
    """Map tokens to vocabulary indices; out-of-vocabulary tokens get the index of 'unk'.

    Args:
        text_tokenized: Iterable of string tokens.

    Returns:
        List with one vocabulary index per token.
    """
    indices = []
    for token in text_tokenized:
        known_token = token if token in vocabulary else 'unk'
        indices.append(vocabulary[known_token])
    return indices

def collate_batch(batch):
    """Collate (text, label) samples into a label tensor and a padded index tensor.

    Each text is cleaned, tokenized and converted to vocabulary indices; the
    resulting sequences are right-padded with the index of 'pad' to the length
    of the longest sequence in the batch.

    Args:
        batch: Iterable of ``(text, label)`` pairs.
            NOTE(review): labels are passed to ``torch.tensor`` as-is, so they
            are assumed to already be numeric class ids — confirm.

    Returns:
        Tuple ``(labels, texts)``: labels first, then the batch-first padded
        tensor of token indices.
    """
    label_list, text_list = [], []
    for _text, _label in batch:
        label_list.append(_label)
        token_ids = convert_token_to_idx(preprocess_text(_text))
        # Force an integer dtype: for a text that yields no tokens,
        # torch.tensor([]) would default to float32, which is not a valid
        # index dtype for the embedding layer.
        text_list.append(torch.tensor(token_ids, dtype=torch.long))
    labels = torch.tensor(label_list)
    padded_texts = pad_sequence(text_list, padding_value=vocabulary['pad'], batch_first=True)
    return labels, padded_texts

def train(client_flex_model: FlexModel, client_data: Dataset):
    """Train a client's local model in place on that client's own data.

    Runs NUM_EPOCHS passes over the data in shuffled mini-batches of
    BATCH_SIZE, using the criterion and optimizer configuration stored in the
    FlexModel.

    Args:
        client_flex_model: FlexModel with the keys "model", "criterion",
            "optimizer_func" and "optimizer_kwargs".
        client_data: The client's local dataset; samples are batched with
            `collate_batch`.
    """
    # NOTE(review): these lists are never used below; the call is kept only to
    # preserve the existing behaviour — confirm whether it can be dropped.
    X_data, y_data = client_data.to_list()
    loader = DataLoader(client_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

    net = client_flex_model["model"]
    make_optimizer = client_flex_model['optimizer_func']
    optimizer = make_optimizer(net.parameters(), **client_flex_model["optimizer_kwargs"])
    net = net.train()
    net = net.to(device)
    loss_fn = client_flex_model["criterion"]

    for _ in tqdm(range(NUM_EPOCHS)):
        # collate_batch yields the labels first, then the padded texts.
        for labels, texts in loader:
            texts = texts.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            loss = loss_fn(net(texts), labels)
            loss.backward()
            optimizer.step()

In [None]:
# Run the local training function on every client in the pool.
clients.map(train)

After training the model, we have to collect the weights from the clients' models in order to update the global model. To do so, we are going to use the primitive `collect_clients_weights_pt`.

In [None]:
from flex.pool import collect_clients_weights_pt

# Have the aggregator collect the trained weights from the clients.
aggregators.map(collect_clients_weights_pt, clients)

Once the weights are collected, we aggregate them. In this notebook we use the FedAvg method that is already implemented in FLEXible.

In [None]:
from flex.pool import fed_avg

# Aggregate the collected client weights with FedAvg.
aggregators.map(fed_avg)

The function `set_aggregated_weights_pt` sets the aggregated weights on the server model to update it.

In [None]:
from flex.pool import set_aggregated_weights_pt

# Load the aggregated weights into the server's model.
aggregators.map(set_aggregated_weights_pt, servers)

Now it is time to evaluate the global model. To do so, we have to create a function using the decorator `evaluate_server_model`.

In [None]:
from flex.pool import evaluate_server_model

@evaluate_server_model
def evaluate_global_model(server_flex_model: FlexModel, test_data=None):
    """Evaluate the global (server) model on held-out data.

    Args:
        server_flex_model: FlexModel with the keys "model" and "criterion".
        test_data: Dataset to evaluate on; samples are batched with
            `collate_batch`. When omitted, the notebook-level `test_dataset`
            is used, which is what this function previously always did.

    Returns:
        Tuple ``(test_loss, test_acc)``: the per-sample mean loss and the
        fraction of correctly classified samples.

    Raises:
        ValueError: If the evaluation data yields no samples.
    """
    if test_data is None:
        test_data = test_dataset

    model = server_flex_model["model"]
    model.eval()
    model = model.to(device)
    criterion = server_flex_model['criterion']
    # No shuffling: batch composition determines how much padding each sample
    # gets, so a fixed order keeps the evaluation repeatable.
    test_dataloader = DataLoader(test_data, batch_size=256, shuffle=False, pin_memory=False, collate_fn=collate_batch)

    total_loss = 0.0
    correct = 0
    total_count = 0
    with torch.no_grad():
        # collate_batch yields the labels first, then the padded texts.
        for target, data in test_dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            batch_count = target.size(0)
            # Weight each batch loss by its size so a smaller final batch does
            # not skew the average (assumes the criterion returns the batch
            # mean, as the CrossEntropyLoss set in build_server_model does).
            total_loss += criterion(output, target).item() * batch_count
            correct += (output.argmax(dim=1) == target).sum().item()
            total_count += batch_count

    if total_count == 0:
        raise ValueError("Cannot evaluate the global model: the test data is empty.")

    test_loss = total_loss / total_count
    test_acc = correct / total_count
    return test_loss, test_acc

In [None]:
# Evaluate the updated global model on the test dataset; the result is indexed
# below, so it is presumably one entry per server — confirm.
metrics = servers.map(evaluate_global_model, test_data=test_dataset)

In [None]:
# Display the first entry: the (test_loss, test_acc) tuple returned by evaluate_global_model.
metrics[0]