In [1]:
from copy import deepcopy
import numpy as np

from datasets.load import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchtext

In [2]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [3]:
# imdb_dataset = load_dataset('imdb', split=['train', 'test']) # Get the dataset from huggingface library
train_imdb_dataset, test_imdb_dataset = torchtext.datasets.IMDB() # Get the dataset from torchtext library

Preparativos como los embeddings, el vocabulario, etc

In [33]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, FastText, vocab
import spacy

In [15]:
glove = GloVe(name='6B', dim=100)
fasttext = FastText(language='en')
vocabulary = vocab(glove.stoi)
vocabulary_fasttext = vocab(fasttext.stoi)

.vector_cache/wiki.en.vec: 6.60GB [05:04, 21.7MB/s]                                
  0%|          | 0/2519370 [00:00<?, ?it/s]Skipping token b'2519370' with 1-dimensional vector [b'300']; likely a header
100%|██████████| 2519370/2519370 [02:07<00:00, 19766.43it/s]


In [6]:
vocab_size = len(vocabulary)
print(glove.vectors.shape)

torch.Size([400000, 100])


In [7]:
vocabulary.get_itos()[0:10]

[',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s", 'for']

In [22]:
example = "This is an example sentence to test the tokenizer."
tokenizer = get_tokenizer("basic_english")
spacy_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
example_tokens = tokenizer(example)
example_tokens_spacy = spacy_tokenizer(example)

In [23]:
print(f"Basic English Tokenizer: {example_tokens}")
print(f"Spacy Tokenizer: {example_tokens_spacy}")

Basic English Tokenizer: ['this', 'is', 'an', 'example', 'sentence', 'to', 'test', 'the', 'tokenizer', '.']
Spacy Tokenizer: ['This', 'is', 'an', 'example', 'sentence', 'to', 'test', 'the', 'tokenizer', '.']


In [24]:
type(example_tokens)

list

In [25]:
# Remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

print(f"Example tokens: {example_tokens_spacy}")

example_tokens_spacy = [word.lower() for word in example_tokens_spacy if word not in stop_words]

print(f"Example tokens: {example_tokens_spacy}")

Example tokens: ['This', 'is', 'an', 'example', 'sentence', 'to', 'test', 'the', 'tokenizer', '.']
Example tokens: ['this', 'example', 'sentence', 'test', 'tokenizer', '.']


In [32]:
vocabulary_fasttext['tokenizer']

656502

In [19]:
vocabulary_fasttext['This']

RuntimeError: Token This not found and default index is not set

In [18]:
vocabulary_fasttext['this']

25

In [12]:
vocabulary.get_stoi()['This']

KeyError: 'This'

# From centralized data to federated data

First we're going to federate the dataset using the FedDataDristibution class, that has functions to load multiple datasets from deep learning libraries such as PyTorch or TensorFlow. In this notebook we are using PyTorch, so we need to use the functions from the PyTorch ecosystem, and for the text datasets, we need to use the function `from_config_with_torchtext_dataset`.

In [13]:
from flex.data import FedDatasetConfig, FedDataDistribution

config = FedDatasetConfig(seed=0)
config.n_clients = 2
config.replacement = False # ensure that clients do not share any data
config.client_names = ['client1', 'client2'] # Optional
flex_dataset = FedDataDistribution.from_config_with_torchtext_dataset(data=train_imdb_dataset, config=config)



We may also want to use the FLEXible dataset for the test data, so we just use da function `from_torchtext_dataset` in the Dataset class.

In [15]:
from flex.data import Dataset

test_dataset = Dataset.from_torchtext_dataset(test_imdb_dataset)



# 2) Federate a model with FLEXible.

Once we've federated the dataset, it's time to create the FlexPool. The FlexPool class is the one that simulates the real-time scenario for federated learning, so it is in charge of the communications across actors. 

In [17]:
from flex.model import FlexModel
from flex.pool import FlexPool

from flex.pool.decorators import init_server_model
from flex.pool.decorators import deploy_server_model

In this notebook we are going to simulate a client-server architecture, which we can easily build using the FlexPool class, using the function `client_server_architecture`. This function needs a FlexDataset, which we already have prepared, and a function to initialize the server model, which we have to create.

The model we are going to use is a simple LSTM, which will have the embeddings, the LSTM, a Linear layer and the output layer.

In [38]:
class LSTMNet(nn.Module):
    def __init__(self, hidden_size, num_classes, emb_vectors) -> None:
        super().__init__()
        # self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embeddings_dim)
        # self.emb.weight.data.copy_(emb_vectors) # Initialize the embedding layer with the pretrained embeddings
        # self.emb.requires_grad_(False) # Freeze the embedding layer
        # We can do the 3 steps above with the following line
        self.emb = nn.Embedding.from_pretrained(embeddings=emb_vectors, freeze=True)
        embeddings_dim = emb_vectors.shape[1]
        self.lstm = nn.LSTM(input_size=embeddings_dim, hidden_size=hidden_size, num_layers=1)
        self.fc1 = nn.Linear(hidden_size, hidden_size//2)
        self.fc2 = nn.Linear(hidden_size//2, num_classes)

    def forward(self, x):
        x = self.emb(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


@init_server_model
def build_server_model():
    pass
