In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

import flwr as fl
import torch
import math

  from .autonotebook import tqdm as notebook_tqdm
2023-12-02 00:29:05,047	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Data Load & Preprocessing

In [2]:
model_checkpoint = "distilgpt2"
datasets = load_dataset("imdb")
datasets = datasets.remove_columns("label")


# Splitting unsupervised dataset into train/val/test (80/10/10)
train_size = int(len(datasets["unsupervised"]) * 0.9)
val_size = int(len(datasets["unsupervised"]) * 0.05)
test_size = len(datasets["unsupervised"]) - train_size - val_size

datasets["train"] = datasets["unsupervised"].select(range(train_size))
datasets["validation"] = datasets["unsupervised"].select(range(train_size, train_size + val_size))
datasets["test"] = datasets["unsupervised"].select(range(train_size + val_size, train_size + val_size + test_size))

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)


def tokenize_function(examples):
    return tokenizer(examples["text"])


tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

In [4]:
# block_size = tokenizer.model_max_length
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

## Load Model

In [5]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-imdb",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    push_to_hub=False,
)

## Flwr Client

In [6]:
class LanguageModelClient(fl.client.NumPyClient):
	def __init__(self, model, trainer, train_dataset, eval_dataset):
		self.model = model
		self.trainer = trainer
		self.train_dataset = train_dataset
		self.eval_dataset = eval_dataset
		self.rounds = 0
	
	def get_parameters(self, config):
		# Convert PyTorch parameters to NumPy
		return [val.cpu().numpy() for _, val in self.model.state_dict().items()]

	def set_parameters(self, parameters):
		# Convert NumPy parameters to PyTorch and set for the model
		state_dict = {k: torch.Tensor(v) for k, v in zip(self.model.state_dict().keys(), parameters)}
		self.model.load_state_dict(state_dict, strict=True)

	def fit(self, parameters, config):
		# Set the provided parameters and then train the model
		self.set_parameters(parameters)
		self.trainer.train()
		self.trainer.save_model(f"{model_name}-finetuned-imdb-{self.rounds}")
		self.rounds += 1
		return self.get_parameters(config), len(self.train_dataset), {}

	def evaluate(self, parameters, config):
		# Set the provided parameters and then evaluate the model
		self.set_parameters(parameters)
		eval_result = self.trainer.evaluate(self.eval_dataset)
		return float(eval_result['eval_loss']), len(self.eval_dataset)

In [7]:
fl.client.start_numpy_client(
    server_address="localhost:8060", 
    client=LanguageModelClient(
        model,
        Trainer(
            model=model, args=training_args, train_dataset=lm_datasets["train"], eval_dataset=lm_datasets["validation"]
        ),
        lm_datasets["train"],
        lm_datasets["validation"],
    ),
)

INFO flwr 2023-12-02 00:29:12,525 | grpc.py:52 | Opened insecure gRPC connection (no certificates were passed)
DEBUG flwr 2023-12-02 00:29:12,528 | connection.py:42 | ChannelConnectivity.IDLE
DEBUG flwr 2023-12-02 00:29:12,530 | connection.py:42 | ChannelConnectivity.CONNECTING
DEBUG flwr 2023-12-02 00:29:12,533 | connection.py:42 | ChannelConnectivity.READY


Epoch,Training Loss,Validation Loss
1,3.95,3.873138


DEBUG flwr 2023-12-02 01:18:36,837 | connection.py:141 | gRPC channel closed


_MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "Sent message larger than max (964089798 vs. 536870912)"
	debug_error_string = "UNKNOWN:Error received from peer ipv6:%5B::1%5D:8060 {created_time:"2023-12-02T01:18:36.746578273+09:00", grpc_status:8, grpc_message:"Sent message larger than max (964089798 vs. 536870912)"}"
>

--- 

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
eval_results_before = trainer.evaluate()
print(f"Perplexity before training: {math.exp(eval_results_before['eval_loss']):.2f}")

In [None]:
trainer.train()

trainer.save_model(f"{model_name}-finetuned-wikitext103_fin")
eval_results_after = trainer.evaluate()
print(f"Perplexity after training: {math.exp(eval_results_after['eval_loss']):.2f}")