In [1]:
import copy
import os
from tqdm import tqdm
import numpy as np
from math import floor

from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM
from peft import get_peft_model, get_peft_model_state_dict, set_peft_model_state_dict, prepare_model_for_kbit_training

from utils import *
from utils.utils import default_evaluation, save_dataset_test

from dataclasses import dataclass, field, asdict
from typing import Optional
from transformers import HfArgumentParser, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig
import os
import json
from accelerate import Accelerator
import torch
from datetime import datetime, timedelta

from trl import SFTTrainer
from transformers import TrainerCallback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
dataset_name = 'multitask'
model_name = 'HuggingFaceTB/SmolLM-135M'

In [4]:
dataset, dataset_test = get_dataset(dataset_name, None, 0.8)
dataset =      process_sft_dataset(dataset_name, dataset,      400000)
dataset_test = process_sft_dataset(dataset_name, dataset_test, 400000)

Using the latest cached version of the module from /home/gabriel.talasso/.cache/huggingface/modules/datasets_modules/datasets/Samsung--samsum/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e (last modified on Tue Mar 11 13:33:01 2025) since it couldn't be found locally at Samsung/samsum, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/gabriel.talasso/.cache/huggingface/modules/datasets_modules/datasets/Harvard--gigaword/cd40ed63b2731b6da1f321fba3f70998c2d7de54730f50e0c20f6ac37f323a40 (last modified on Tue Mar 11 13:37:41 2025) since it couldn't be found locally at Harvard/gigaword, or remotely on the Hugging Face Hub.


Dataset Splited into TRAIN: 71668 and TEST: 17917


In [5]:
local_datasets = []
tasks = ['boolq', 'gigaword', 'samsum', 'webnlg']
n_clients_in_cluster = 4 // len(tasks)

for i in range(4):
    task = tasks[i // n_clients_in_cluster]
    cluster_dataset = dataset.filter(lambda x: x['task'] == task)
    cluster_dataset = cluster_dataset.shuffle(seed=0)

    local_datasets.append(cluster_dataset.shard(n_clients_in_cluster, i % n_clients_in_cluster))

In [6]:
EVAL_SIZE = 1000

In [7]:
local_datasets_test = []
for i in range(4):
    task = tasks[i // n_clients_in_cluster]
    cluster_dataset_test = dataset_test.filter(lambda x: x['task'] == task)
    cluster_dataset_test = cluster_dataset_test.shuffle(seed=0)

    aux = cluster_dataset_test.shard(n_clients_in_cluster, i % n_clients_in_cluster)
    aux = aux.select(range(EVAL_SIZE))  # Limit to 1000 samples for testing
    local_datasets_test.append(aux)


In [8]:
quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
# Copy the model to each device
device_map = {"": Accelerator().local_process_index}
torch_dtype = torch.bfloat16

In [9]:
peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

In [10]:
def get_model_and_tokenizer():

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        trust_remote_code=True,
        torch_dtype=torch_dtype,
    )
    print(f"Model loaded from {model_name}")


    model = prepare_model_for_kbit_training(
                model, use_gradient_checkpointing=True
            )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

    model.enable_input_require_grads()

    # ===== Define the global and local models =====
    global_dict = copy.deepcopy(get_peft_model_state_dict(model))
    local_dict_list = [copy.deepcopy(global_dict) for i in range(4)]

    # ===== Define the tokenizer =====
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, padding_side="right")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.unk_token   # following vicuna

    if tokenizer.eos_token == tokenizer.unk_token or tokenizer.pad_token == tokenizer.eos_token:
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
        print(f"Pad token is set to {tokenizer.pad_token}.")

    print('Special tokens:', tokenizer.special_tokens_map)
    model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer

In [11]:
max_steps=50
num_train_epochs=1
num_rounds=10
eval_round="10,25,50,100"
batch_size=8
batch_size_eval=128
gradient_accumulation_steps=1
seq_length=1024
num_clients=8
sample_clients=8
lora_r=8
lora_alpha=16  # twice of lora_r
lr=5e-4

In [12]:
output_dir = 'output_centralized_tests50'
eval_batch_size = 64

In [13]:
training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=lr,
        logging_steps=100,
        num_train_epochs=num_train_epochs,
        max_steps=max_steps,
        report_to=None,
        save_steps=1000,
        save_total_limit=10,
        push_to_hub=False,
        hub_model_id=None,
        gradient_checkpointing=True,
        lr_scheduler_type="constant",
        #max_seq_length=script_args.seq_length,
    )

In [14]:
_,  tokenizer = get_model_and_tokenizer()
formatting_prompts_func, response_template = get_formatting_prompts_func('alpaca', tokenizer.eos_token)
if response_template:
    response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]   # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]` for Llama2
    data_collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)
    packing = False
else:
    data_collator = None
    packing = True

Model loaded from HuggingFaceTB/SmolLM-135M
trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Pad token is set to <pad>.
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<jupyter_script>', '<empty_output>']}


In [15]:
def default_evaluation(model, tokenizer, dataset, client_id, round, formatting_prompts_func, script_args, cluster_id=None):
    """
    Default evaluation function to compute model responses and ROUGE scores.
    """
    print("Evaluating model...")

    model.config.use_cache = True
    # Apply template to dataset
    dataset = utils.apply_template_to_dataset(dataset, formatting_prompts_func)
    dataset_length = len(dataset)
    eval_responses = utils.get_model_responses(model, tokenizer, dataset, batch_size=eval_batch_size)
    dataset_with_responses = dataset.select(range(len(dataset)))
    dataset_with_responses = dataset_with_responses.add_column('model_responses', eval_responses)
    scores = utils.calcule_rogue1(eval_responses, dataset_with_responses)
    print(f"Evaluation scores: {scores}")

    #verify output directory
    if not os.path.exists(os.path.join(output_dir, "evals")):
        os.makedirs(os.path.join(output_dir, "evals"))
    # Save evaluation results
    with open(os.path.join(output_dir, f"evals/rouge_client_{client_id}_cluster_{cluster_id}_round_{round}.json"), 'w') as f:
        scores['dataset_length'] = dataset_length
        json.dump(scores, f, indent=4)

    model.config.use_cache = False

In [16]:
for i, local_dataset in enumerate(local_datasets):

    model, tokenizer = get_model_and_tokenizer()

    test_dataset = local_datasets_test[i]

    print(f"Evaluating Pretrained Model on {test_dataset} for task {i}")
    default_evaluation(
                    model=model,
                    tokenizer=tokenizer,
                    dataset=test_dataset,
                    client_id=i,
                    round=0, #with respect to model from the previous round
                    formatting_prompts_func=formatting_prompts_func,
                    script_args=None,
                    cluster_id=0,
                )

    trainer = SFTTrainer(
            model=model,
            processing_class=tokenizer,
            args=training_args,
            #max_seq_length=script_args.seq_length,
            train_dataset=local_dataset,
            formatting_func=formatting_prompts_func,
            data_collator=data_collator,
            #packing=packing,
            #dataset_text_field=dataset_text_field,
        )

    print('Training model...')
    trainer.train()

    print('Saving model...')
    model.save_pretrained(os.path.join(output_dir, f"model_client_{i}_cluster_0_round_0"))

    print('Eval_posting model...')
    default_evaluation(
                    model=model,
                    tokenizer=tokenizer,
                    dataset=test_dataset,
                    client_id=i,
                    round=1, 
                    formatting_prompts_func=formatting_prompts_func,
                    script_args=None,
                    cluster_id=0,
                )
    
    print('Cleaning up...')
    del model
    del tokenizer
    torch.cuda.empty_cache()
    print('Done with client', i)
    

Model loaded from HuggingFaceTB/SmolLM-135M
trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414
Pad token is set to <pad>.
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<jupyter_script>', '<empty_output>']}
Evaluating Pretrained Model on Dataset({
    features: ['question', 'answer', 'passage', 'instruction', 'response', 'task', 'gem_id', 'gem_parent_id', 'input', 'target', 'references', 'category', 'webnlg_id', 'id', 'dialogue', 'summary', 'document'],
    num_rows: 1000
}) for task 0
Evaluating model...


  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [00:46<11:32, 46.17s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:26<10:00, 42.89s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [02:11<09:26, 43.54s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [02:52<08:31, 42.64s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [03:44<08:27, 46.12s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [04:27<07:29, 44.92s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [05:10<06:39, 44.43s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [06:06<06:25, 48.19s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [06:50<05:27, 46.83s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [07:33<04:33, 45.64s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [08:13<03:39, 43.86s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [08:54<02:51, 42.93s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [09:37<02:09, 43.14s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [10:18<01:24, 42.39s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [11:01<00:42, 42.52s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [11:35<00:00, 43.50s/it]


Evaluation scores: {'rouge1': np.float64(0.018740440061651008), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.018754550740704538), 'rougeLsum': np.float64(0.018737733563621926)}


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training model...


Step,Training Loss




Saving model...




Eval_posting model...
Evaluating model...


  0%|          | 0/16 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Generating responses for batch 1/16


  6%|▋         | 1/16 [00:45<11:26, 45.75s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:26<09:59, 42.84s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [02:11<09:27, 43.63s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [02:52<08:35, 42.93s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [03:45<08:28, 46.26s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [04:28<07:32, 45.30s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [05:12<06:44, 45.00s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [06:09<06:28, 48.55s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [06:54<05:32, 47.43s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [07:37<04:38, 46.33s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [08:19<03:43, 44.73s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [09:00<02:55, 43.79s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [09:45<02:11, 43.99s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [10:27<01:26, 43.39s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [11:10<00:43, 43.36s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [11:47<00:00, 44.24s/it]


Evaluation scores: {'rouge1': np.float64(0.046807553259916125), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.0467986680950422), 'rougeLsum': np.float64(0.04678239528162925)}
Cleaning up...
Done with client 0
Model loaded from HuggingFaceTB/SmolLM-135M
trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414
Pad token is set to <pad>.
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<jupyter_script>', '<empty_output>']}
Evaluating Pretrained Model on Dataset({
    features: ['question', 'answer', 'passage', 'instruction', 'response', 'task', 'gem_id', 'gem_parent_id', 'input', 'target', 'references', 'category', 

  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [00:34<08:35, 34.38s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:08<07:58, 34.20s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [01:43<07:27, 34.44s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [02:17<06:52, 34.36s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [02:51<06:18, 34.41s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [03:26<05:44, 34.49s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [04:01<05:10, 34.53s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [04:35<04:36, 34.59s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [05:10<04:02, 34.59s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [05:45<03:27, 34.64s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [06:19<02:52, 34.60s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [06:54<02:18, 34.62s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [07:29<01:43, 34.61s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [08:03<01:09, 34.66s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [08:39<00:34, 34.83s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [09:12<00:00, 34.52s/it]


Evaluation scores: {'rouge1': np.float64(0.11681687011132033), 'rouge2': np.float64(0.10458759847725464), 'rougeL': np.float64(0.11678476297742833), 'rougeLsum': np.float64(0.11662706764080588)}


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training model...


Step,Training Loss




Saving model...




Eval_posting model...
Evaluating model...


  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [00:37<09:21, 37.47s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:15<08:46, 37.63s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [01:53<08:10, 37.74s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [02:30<07:31, 37.64s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [03:08<06:55, 37.73s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [03:46<06:17, 37.71s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [04:24<05:39, 37.77s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [05:01<05:02, 37.78s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [05:39<04:23, 37.69s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [06:17<03:46, 37.75s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [06:55<03:09, 37.84s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [07:33<02:31, 37.88s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [08:11<01:53, 37.90s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [08:49<01:15, 37.99s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [09:27<00:37, 37.94s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [10:04<00:00, 37.79s/it]


Evaluation scores: {'rouge1': np.float64(0.04813122472056367), 'rouge2': np.float64(0.04251516738514409), 'rougeL': np.float64(0.048201112022375106), 'rougeLsum': np.float64(0.048107886176872006)}
Cleaning up...
Done with client 1
Model loaded from HuggingFaceTB/SmolLM-135M
trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414
Pad token is set to <pad>.
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<jupyter_script>', '<empty_output>']}
Evaluating Pretrained Model on Dataset({
    features: ['question', 'answer', 'passage', 'instruction', 'response', 'task', 'gem_id', 'gem_parent_id', 'input', 'target', 'referen

  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [01:01<15:19, 61.28s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:56<13:26, 57.59s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [02:46<11:42, 54.07s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [03:47<11:25, 57.12s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [04:35<09:49, 53.56s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [05:26<08:47, 52.75s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [06:20<07:59, 53.32s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [07:22<07:28, 56.07s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [08:23<06:41, 57.42s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [09:25<05:53, 58.84s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [10:18<04:46, 57.28s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [11:10<03:41, 55.48s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [12:21<03:00, 60.29s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [13:12<01:55, 57.52s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [14:15<00:59, 59.13s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [14:57<00:00, 56.12s/it]


Evaluation scores: {'rouge1': np.float64(0.270766649012103), 'rouge2': np.float64(0.26265589117276744), 'rougeL': np.float64(0.27113657414891246), 'rougeLsum': np.float64(0.2707205860225894)}


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training model...


Step,Training Loss




Saving model...




Eval_posting model...
Evaluating model...


  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [01:01<15:23, 61.55s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:56<13:29, 57.82s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [02:46<11:45, 54.28s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [03:49<11:28, 57.40s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [04:36<09:53, 53.95s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [05:28<08:50, 53.09s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [06:23<08:02, 53.64s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [07:25<07:31, 56.39s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [08:25<06:44, 57.72s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [09:28<05:54, 59.15s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [10:22<04:47, 57.58s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [11:14<03:43, 55.80s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [12:25<03:01, 60.53s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [13:16<01:55, 57.73s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [14:19<00:59, 59.38s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [15:03<00:00, 56.45s/it]


Evaluation scores: {'rouge1': np.float64(0.48943824062493857), 'rouge2': np.float64(0.4828134926479013), 'rougeL': np.float64(0.4896902814679358), 'rougeLsum': np.float64(0.48995357174975707)}
Cleaning up...
Done with client 2
Model loaded from HuggingFaceTB/SmolLM-135M
trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414
Pad token is set to <pad>.
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<jupyter_script>', '<empty_output>']}
Evaluating Pretrained Model on Dataset({
    features: ['question', 'answer', 'passage', 'instruction', 'response', 'task', 'gem_id', 'gem_parent_id', 'input', 'target', 'references'

  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [00:36<09:04, 36.33s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:12<08:30, 36.44s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [01:50<08:02, 37.13s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [02:27<07:24, 37.01s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [03:04<06:46, 36.95s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [03:40<06:07, 36.75s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [04:18<05:33, 37.11s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [04:55<04:56, 37.11s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [05:32<04:18, 36.96s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [06:08<03:40, 36.81s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [06:45<03:04, 36.89s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [07:25<02:30, 37.63s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [08:03<01:53, 37.79s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [08:40<01:15, 37.68s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [09:18<00:37, 37.63s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [09:52<00:00, 37.04s/it]


Evaluation scores: {'rouge1': np.float64(0.3083429943948779), 'rouge2': np.float64(0.29688067350224506), 'rougeL': np.float64(0.3081881722091883), 'rougeLsum': np.float64(0.3088582346306743)}


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training model...


Step,Training Loss




Saving model...




Eval_posting model...
Evaluating model...


  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [00:38<09:37, 38.47s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:16<08:57, 38.43s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [01:56<08:28, 39.12s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [02:35<07:49, 39.15s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [03:14<07:10, 39.09s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [03:53<06:30, 39.01s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [04:33<05:53, 39.31s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [05:12<05:14, 39.28s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [05:52<04:34, 39.22s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [06:31<03:55, 39.27s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [07:10<03:16, 39.29s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [07:52<02:39, 39.97s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [08:32<01:59, 39.89s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [09:11<01:19, 39.79s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [09:51<00:39, 39.84s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [10:29<00:00, 39.34s/it]


Evaluation scores: {'rouge1': np.float64(0.4289135046595898), 'rouge2': np.float64(0.41690350201907306), 'rougeL': np.float64(0.4287601031867354), 'rougeLsum': np.float64(0.4294298872694082)}
Cleaning up...
Done with client 3


In [17]:
#Mixturing all datasets

dataset.shuffle(seed=0)
dataset = dataset.select(range(10000))  # Limit to 400000 samples for training

model, tokenizer = get_model_and_tokenizer()

trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        args=training_args,
        #max_seq_length=script_args.seq_length,
        train_dataset=dataset,
        formatting_func=formatting_prompts_func,
        data_collator=data_collator,
        #packing=packing,
        #dataset_text_field=dataset_text_field,
    )

print('Training model...')
trainer.train()

print('Saving model...')
model.save_pretrained(os.path.join(output_dir, f"model_client_all_cluster_0_round_0"))

for i, test_dataset in enumerate(local_datasets_test):

    print('Eval_posting model...')
    default_evaluation(
                    model=model,
                    tokenizer=tokenizer,
                    dataset=test_dataset,
                    client_id='all',
                    round=1, 
                    formatting_prompts_func=formatting_prompts_func,
                    script_args=None,
                    cluster_id=i,
                )

print('Cleaning up...')
del model
del tokenizer
torch.cuda.empty_cache()
print('Done with client', i)

Model loaded from HuggingFaceTB/SmolLM-135M
trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414
Pad token is set to <pad>.
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<jupyter_script>', '<empty_output>']}


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training model...


Step,Training Loss




Saving model...




Eval_posting model...
Evaluating model...


  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [00:46<11:31, 46.07s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:27<10:04, 43.16s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [02:11<09:30, 43.90s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [02:54<08:39, 43.31s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [03:46<08:32, 46.63s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [04:30<07:37, 45.70s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [05:15<06:48, 45.40s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [06:12<06:31, 48.93s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [06:57<05:34, 47.75s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [07:41<04:40, 46.73s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [08:23<03:46, 45.22s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [09:05<02:57, 44.35s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [09:50<02:13, 44.58s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [10:33<01:27, 43.93s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [11:17<00:43, 43.92s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [11:54<00:00, 44.69s/it]


Evaluation scores: {'rouge1': np.float64(0.012700287551346738), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.012695290630433585), 'rougeLsum': np.float64(0.01265991766294191)}
Eval_posting model...
Evaluating model...


  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [00:37<09:25, 37.73s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:15<08:44, 37.50s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [01:52<08:08, 37.62s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [02:30<07:33, 37.83s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [03:09<06:58, 38.09s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [03:48<06:22, 38.26s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [04:26<05:45, 38.36s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [05:04<05:06, 38.33s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [05:43<04:28, 38.41s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [06:22<03:51, 38.52s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [07:00<03:12, 38.45s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [07:38<02:33, 38.42s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [08:17<01:55, 38.38s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [08:55<01:16, 38.39s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [09:34<00:38, 38.40s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [10:11<00:00, 38.24s/it]


Evaluation scores: {'rouge1': np.float64(0.2150591565479943), 'rouge2': np.float64(0.20074546676235783), 'rougeL': np.float64(0.21469820831416164), 'rougeLsum': np.float64(0.21485072059118804)}
Eval_posting model...
Evaluating model...


  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [01:01<15:21, 61.46s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:56<13:28, 57.72s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [02:46<11:45, 54.27s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [03:48<11:27, 57.30s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [04:36<09:53, 53.91s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [05:27<08:50, 53.04s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [06:22<08:02, 53.56s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [07:24<07:30, 56.29s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [08:25<06:43, 57.60s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [09:27<05:54, 59.00s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [10:21<04:47, 57.41s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [11:12<03:42, 55.61s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [12:24<03:01, 60.39s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [13:15<01:55, 57.59s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [14:18<00:59, 59.22s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [15:01<00:00, 56.32s/it]


Evaluation scores: {'rouge1': np.float64(0.1944023076568278), 'rouge2': np.float64(0.18790906077108666), 'rougeL': np.float64(0.1949347627479115), 'rougeLsum': np.float64(0.19432038995031337)}
Eval_posting model...
Evaluating model...


  0%|          | 0/16 [00:00<?, ?it/s]

Generating responses for batch 1/16


  6%|▋         | 1/16 [00:38<09:42, 38.85s/it]

Generating responses for batch 2/16


 12%|█▎        | 2/16 [01:18<09:09, 39.25s/it]

Generating responses for batch 3/16


 19%|█▉        | 3/16 [01:58<08:37, 39.82s/it]

Generating responses for batch 4/16


 25%|██▌       | 4/16 [02:38<07:56, 39.73s/it]

Generating responses for batch 5/16


 31%|███▏      | 5/16 [03:18<07:16, 39.68s/it]

Generating responses for batch 6/16


 38%|███▊      | 6/16 [03:57<06:35, 39.50s/it]

Generating responses for batch 7/16


 44%|████▍     | 7/16 [04:37<05:57, 39.77s/it]

Generating responses for batch 8/16


 50%|█████     | 8/16 [05:17<05:18, 39.83s/it]

Generating responses for batch 9/16


 56%|█████▋    | 9/16 [05:56<04:37, 39.71s/it]

Generating responses for batch 10/16


 62%|██████▎   | 10/16 [06:36<03:57, 39.54s/it]

Generating responses for batch 11/16


 69%|██████▉   | 11/16 [07:15<03:18, 39.62s/it]

Generating responses for batch 12/16


 75%|███████▌  | 12/16 [07:57<02:40, 40.10s/it]

Generating responses for batch 13/16


 81%|████████▏ | 13/16 [08:36<01:59, 39.96s/it]

Generating responses for batch 14/16


 88%|████████▊ | 14/16 [09:16<01:19, 39.95s/it]

Generating responses for batch 15/16


 94%|█████████▍| 15/16 [09:56<00:39, 39.96s/it]

Generating responses for batch 16/16


100%|██████████| 16/16 [10:34<00:00, 39.64s/it]


Evaluation scores: {'rouge1': np.float64(0.2978158291546483), 'rouge2': np.float64(0.29063551267582544), 'rougeL': np.float64(0.2973015274532096), 'rougeLsum': np.float64(0.2976449026477013)}
Cleaning up...
Done with client 3
