In [108]:
import sys
import os
from tqdm import tqdm
import numpy as np
import torch
sys.path.append(".")
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_dataset, concatenate_datasets
from accelerate import Accelerator
from torch.utils.data import DataLoader
from utils.template import TEMPLATE_DICT
import json
import pandas as pd

In [109]:
# Expose only GPU index 5 to this process.
# NOTE(review): this runs after `import torch` in the imports cell; it presumably still
# takes effect because CUDA has not been initialised yet, but setting it before the
# torch import is the safer order — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '5'

In [110]:
def load_data(DATASET_NAME, tasks, eval=False):
    """Load a supported dataset and return its train or held-out portion.

    Every branch derives its held-out portion from an 80/20 ``train_test_split``
    with ``seed=0`` of the hub ``train`` split (directly here, or inside the
    ``prepare_*`` helpers for ``'multitask'``).

    Args:
        DATASET_NAME: ``"databricks/databricks-dolly-15k"``,
            ``"CohereForAI/aya_dataset"`` or ``"multitask"``.
        tasks: Selection within the dataset. For Dolly, rows are kept when their
            ``category`` is ``in tasks``. For Aya, each entry is capitalised and
            matched against the ``language`` column. For ``'multitask'`` it is a
            task name or a collection of names among ``boolq``, ``webnlg``,
            ``samsum``, ``gigaword`` and ``all_tasks``; when several single-task
            names are given, only the first one in that order is loaded.
        eval: When true, return the 20% held-out portion instead of the 80% one.
            (The name shadows the builtin inside this function; it is kept so
            keyword callers keep working.)

    Returns:
        The formatted dataset. ``'multitask'`` results are additionally shuffled
        with ``seed=0``; ``all_tasks`` rows carry a ``task`` column.

    Raises:
        ValueError: If ``DATASET_NAME`` is unknown, or ``tasks`` names no known
            multitask task. Previously these cases silently returned ``None``.
    """
    if DATASET_NAME == "databricks/databricks-dolly-15k":
        dataset = load_dataset(DATASET_NAME, split="train")
        dataset = dataset.train_test_split(test_size=0.2, seed=0)
        dataset = dataset['test'] if eval else dataset['train']
        dataset = dataset.filter(lambda x: x['category'] in tasks)
        return dataset.map(dolly_format)

    if DATASET_NAME == "CohereForAI/aya_dataset":
        dataset = load_dataset(DATASET_NAME, split="train")
        # Restrict to the supported languages before splitting, so the split is
        # the same whichever subset of them is requested afterwards.
        languages = ['English', 'Swedish', 'German', 'Portuguese', 'Spanish']
        dataset = dataset.filter(lambda x: x['language'] in languages)
        dataset = dataset.train_test_split(test_size=0.2, seed=0)
        dataset = dataset['test'] if eval else dataset['train']
        wanted_languages = [task.capitalize() for task in tasks]
        dataset = dataset.filter(lambda x: x['language'] in wanted_languages)
        return dataset.map(aya_format)

    if DATASET_NAME == 'multitask':
        # Order matters: it is both the precedence for single-task requests and
        # the concatenation order for 'all_tasks'.
        task_loaders = (
            ('boolq', prepare_boolq),
            ('webnlg', prepare_webnlg),
            ('samsum', prepare_samsum),
            ('gigaword', prepare_gigaword),
        )

        # `name in tasks` covers both a plain string and a collection of names.
        for name, loader in task_loaders:
            if name in tasks:
                return loader(eval=eval).shuffle(seed=0)

        if 'all_tasks' in tasks:
            def _tagged(name, loader):
                # Label every row with the task it came from.
                return loader(eval=eval).map(
                    lambda x: {'instruction': x['instruction'], 'response': x['response'], 'task': name}
                )

            parts = [_tagged(name, loader) for name, loader in task_loaders]
            return concatenate_datasets(parts).shuffle(seed=0)

        raise ValueError(
            f"Unknown multitask task(s) {tasks!r}; expected one of "
            "'boolq', 'webnlg', 'samsum', 'gigaword', 'all_tasks'."
        )

    raise ValueError(f"Unknown dataset name: {DATASET_NAME!r}")

def prepare_webnlg(eval=False):
    """Return the English GEM/web_nlg data with ``webnlg_format`` applied.

    The hub ``train`` split is divided 80/20 with ``seed=0``; ``eval=True``
    selects the 20% portion, otherwise the 80% portion is used.
    """
    splits = load_dataset('GEM/web_nlg', 'en', split='train').train_test_split(test_size=0.2, seed=0)
    chosen = splits['test' if eval else 'train']
    return chosen.map(webnlg_format)

def prepare_boolq(eval=False):
    """Return the google/boolq data with ``boolq_format`` applied.

    The hub ``train`` split is divided 80/20 with ``seed=0``; ``eval=True``
    selects the 20% portion, otherwise the 80% portion is used.
    """
    splits = load_dataset('google/boolq', split='train').train_test_split(test_size=0.2, seed=0)
    chosen = splits['test' if eval else 'train']
    return chosen.map(boolq_format)

def prepare_samsum(eval=False):
    """Return the Samsung/samsum data with ``samsum_format`` applied.

    The hub ``train`` split is divided 80/20 with ``seed=0``; ``eval=True``
    selects the 20% portion, otherwise the 80% portion is used.
    """
    full_train = load_dataset('Samsung/samsum', split='train', trust_remote_code=True)
    splits = full_train.train_test_split(test_size=0.2, seed=0)
    chosen = splits['test' if eval else 'train']
    return chosen.map(samsum_format)

def prepare_gigaword(eval=False, max_samples=30000):
    """Return a capped, shuffled sample of Harvard/gigaword with ``gigaword_format`` applied.

    The hub ``train`` split is divided 80/20 with ``seed=0``; the chosen portion
    is shuffled with ``seed=0`` and cut to at most ``max_samples`` rows before
    formatting, so the map only runs over the retained rows.

    Args:
        eval: When true, use the 20% held-out portion instead of the 80% one.
        max_samples: Upper bound on the number of rows kept. Defaults to the
            previously hard-coded 30000. ``None`` keeps every row.

    Returns:
        The formatted dataset.
    """
    dataset = load_dataset('Harvard/gigaword', split='train', trust_remote_code=True)
    dataset = dataset.train_test_split(test_size=0.2, seed=0)
    dataset = dataset['test'] if eval else dataset['train']
    dataset = dataset.shuffle(seed=0)
    if max_samples is not None:
        # Clamp to the split size so a small split does not raise on selection.
        dataset = dataset.select(range(min(max_samples, len(dataset))))
    return dataset.map(gigaword_format)

def boolq_format(example):
    """Add ``instruction``/``response`` fields to a BoolQ row, in place.

    The instruction is the passage and the question joined by a bare hyphen (no
    instruction wording is added); the response is the answer rendered with
    ``str``. The same mapping object is returned.
    """
    passage, question = example['passage'], example['question']
    example["instruction"] = '-'.join((passage, question))
    example["response"] = str(example['answer'])
    return example

def webnlg_format(example):
    """Add ``instruction``/``response`` fields to a WebNLG row, in place.

    ``input`` is replaced by its ``str`` rendering, and that same string becomes
    the instruction; the response is the ``target`` field. The same mapping
    object is returned.
    """
    rendered_input = str(example['input'])
    example['input'] = rendered_input
    example["instruction"] = rendered_input
    example["response"] = example['target']
    return example

def samsum_format(example):
    """Copy ``dialogue`` to ``instruction`` and ``summary`` to ``response``, in place.

    No instruction wording is added around the dialogue. The same mapping object
    is returned.
    """
    for target_key, source_key in (("instruction", 'dialogue'), ("response", 'summary')):
        example[target_key] = example[source_key]
    return example

def gigaword_format(example):
    """Copy ``document`` to ``instruction`` and ``summary`` to ``response``, in place.

    No instruction wording is added around the document. The same mapping object
    is returned.
    """
    for target_key, source_key in (("instruction", 'document'), ("response", 'summary')):
        example[target_key] = example[source_key]
    return example

def dolly_format(example):
    """Set ``inputs`` on a Dolly row, in place, and return the same mapping.

    ``inputs`` is the instruction alone when ``context`` is the empty string,
    otherwise the instruction and the context separated by a single space.
    """
    context = example['context']
    prompt = example["instruction"]
    if context != "":
        prompt = prompt + " " + context
    example["inputs"] = prompt
    return example

def aya_format(example):
    """Copy ``inputs`` to ``instruction`` and ``targets`` to ``response``, in place.

    The same mapping object is returned.
    """
    for target_key, source_key in (("instruction", 'inputs'), ("response", 'targets')):
        example[target_key] = example[source_key]
    return example

# Alpaca-style prompt with three positional slots, filled in order by `.format`:
# instruction, response, and a trailing string (the callers in this notebook pass '').
# The literal keeps a single space after the instruction slot and after "### Response:".
alpaca_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{} 

### Response: {}{}"""

# Maps a template name to a (full prompt template, response-marker string) pair.
# NOTE(review): this rebinds the TEMPLATE_DICT imported from utils.template in the
# imports cell, so the imported value is unused from here on — confirm that is intended.
TEMPLATE_DICT = {
    'alpaca': (alpaca_template, '\n### Response:'),
}

def tokenize_function(examples):
    """Tokenise ``examples["inputs"]`` with the notebook-level ``tokenizer``.

    Sequences are padded and truncated to 512 tokens and returned as PyTorch
    tensors. ``labels`` is taken from the ``input_ids`` of a copy of the
    encoding. Each returned tensor has ``squeeze()`` applied.
    """
    encoded = tokenizer(
        examples["inputs"],
        return_tensors="pt",
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    label_source = encoded.copy()

    input_ids = encoded["input_ids"].squeeze()
    attention_mask = encoded["attention_mask"].squeeze()
    labels = label_source["input_ids"].squeeze()
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

def format_instruction(instruction, response, eos):
    """Render the Alpaca-style prompt.

    Args:
        instruction: Text placed under the ``### Instruction:`` header.
        response: Text placed after ``### Response: ``.
        eos: String appended directly after the response.

    Returns:
        The filled-in prompt string.
    """
    prompt_skeleton = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{} 

### Response: {}{}"""
    slots = (instruction, response, eos)
    return prompt_skeleton.format(*slots)

def apply_template_to_dataset(dataset):
    """Add an ``inputs`` column holding the Alpaca prompt for each row.

    The prompt is built from the row's ``instruction`` field, with the response
    and EOS slots left empty. The previous version passed the whole row to
    ``format_instruction``, which put the row's dict repr into the prompt
    instead of the instruction text.

    Args:
        dataset: Object exposing ``map``; rows must have an ``instruction`` field.

    Returns:
        The result of ``dataset.map`` with the added ``inputs`` column.
    """
    def _with_prompt(example):
        return {'inputs': format_instruction(example['instruction'], '', '')}

    return dataset.map(_with_prompt)

def get_formatting_prompts_func_test(template_name, eos_token):
    """Return a prompt-building function and the response marker for a template.

    Args:
        template_name: A key of ``TEMPLATE_DICT``, or ``'ag_news'``.
        eos_token: Not read by this function; kept so existing call sites keep
            working.

    Returns:
        ``(formatting_prompts_func, response_temp)``. For a ``TEMPLATE_DICT``
        entry, the function maps an example to the template filled with
        ``example['instruction']`` and empty response/EOS slots. For
        ``'ag_news'`` both values are ``None``.

    Raises:
        ValueError: If ``template_name`` is neither a ``TEMPLATE_DICT`` key nor
            ``'ag_news'``. Previously this case failed at the ``return`` with an
            ``UnboundLocalError``.
    """
    if template_name in TEMPLATE_DICT:
        overall_temp, response_temp = TEMPLATE_DICT[template_name]

        def formatting_prompts_func(example):
            return overall_temp.format(example['instruction'], '', '')

        return formatting_prompts_func, response_temp

    if template_name == 'ag_news':
        return None, None

    raise ValueError(
        f"Unknown template name {template_name!r}; expected 'ag_news' or one of {sorted(TEMPLATE_DICT)}"
    )

In [111]:
def load_model(path, MODEL_NAME, DEVICE = 'cuda'):
    """Load a 4-bit quantised base model, attach a PEFT adapter, and load its tokenizer.

    Args:
        path: Directory of the PEFT adapter checkpoint to attach.
        MODEL_NAME: Hub id or path of the base model and tokenizer.
        DEVICE: Device the adapter-wrapped model is moved to.

    Returns:
        ``(model, tokenizer)``. The tokenizer pads on the left and has a pad
        token whenever the tokenizer defines an EOS token.
    """
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    # NOTE(review): torch_dtype is float16 while the 4-bit compute dtype is
    # bfloat16 — confirm the mismatch is intended.
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
        device_map={"": Accelerator().local_process_index},
    )

    model = PeftModel.from_pretrained(base_model, path).to(DEVICE)
    # NOTE(review): `device` is passed through to the tokenizer unchanged; it is
    # presumably not needed there — confirm before removing.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, device=DEVICE, use_fast=False, padding_side="left")

    # Some tokenizers define no pad token. The padded tokenisation and the
    # explicit `pad_token_id` passed to `generate` elsewhere in this notebook
    # both need one, so fall back to EOS instead of leaving it unset.
    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [None]:
# Base model whose weights and tokenizer are loaded before the adapter is attached.
model_name  =  'unsloth/Llama-3.2-1B'
# NOTE(review): `round` rebinds the Python builtin for the rest of the kernel session.
# Both values are only interpolated into the first two model_path assignments below.
cluster, round = 3, 200

# NOTE(review): model_path is assigned three times in a row; only the last assignment
# (the Llama-3.2-1B clustered checkpoint) is the one passed to load_model. The two
# SmolLM-360M paths are overwritten before use.
model_path  =  f'/home/gabriel.talasso/FT_LLM_FL/output_multitask/SmolLM-360M/wo_formatting_fedavg_multitask_clustered_c20s5_i10_b16a1_l1024_r8a16_20250331161819/cluster_{cluster}_checkpoint-{round}'
model_path =   f'/home/gabriel.talasso/FT_LLM_FL/output_multitask/SmolLM-360M/wo_formatting_clustered_multitask_clustered_c20s5_i10_b16a1_l1024_r8a16_20250331134834/cluster_{cluster}_checkpoint-{round}'
model_path = '/home/gabriel.talasso/FT_LLM_FL/output_multitask/Llama-3.2-1B/clustered_multitask_clustered_c20s5_i10_b16a1_l1024_r8a16_20250401151553/cluster_0_checkpoint-200'
#model_path = '/home/gabriel.talasso/FT_LLM_FL/output_multitask/Llama-3.2-1B/fedavg_multitask_clustered_c20s5_i10_b16a1_l1024_r8a16_20250401152254/cluster_0_checkpoint-200'

# Loads the quantised base model plus the adapter at model_path, and the tokenizer.
model, tokenizer = load_model(model_path, model_name)

In [113]:
# Held-out BoolQ portion (eval=True), already carrying 'instruction' and 'response'.
dataset = load_data('multitask', 'boolq', eval=True)
# NOTE(review): the second argument fills the `eos_token` parameter, which the helper
# never reads; the response-marker string is passed here — confirm that is intended.
formatting_prompts_func, _ = get_formatting_prompts_func_test('alpaca', '\n### Response:')
# 'inputs' is the Alpaca prompt with an empty response slot; 'targets' is the gold response.
dataset = dataset.map(lambda x: {'inputs': formatting_prompts_func(x), 'targets': x['response']})
dataset = dataset.shuffle(seed=0)

In [114]:
# Keep only the first 10 examples of the shuffled evaluation set.
# Every metric computed in the cells below is therefore based on these 10 rows.
dataset = dataset.select(range(10))

In [115]:
def generate_response(model, tokenizer, dataset):
    """Greedily generate one completion per example.

    Args:
        model: Causal LM exposing ``eval()``, ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode each prompt and decode the output.
        dataset: Indexable collection whose rows have an ``inputs`` prompt string.

    Returns:
        A list with one decoded string per example. Each string is the decoded
        full output sequence with special tokens skipped, i.e. the prompt
        followed by up to 128 newly generated tokens.
    """
    model.eval()
    responses = []
    for i in tqdm(range(len(dataset))):
        prompt = dataset[i]['inputs']
        # `truncation=True` makes the 512-token cap explicit. Without it the
        # tokenizer warns and falls back to the same 'longest_first' strategy.
        # NOTE(review): a truncated prompt presumably loses its trailing
        # "### Response:" marker, which downstream answer extraction relies on —
        # confirm prompts stay under the cap.
        encoded = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **encoded,
                max_new_tokens=128,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )
        responses.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return responses

# Generate one completion per example and attach the results as a 'responses' column.
# NOTE(review): `dataset` is rebound with the extra column, so re-running this cell on
# its own would presumably try to add 'responses' a second time — confirm.
responses = generate_response(model, tokenizer, dataset)
dataset = dataset.add_column('responses', responses)

  0%|          | 0/10 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 10/10 [00:01<00:00,  7.25it/s]


In [116]:
# Display the raw generations; each string contains the full prompt followed by the model's answer.
responses

["Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nAn offensive strategy would be holding on to Wild and Wild Draw Four cards because they can be played near the end of the hand in order to go out (when it's harder to play a matching card). However, a defensive strategy would advise getting rid of such cards early, because they have a high point value.-can you hold a draw four in uno \n\n### Response:  False",
 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nBen Nevis (Scottish Gaelic: Beinn Nibheis, pronounced (peˈɲivəʃ); English: /bɛnˈnɛvɪs/) is the highest mountain in the British Isles, located in Scotland. Standing at 1,345 metres (4,411 ft) above sea level, it is located at the western end of the Grampian Mountains in the Lochaber area of the Scottish Highlands, close to the town of Fort William.-is ben nevis the highest moun

In [117]:
# ROUGE-1 between the full generated text and (prompt + gold target), over all examples.
# NOTE(review): both sides contain the same prompt, so most of the token overlap comes
# from the prompt itself and the score says little about the answer. The answer-only
# variant in a later cell is the more informative number.
from evaluate import load

rouge_metric = load("rouge")
# Full decoded sequences (prompt + generated continuation).
predictions = dataset['responses']
# Prompt concatenated with the gold response, to mirror the shape of the predictions.
references = [inp + tar for inp, tar in zip(dataset['inputs'], dataset['targets'])]
results = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)
rouge1 = results['rouge1']
rouge1

np.float64(0.9846728933444713)

In [118]:
# ROUGE-1 between the generated answer only (text after the response marker) and the gold target.
from evaluate import load

rouge_metric = load("rouge")

# Text between the first and second occurrence of the marker, as before. An output
# without the marker (e.g. a prompt cut by the tokenizer's length cap) now yields
# an empty prediction instead of raising IndexError.
response_marker = '### Response: '
predictions = [
    text.split(response_marker)[1] if response_marker in text else ''
    for text in dataset['responses']
]
references = list(dataset['targets'])

results = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)
rouge1 = results['rouge1']
rouge1

np.float64(0.5)

In [119]:
# Per-example ROUGE-1 precision / recall / F-measure on the full text (prompt + answer),
# averaged over the examples.
from rouge_score import rouge_scorer
import numpy as np

predictions = dataset['responses']
references = [inp + tar for inp, tar in zip(dataset['inputs'], dataset['targets'])]

scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
# Score each (reference, prediction) pair once and read all three statistics from
# the same result, instead of re-scoring every pair for each statistic.
rouge1_results = [scorer.score(ref, pred)['rouge1'] for ref, pred in zip(references, predictions)]
rouge1_scores_precision = [result.precision for result in rouge1_results]
rouge1_scores_recall = [result.recall for result in rouge1_results]
rouge1_scores = [result.fmeasure for result in rouge1_results]

rouge1_precision = np.mean(rouge1_scores_precision)
rouge1_recall = np.mean(rouge1_scores_recall)
rouge1_fmeasure = np.mean(rouge1_scores)

In [120]:
# Report the mean ROUGE-1 statistics computed in the previous cell (full-text comparison).
print(f"Rouge-1 Precision: {rouge1_precision}")
print(f"Rouge-1 Recall: {rouge1_recall}")
print(f"Rouge-1 F-measure: {rouge1_fmeasure}")

Rouge-1 Precision: 0.9754298178601584
Rouge-1 Recall: 0.9947377168503424
Rouge-1 F-measure: 0.9846728933444713


In [121]:
# Display the predictions used for the full-text comparison (prompt + generated answer).
predictions

["Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nAn offensive strategy would be holding on to Wild and Wild Draw Four cards because they can be played near the end of the hand in order to go out (when it's harder to play a matching card). However, a defensive strategy would advise getting rid of such cards early, because they have a high point value.-can you hold a draw four in uno \n\n### Response:  False",
 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nBen Nevis (Scottish Gaelic: Beinn Nibheis, pronounced (peˈɲivəʃ); English: /bɛnˈnɛvɪs/) is the highest mountain in the British Isles, located in Scotland. Standing at 1,345 metres (4,411 ft) above sea level, it is located at the western end of the Grampian Mountains in the Lochaber area of the Scottish Highlands, close to the town of Fort William.-is ben nevis the highest moun

In [122]:
# Per-example ROUGE-1 precision / recall / F-measure on the generated answer only,
# averaged over the examples.
from rouge_score import rouge_scorer
import numpy as np

# Text between the first and second occurrence of the marker, as before. An output
# without the marker now yields an empty prediction instead of raising IndexError.
response_marker = '### Response: '
predictions = [
    text.split(response_marker)[1] if response_marker in text else ''
    for text in dataset['responses']
]
references = list(dataset['targets'])

scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
# Score each (reference, prediction) pair once and read all three statistics from
# the same result, instead of re-scoring every pair for each statistic.
rouge1_results = [scorer.score(ref, pred)['rouge1'] for ref, pred in zip(references, predictions)]
rouge1_scores_precision = [result.precision for result in rouge1_results]
rouge1_scores_recall = [result.recall for result in rouge1_results]
rouge1_scores = [result.fmeasure for result in rouge1_results]

rouge1_precision = np.mean(rouge1_scores_precision)
rouge1_recall = np.mean(rouge1_scores_recall)
rouge1_fmeasure = np.mean(rouge1_scores)

print(f"Rouge-1 Precision: {rouge1_precision}")
print(f"Rouge-1 Recall: {rouge1_recall}")
print(f"Rouge-1 F-measure: {rouge1_fmeasure}")

Rouge-1 Precision: 0.5
Rouge-1 Recall: 0.5
Rouge-1 F-measure: 0.5


In [126]:
# Display the extracted answers (text after the response marker) that were scored above.
predictions

[' False',
 '1,345 m (4,411 ft)',
 '13 Reasons Why has a second season',
 ' Silicon Valley is a region in the southern San Francisco Bay Area of Northern California.',
 ' True',
 ' True',
 ' True',
 '',
 ' True',
 ' False']

In [123]:
# NOTE(review): bare numeric literal with no recorded provenance — presumably a score
# pasted from an earlier run; record the model and configuration that produced it.
0.967541864438604

0.967541864438604

In [124]:
# NOTE(review): bare numeric literals with no recorded provenance — presumably scores
# pasted from earlier runs. Only the last one is displayed as the cell output.
0.743116414089967
0.7240112512226433
0.743116414089967
0.8757830833830799

0.8757830833830799

Fedavg

Rouge-1 Precision: 0.6004595779970863

Rouge-1 Recall: 0.9911330291403916

Rouge-1 F-measure: 0.7386883505534914


Cluster 3

Rouge-1 Precision: 0.43991448171893444

Rouge-1 Recall: 1.0

Rouge-1 F-measure: 0.6084780128127207


In [125]:
# NOTE(review): bare numeric literals with no recorded provenance — presumably scores
# pasted from earlier runs. Only the last one is displayed as the cell output.
0.6191708966090461
0.5503595346990156

0.5503595346990156