In [1]:
import sys
import os
from tqdm import tqdm
import numpy as np
import torch
sys.path.append(".")
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_dataset
from accelerate import Accelerator
from torch.utils.data import DataLoader
from utils.template import TEMPLATE_DICT
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Expose only physical GPU 1 to this process; inside the process it is then
# addressed as 'cuda:0'.
# NOTE(review): this assignment runs after `import torch`; it presumably still
# takes effect because CUDA has not been initialised yet -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
DEVICE = 'cuda:0'

# Identifiers handed to `from_pretrained` and `load_dataset` respectively.
MODEL_NAME = 'TinyLlama/TinyLlama_v1.1'
DATASET_NAME = "CohereForAI/aya_dataset"

# Upper bound on the number of evaluation samples kept per language.
EVALSET_LEN = 500

# First entry of the 'alpaca' template; not referenced by the visible cells.
template = TEMPLATE_DICT['alpaca'][0]

In [3]:
def state_model(path, round, cluster = -1):
    """Load the 4-bit quantised base model with one adapter checkpoint applied.

    Args:
        path: Run output directory that holds the adapter checkpoints.
        round: Round number used in the checkpoint directory name (the
            parameter name shadows the builtin ``round`` inside this function).
        cluster: ``-1`` (default) selects ``checkpoint-{round}``; any other
            value selects ``cluster_{cluster}_checkpoint-{round}``.

    Returns:
        ``(model, tokenizer)``: the ``PeftModel`` moved to ``DEVICE`` and the
        slow tokenizer loaded for ``MODEL_NAME``.
    """
    if cluster == -1:
        checkpoint_name = f'checkpoint-{round}'
    else:
        checkpoint_name = f'cluster_{cluster}_checkpoint-{round}'
    adapter_dir = path + '/' + checkpoint_name

    # NF4 4-bit weights with double quantisation.
    # NOTE(review): compute dtype is bfloat16 while `torch_dtype` below is
    # float16 -- confirm the mix is intentional.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        quantization_config=quantization,
        # Place the whole model on this process's local device index.
        device_map={"": Accelerator().local_process_index},
    )

    peft_model = PeftModel.from_pretrained(base_model, adapter_dir).to(DEVICE)
    # NOTE(review): `device` does not look like a tokenizer option -- confirm
    # whether it has any effect here.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False, device=DEVICE)

    return peft_model, tokenizer

In [4]:
def load_eval_data(DATASET_NAME, EVALSET_LEN, languages):
    """Build the held-out evaluation subset for the requested languages.

    The ``train`` split is first restricted to a fixed pool of five languages
    and split 80/20 with seed 0. Only the 20% test part is used; it is then
    narrowed to ``languages`` and capped at ``EVALSET_LEN`` rows.

    Args:
        DATASET_NAME: Dataset identifier passed to ``load_dataset`` (shadows
            the module-level constant of the same name).
        EVALSET_LEN: Maximum number of rows to return.
        languages: Collection of ``language`` values to keep.

    Returns:
        The first ``min(len(filtered), EVALSET_LEN)`` rows of the filtered
        test part.
    """
    # Pool that defines the population the train/test split is drawn from.
    pool_languages = ['English', 'Swedish', 'German', 'Portuguese', 'Spanish']

    full_train = load_dataset(DATASET_NAME, split="train")
    pooled = full_train.filter(lambda row: row['language'] in pool_languages)
    held_out = pooled.train_test_split(test_size=0.2, seed=0)['test']

    selected = held_out.filter(lambda row: row['language'] in languages)
    n_keep = min(len(selected), EVALSET_LEN)
    return selected.select(range(n_keep))

In [5]:
def calcule_loss_in_dataset(dataset, model, tokenizer, batch_size=8):
    """Average the model's language-modelling loss over ``dataset``.

    Each row's text is ``inputs`` immediately followed by ``targets``. Rows are
    scored in batches; every batch loss is weighted by the number of rows in
    that batch, so the result is a per-sample average. With ``batch_size=1``
    this is exactly the mean of the per-sample losses.

    Side effects: puts ``model`` in eval mode and, if the tokenizer has no pad
    token, sets it to the EOS token so that batches can be padded.

    Args:
        dataset: Dataset with string columns ``inputs`` and ``targets``.
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``.
        tokenizer: Tokenizer used to encode the texts.
        batch_size: Number of rows per forward pass.

    Returns:
        The sample-weighted mean loss as a float, or ``nan`` when the dataset
        is empty.
    """
    # create a new feature in dataset that represents the full text (input + targets)
    dataset = dataset.map(lambda x: {'text': x['inputs'] + x['targets']})
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Texts of different lengths can only be stacked into one tensor when
    # padded, and padding needs a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model.eval()
    weighted_loss = 0.0
    n_samples = 0
    for data in tqdm(dataloader):
        texts = data['text']
        encoded = tokenizer(texts, return_tensors='pt', padding=True)
        input_ids = encoded.input_ids.to(DEVICE)
        attention_mask = encoded.attention_mask.to(DEVICE)
        # -100 excludes padded positions from the loss. Masking by attention
        # mask (not by pad id) keeps genuine EOS tokens in the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask, labels=labels)
        # `output.loss` is already averaged within the batch; re-weight it by
        # the batch's sample count so dividing by the sample total is correct.
        weighted_loss += output.loss.item() * len(texts)
        n_samples += len(texts)

    if n_samples == 0:
        return float('nan')
    return weighted_loss / n_samples

In [6]:
def calculate_perplexity(instruction, output, model, tokenizer, device = DEVICE):
    """Score one instruction/response pair with the model's loss.

    Despite the name, the value returned is the raw ``loss`` reported by the
    model; it is not exponentiated into a perplexity.

    Args:
        instruction: Prompt text.
        output: Reference answer; joined to the prompt with a single space.
        model: Model called with ``labels`` equal to the input ids.
        tokenizer: Tokenizer; the text is truncated to 512 tokens.
        device: Device the model and the tensors are moved to.

    Returns:
        The loss for this sample as a Python float.
    """
    model = model.to(device)

    text = f"{instruction} {output}"
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)

    # Inference only: no gradients are needed to read the loss.
    with torch.no_grad():
        result = model(ids, attention_mask=mask, labels=ids)

    return result.loss.item()


In [7]:
def calculate_perplexit_in_dataset(dataset, model, tokenizer, device = DEVICE):
    """Score every sample of ``dataset`` and summarise the scores.

    Args:
        dataset: Iterable of rows with ``inputs`` and ``targets`` entries.
        model: Model to evaluate; switched to eval mode and moved to ``device``.
        tokenizer: Tokenizer forwarded to ``calculate_perplexity``.
        device: Target device.

    Returns:
        ``(mean, std)`` of the per-sample values returned by
        ``calculate_perplexity``.
    """
    model.eval().to(device)

    scores = [
        calculate_perplexity(row['inputs'], row['targets'], model, tokenizer, device)
        for row in tqdm(dataset)
    ]

    return np.mean(scores), np.std(scores)

def calculate_perplexity_in_dataset_in_batches(dataset, model, tokenizer, device = DEVICE, batch_size = 8):
    """Score every sample of ``dataset``, fetching rows through a ``DataLoader``.

    Rows are only *read* in batches (4 worker processes); each pair is still
    scored individually by ``calculate_perplexity``.

    Args:
        dataset: Dataset with ``inputs`` and ``targets`` columns.
        model: Model to evaluate; switched to eval mode and moved to ``device``.
        tokenizer: Tokenizer forwarded to ``calculate_perplexity``.
        device: Target device.
        batch_size: Number of rows fetched per ``DataLoader`` step.

    Returns:
        ``(mean, std)`` of the per-sample values returned by
        ``calculate_perplexity``.
    """
    model.eval().to(device)

    loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)

    scores = []
    for batch in tqdm(loader):
        pairs = zip(batch['inputs'], batch['targets'])
        scores.extend(
            calculate_perplexity(prompt, answer, model, tokenizer, device)
            for prompt, answer in pairs
        )

    return np.mean(scores), np.std(scores)

In [8]:
# Output directories of the two runs being compared.
PATH = [
    'output/aya_dataset_400000_clustered_c20s2_i10_b16a1_l512_r8a16_20240922070346',
    'output/aya_dataset_400000_clustered_c20s2_i10_b16a1_l512_r8a16_20240922062739',
]

# Rounds whose checkpoints are evaluated.
EVAL_ROUNDS = [1, 50, 100, 200]

# Rounds up to and including this one are loaded from `checkpoint-{round}`;
# later rounds are loaded from `cluster_{k}_checkpoint-{round}`.
sim_round = 50

# Registry of experiments; evaluation rows are appended to each 'results' list.
experiments = {
    '0': dict(path=PATH[0], type='fedavg', rounds=EVAL_ROUNDS, results=[]),
    '1': dict(path=PATH[1], type='clustered', rounds=EVAL_ROUNDS, results=[]),
}

In [9]:
languages = ['English', 'Swedish', 'German', 'Portuguese', 'Spanish']

# Cluster indices 0..N_CLUSTERS-1 are evaluated for 'clustered' experiments
# once the round is past `sim_round`.
N_CLUSTERS = 5

for lang in languages:

    print(f'Language: {lang}')
    dataset = load_eval_data(DATASET_NAME, EVALSET_LEN, [lang])

    for e in experiments:
        path = experiments[e]['path']
        exp_type = experiments[e]['type']

        # Only these two experiment types are evaluated; anything else is skipped.
        if exp_type not in ('fedavg', 'clustered'):
            continue

        for eval_round in experiments[e]['rounds']:
            # Up to `sim_round` the checkpoint is the shared `checkpoint-{round}`
            # (loaded with cluster=-1), so one evaluation recorded as cluster 0
            # is enough.
            shared_checkpoint = eval_round <= sim_round

            if exp_type == 'fedavg' or shared_checkpoint:
                clusters = [0]
            else:
                clusters = range(N_CLUSTERS)

            for cluster in clusters:
                # Decide what to load before loading it, so no checkpoint is
                # loaded only to be discarded.
                checkpoint_cluster = -1 if shared_checkpoint else cluster
                model, tokenizer = state_model(path, eval_round, cluster = checkpoint_cluster)

                mean_perplexity, std_perplexity = calculate_perplexity_in_dataset_in_batches(dataset, model, tokenizer)
                print(f'Round: {eval_round}, Path: {path}, Cluster: {cluster}, Type: {exp_type} , Mean Perplexity: {mean_perplexity}, Std Perplexity: {std_perplexity}')

                experiments[e]['results'].append({'lang': lang, 'round': eval_round, 'cluster': cluster, 'mean_perplexity': mean_perplexity, 'std_perplexity': std_perplexity})

Language: English


Filter: 100%|██████████| 202362/202362 [00:04<00:00, 49309.49 examples/s]
Filter: 100%|██████████| 3670/3670 [00:00<00:00, 41796.81 examples/s]


KeyboardInterrupt: 

In [None]:
# Persist the collected results of every experiment as JSON.
with open('loss_results.json', 'w') as results_file:
    json.dump(experiments, results_file)

In [None]:
import json

In [None]:
# Reload the saved results; the context manager closes the file handle, which
# a bare `open(...)` passed straight to `json.load` never does.
with open('loss_results.json', 'r') as results_file:
    experiments = json.load(results_file)

In [None]:
import pandas as pd

# Tabulate the rows recorded for experiment '0' (declared as the 'fedavg' run
# where `experiments` is built).
results_exp0 = experiments['0']['results']
pd.DataFrame(results_exp0)

Unnamed: 0,lang,round,cluster,mean_perplexity,std_perplexity
0,English,1,0,58.198952,148.875317
1,English,50,0,54.221013,154.054723
2,English,100,0,61.609743,208.040364
3,English,200,0,61.855342,202.30525
4,Swedish,1,0,285.80154,508.55445
5,Swedish,50,0,106.173652,148.762695
6,Swedish,100,0,123.683618,182.566626
7,Swedish,200,0,140.607875,215.76256
8,German,1,0,92.207688,164.752815
9,German,50,0,179.282025,676.9785


In [None]:
# Tabulate the rows recorded for experiment '1' (declared as the 'clustered'
# run where `experiments` is built).
results_exp1 = experiments['1']['results']
pd.DataFrame(results_exp1)

Unnamed: 0,lang,round,cluster,mean_perplexity,std_perplexity
0,English,1,0,58.66185,150.168873
1,English,50,0,54.245254,156.982793
2,English,100,0,53.552095,147.766082
3,English,100,1,54.887487,166.477956
4,English,100,2,58.114222,178.885767
5,English,100,3,62.612469,214.585635
6,English,100,4,61.49271,213.434596
7,English,200,0,53.275978,144.172591
8,English,200,1,58.445066,194.135281
9,English,200,2,60.158384,195.870439
