In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
import json
import copy
import random
import time
import torch
from torch import nn, cuda, optim
from torch.utils.data import DataLoader
import torch.nn.init as init

device = 'cuda' if cuda.is_available() else 'cpu'
if device == 'cuda':
    torch.cuda.empty_cache()
print(device)
from itertools import islice

try:
    from torchcrf import CRF
except:
    !pip install pytorch-crf
    from torchcrf import CRF

from transformers import (
    BertModel,
    BertForTokenClassification,
    BertTokenizerFast,
    AutoTokenizer,
    AutoModelForTokenClassification,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    precision_recall_fscore_support,
    classification_report
)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
warnings.filterwarnings('ignore')

def generate_random_seed():
    return random.randint(1, 1000)

def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


cuda
/kaggle/input/unit-segmentation-lstm-transformers/we.csv
/kaggle/input/unit-segmentation-lstm-transformers/mix1.csv
/kaggle/input/unit-segmentation-lstm-transformers/pe.csv
/kaggle/input/unit-segmentation-lstm-transformers/abam.csv
/kaggle/input/unit-segmentation-lstm-transformers/mix2.csv
/kaggle/input/unit-segmentation-lstm-transformers/ug.csv


In [3]:
configuration = {
    'train_tasks': ['pe', 'we', 'abam'],
    'task_weighting': 'dwa', # dwa or equal
    'test': ['pe', 'we', 'abam', 'ug', 'mix1'],
    'runs': 10, # 10
    'epochs': 10, # 10
    'train_batch_size': 32,
    'dev_batch_size': 32,
    'test_batch_size': 32,
    'label_list': ['O', 'B', 'I'],
    'model_checkpoint': 'bert-base-uncased',
    'lr': 1e-4, 
    'T': 3.0, # temperature for dwa weighting
}

In [4]:
""" Tokenize examples in batch
Since the tokenizer may divide each token into two or more subtokens, we must align the new tokens with the original labels.
New subtokens must have the same label than their parent token
Labels may be 0, 1 or 2 for O, B and I labels, respectively, and -100 for complementary tokens, such PAD, SEP, CLS tokens.
Loss functions will ignore labels with value -100, so the loss only considers mistakes at the positions of real input (sub)tokens.
"""
def tokenize_and_align_labels(txts, lbls, tokenizer, max_len = 128, mapping = None):

    tokenized_inputs = tokenizer(txts, is_split_into_words=True,
                                 max_length = max_len, padding = 'max_length', truncation=True,
                                 return_tensors = 'pt')

    labels = []
    for i, label in enumerate(lbls):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        previous_label = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append('O')
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
                previous_label = label[word_idx]
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                new_label = 'O'
                if previous_label == 'O':
                    new_label = 'O'
                    # label_ids.append('O')
                else:
                    suffix_label = label[word_idx][1:]
                    new_label = 'I'+suffix_label
                label_ids.append(new_label)
                previous_label = new_label

            previous_word_idx = word_idx

        labels.append(label_ids)

    if mapping is not None:
        labels = [list(map(lambda x : mapping.index(x), x)) for x in labels]

    return tokenized_inputs, labels

"""
Return text tokens from the tokenizer given the numeric input ids
"""
def get_tokens_from_ids(input_ids):

    return [tokenizer.convert_ids_to_tokens(tl) for tl in input_ids]

"""
Remove part of predicted sequences corresponding to padding tokens.
"""
def remove_padding_from_predictions(predictions, batch_attention_mask):
    valid_predictions_list = []
    for instance_preds, att_mask in zip(predictions, batch_attention_mask):
        valid = [pred for pred, mask in zip(instance_preds, att_mask) if mask == 1]
        valid_predictions_list.append(valid[1:-1])
        
    return valid_predictions_list

def remove_padding_and_get_tokens(batch_ids, batch_attention_mask):
    valid_ids_list = []
    for instances_ids, att_mask in zip(batch_ids, batch_attention_mask):
        valid = [ids for ids, mask in zip(instances_ids, att_mask) if mask == 1]
        valid_ids_list.append(valid[1:-1])
    
    valid_tokens = get_tokens_from_ids(valid_ids_list)
    return valid_tokens

"""
Maps sequences of integer to sequences of BIO tags
"""
def integer_to_bio(labels, mapping):
    return [[mapping[int(x)] for x in l] for l in labels]

"""
Transforms list of predicted sequences to a flat list of labels.
"""
def flatten_predictions(labels):
    return [j for sub in labels for j in sub]

"""
Generates txt file with tokens, labels and predictions. Estilo FLAiR NLP.
"""
def generate_results_txt(tokens, labels, predictions, output_file_name):
    
    with open(output_file_name, 'w', encoding = 'utf-8') as nf:

        for tks, lbs, prds in zip(tokens, labels, predictions):
            for tk, lb, pr in zip(tks, lbs, prds):
                nf.write(f"{tk} {lb} {pr}\n")

            nf.write(f"\n")

"""
Dataset class for sequence labeling
"""
class SequenceLabelingDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, label_list):
        MAX_LEN = 128
        lb = [x.split() for x in df.labels.values.tolist()]
        txt = [i.split() for i in df.tokens.values.tolist()]
        self.encodings, self.labels = tokenize_and_align_labels(txt,
                                                                lb,
                                                                tokenizer,
                                                                max_len = MAX_LEN,
                                                                mapping = label_list)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
class MultitaskDataLoader(torch.utils.data.DataLoader):

    def __init__(self, dataloaders):
        self.dataloaders = dataloaders
        self.min_length = min([len(d) for d in dataloaders])
        self.lengths = [min(len(d), self.min_length) for d in dataloaders]
        self.iterators = None
        indices = [[i] * v for i, v in enumerate(self.lengths)]
        self.task_indices = sum(indices, [])

    def _reset(self):
        random.shuffle(self.task_indices)
        self.current_index = 0

    def __iter__(self):
        self._reset()
        self.iterators = [iter(d) for d in self.dataloaders]
        return self

    def __len__(self):
        return sum(self.lengths)

    def __next__(self):
        if self.current_index < len(self.task_indices):
            task_index = self.task_indices[self.current_index]
            batch = next(self.iterators[task_index])
            self.current_index += 1
            return batch, task_index
        else:
            raise StopIteration
            
def initialize_crf_layer(layer):
    for name, param in layer.named_parameters():
        if "transitions" in name:
            # Initialize transitions matrix with distinct values
            init.uniform_(param, a=0.5, b=1.5)
    
    
class MultiTaskBERTCRFModel(nn.Module):
    def __init__(self, model_checkpoint, num_labels = 3):
        super(MultiTaskBERTCRFModel, self).__init__()
        self.model_checkpoint = model_checkpoint
        self.num_labels = num_labels
        
        self.transf = AutoModelForTokenClassification.from_pretrained(
            self.model_checkpoint, 
            num_labels = self.num_labels
        )
        
        self.crf_1 = CRF(self.num_labels, batch_first=True)
        self.crf_2 = CRF(self.num_labels, batch_first=True)
        self.crf_3 = CRF(self.num_labels, batch_first=True)
        
        # Initialize each CRF layer using the custom method
        initialize_crf_layer(self.crf_1)
        initialize_crf_layer(self.crf_2)
        initialize_crf_layer(self.crf_3)

    def forward(self, input_ids, attention_mask, token_type_ids = None, labels = None, task_id = 0):
        task_id = 0 if task_id > 2 else task_id
        
        if token_type_ids is not None:
            outputs = self.transf(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
        else:
            outputs = self.transf(input_ids = input_ids, attention_mask = attention_mask)
            
        logits = outputs.logits
        
        if task_id == 0:
            if labels is not None:
                loss = -self.crf_1(logits, labels, mask=attention_mask.byte(), reduction = 'token_mean')
                decoding = self.crf_1.decode(logits)
                return loss, decoding
            else:
                return self.crf_1.decode(logits)
        elif task_id == 1:
            if labels is not None:
                loss = -self.crf_2(logits, labels, mask=attention_mask.byte(), reduction = 'token_mean')
                decoding = self.crf_2.decode(logits)
                return loss, decoding
            else:
                return self.crf_2.decode(logits)
        elif task_id == 2:
            if labels is not None:
                loss = -self.crf_3(logits, labels, mask=attention_mask.byte(), reduction = 'token_mean')
                decoding = self.crf_3.decode(logits)
                return loss, decoding
            else:
                return self.crf_3.decode(logits)


In [5]:
def load_training_data(tasks, tokenizer, TRAIN_BATCH_SIZE = 32, DEV_BATCH_SIZE = 32):
    
    dataloaders = []
    
    min_train_instances = float("inf")
    min_dev_instances = float("inf")
    
    train_dfs, dev_dfs = [], []
    
    for df_name in tasks:
        df = pd.read_csv(f'/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv')
        train_seq_df, dev_seq_df = None, None
        if 'dev' in df.set.unique():
            train_seq_df = df.loc[df['set'] == 'train']
            dev_seq_df = df.loc[df['set'] == 'dev']
        else:
            train_seq_df = df.loc[df['set'] == 'train']
            train_seq_df, dev_seq_df = train_test_split(train_seq_df, test_size = 0.1, random_state = 2023)
            
        train_dfs.append(train_seq_df)
        dev_dfs.append(dev_seq_df)
            
        if len(train_seq_df) < min_train_instances:
            min_train_instances = len(train_seq_df)
        if len(dev_seq_df) < min_dev_instances:
            min_dev_instances = len(dev_seq_df)
            
    for train_seq_df, dev_seq_df in zip(train_dfs, dev_dfs):
        train_seq_df = train_seq_df.sample(n = min_train_instances)
        dev_seq_df = dev_seq_df.sample(n = min_dev_instances)
        print(train_seq_df.shape, dev_seq_df.shape)
        
        # PYTORCH DATASETS
        train_dataset = SequenceLabelingDataset(train_seq_df, tokenizer, label_list)
        val_dataset = SequenceLabelingDataset(dev_seq_df, tokenizer, label_list)

        # DATALOADERS
        train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=DEV_BATCH_SIZE, shuffle=True)
        
        dataloaders.append((train_loader, val_loader))
        
    return dataloaders


# def load_training_data(tasks, tokenizer, TRAIN_BATCH_SIZE = 32, DEV_BATCH_SIZE = 32):
    
#     dataloaders = []
#     min_train_instances = float("inf")
#     min_dev_instances = float("inf")
    
#     for df_name in tasks:
#         df = pd.read_csv(f'/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv')
        
#         train_seq_df = df.loc[df['set'] == 'train']
#         train_seq_df, dev_seq_df = train_test_split(train_seq_df, test_size = 0.1, random_state = 2023)
        
#         if len(train_seq_df) < min_train_instances:
#             min_train_instances = len(train_seq_df)
#         if len(dev_seq_df) < min_dev_instances:
#             min_dev_instances = len(dev_seq_df)
                
#     for i, df_name in enumerate(tasks, start = 1):
    
#         # SEQUENCE LABELING DATASET
#         df = pd.read_csv(f'/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv')

#         train_seq_df, dev_seq_df = None, None

#         if 'dev' in df.set.unique():
#             train_seq_df = df.loc[df['set'] == 'train']
#             dev_seq_df = df.loc[df['set'] == 'dev']
#         else:
#             train_seq_df = df.loc[df['set'] == 'train']
#             train_seq_df, dev_seq_df = train_test_split(train_seq_df, test_size = 0.1, random_state = 2023)
        
#         train_seq_df = train_seq_df.sample(n = min_train_instances)
#         dev_seq_df = dev_seq_df.sample(n = min_dev_instances)

#         print(train_seq_df.shape, dev_seq_df.shape)
    
#         # PYTORCH DATASETS
#         train_dataset = SequenceLabelingDataset(train_seq_df, tokenizer, label_list)
#         val_dataset = SequenceLabelingDataset(dev_seq_df, tokenizer, label_list)

#         # DATALOADERS
#         train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
#         val_loader = DataLoader(val_dataset, batch_size=DEV_BATCH_SIZE, shuffle=True)
        
#         dataloaders.append((train_loader, val_loader))
    
#     return dataloaders
    
def load_testing_data(df_name, tokenizer, TEST_BATCH_SIZE = 32):

    # SEQUENCE LABELING DATASET
    df = pd.read_csv(f'/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv')
    
    test_seq_df = df.loc[df['set'] == 'test']
    
    # PYTORCH DATASETS
    test_dataset = SequenceLabelingDataset(test_seq_df, tokenizer, label_list)
    
    # DATALOADERS
    test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False)
    
    return test_loader

In [6]:
def train_model(model, train_loader, optimizer, lambda_weight, epoch):

    model.train()

    train_loss = 0
    cost_list = np.zeros(3, dtype=np.float32) # ver 3
    num_batches_per_task = np.zeros(3, dtype=np.int32) # ver 3

    for batch, task_id in train_loader:
        
        batch = tuple(v.to(device) for t, v in batch.items())
        loss, outputs = None, None
        
        if model_checkpoint.startswith('bert'):
            batch_input_ids, batch_token_type_ids, batch_attention_mask, batch_labels = batch
            loss, outputs = model(batch_input_ids, 
                                  token_type_ids = batch_token_type_ids,
                                  attention_mask = batch_attention_mask, 
                                  labels = batch_labels,
                                    task_id = task_id)
        else:
            batch_input_ids, batch_attention_mask, batch_labels = batch
            loss, outputs = model(batch_input_ids, 
                                  attention_mask = batch_attention_mask, 
                                  labels = batch_labels, task_id = task_id)

        
        loss = lambda_weight[task_id, epoch] * loss

        train_loss += loss.item()
        
        cost_list[task_id] += loss.item()
        num_batches_per_task[task_id] += 1
        
        # backprop
        optimizer.zero_grad()
        
        loss.backward()
        
#         print("TASK", task_id)
        
#         # Print gradients for CRF layers only
#         for name, param in model.named_parameters():
#             if 'crf' in name and param.grad is not None:
#                 print(f"Parameter {name}:")
#                 print(param.grad)
#             else:
#                 print(f"Parameter {name} has no gradient")
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)

        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)
    avg_cost = [round((w/t), 5) for w, t in zip(cost_list, num_batches_per_task)]
    return avg_train_loss, avg_cost

def evaluate_model(model, dataloader, lambda_weight, epoch):

    model.eval()

    eval_loss = 0
    cost_list = np.zeros(3, dtype=np.float32)
    num_batches_per_task = np.zeros(3, dtype=np.int32)
    eval_labels, eval_predictions = [], []
    
    with torch.no_grad():
        for batch, task_id in dataloader:
            batch = tuple(v.to(device) for t, v in batch.items())
            loss, outputs = None, None
            
            if model_checkpoint.startswith('bert'):
                batch_input_ids, batch_token_type_ids, batch_attention_mask, batch_labels = batch
                loss, outputs = model(batch_input_ids, 
                                      token_type_ids = batch_token_type_ids,
                                      attention_mask = batch_attention_mask, 
                                      labels = batch_labels)
            else:
                batch_input_ids, batch_attention_mask, batch_labels = batch
                loss, outputs = model(batch_input_ids, 
                                      attention_mask = batch_attention_mask, 
                                      labels = batch_labels)
            
            loss = lambda_weight[task_id, epoch] * loss
            eval_loss += loss.item()
            
            cost_list[task_id] += loss.item()
            num_batches_per_task[task_id] += 1
            
            valid_labels = remove_padding_from_predictions(batch_labels.detach().cpu().numpy(), batch_attention_mask.detach().cpu().numpy())
            eval_labels += valid_labels
            
            valid_predictions = remove_padding_from_predictions(outputs, batch_attention_mask.detach().cpu().numpy())
            eval_predictions += valid_predictions
    
    flattened_labels = flatten_predictions(eval_labels)
    flattened_predictions = flatten_predictions(eval_predictions)
    
    eval_f1 = f1_score(flattened_labels, flattened_predictions, average = 'macro')
    avg_cost = [round((w/t), 5) for w, t in zip(cost_list, num_batches_per_task)]
    return eval_loss / len(dataloader), avg_cost, eval_f1

def test_model(model, dataloader, task_id, write_files = False, path_file = None):
    
    macro_f1_in = 0
    
    testing_info = []
    
    for crf_i in range(3):

        model.eval()

        eval_tokens, eval_labels, eval_predictions = [], [], []

        with torch.no_grad():
            for batch in dataloader:
                batch = tuple(v.to(device) for t, v in batch.items())
                loss, outputs = None, None

                if model_checkpoint.startswith('bert'):
                    batch_input_ids, batch_token_type_ids, batch_attention_mask, batch_labels = batch
                    _, outputs = model(batch_input_ids, 
                                       token_type_ids = batch_token_type_ids,
                                       attention_mask = batch_attention_mask, 
                                       labels = batch_labels,
                                       task_id = crf_i)
                else:
                    batch_input_ids, batch_attention_mask, batch_labels = batch
                    _, outputs = model(batch_input_ids, 
                                       attention_mask = batch_attention_mask, 
                                       labels = batch_labels,
                                       task_id = crf_i)

                valid_labels = remove_padding_from_predictions(batch_labels.detach().cpu().numpy(), batch_attention_mask.detach().cpu().numpy())
                eval_labels += valid_labels

                valid_predictions = remove_padding_from_predictions(outputs, batch_attention_mask.detach().cpu().numpy())
                eval_predictions += valid_predictions

                valid_tokens = remove_padding_and_get_tokens(batch_input_ids.detach().cpu().numpy(), 
                                                             batch_attention_mask.detach().cpu().numpy())
                eval_tokens += valid_tokens
            
        flattened_labels = flatten_predictions(eval_labels)
        flattened_predictions = flatten_predictions(eval_predictions)
        report_info = classification_report(flattened_labels, flattened_predictions, target_names = label_list, output_dict = True)
        accuracy, macro_f1 = report_info['accuracy'], report_info['macro avg']['f1-score']
        o_f1, b_f1, i_f1 =  report_info['O']['f1-score'], report_info['B']['f1-score'], report_info['I']['f1-score']
        testing_info.append((len(flattened_predictions), accuracy, macro_f1, o_f1, b_f1, i_f1, crf_i))

        if crf_i == task_id:
            macro_f1_in = macro_f1
        
        if write_files:
            generate_results_txt(eval_tokens, eval_labels, eval_predictions, path_file + f'-CRF{crf_i}.txt')
    
    return testing_info, macro_f1_in

In [7]:
# MODEL CONFIGURATION
model_checkpoint = configuration['model_checkpoint']
model_name = model_checkpoint.split('-')[0]

label_list = configuration['label_list']
num_labels = len(label_list)

# TRAINING CONFIGURATION
RUNS = configuration['runs']
EPOCHS = configuration['epochs']

TRAIN_BATCH_SIZE = configuration['train_batch_size']
TEST_BATCH_SIZE = configuration['test_batch_size']
DEV_BATCH_SIZE = configuration['dev_batch_size']

train_tasks = configuration['train_tasks']
test_dfs = configuration['test']

weighting_strategy = configuration['task_weighting']
T = configuration['T'] # temperature for DWA

# MISCELLANEOUS
SAVE_INFORMATION = False
SAVE_BEST_MODEL = False

# TOKENIZER
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# ======================================== #
training_info, testing_results, models_info = [], [], []
start_time = time.time()
tasks_weights_in_training = []

for nrun in range(RUNS):
    # Initialize
    rs = generate_random_seed()
    set_random_seed(rs)
    
    best_eval_loss = float('inf')
    best_epoch = 0
    best_model_state = None
    
    # Dataloaders
    loaders = load_training_data(train_tasks, tokenizer, 
                                 TRAIN_BATCH_SIZE=TRAIN_BATCH_SIZE, 
                                 DEV_BATCH_SIZE=DEV_BATCH_SIZE)
    
    t1_train_loader, t1_dev_loader = loaders[0]
    t2_train_loader, t2_dev_loader = loaders[1]
    t3_train_loader, t3_dev_loader = loaders[2]
    
    print(len(t1_train_loader), len(t2_train_loader), len(t3_train_loader))
    print(len(t1_dev_loader), len(t2_dev_loader), len(t3_dev_loader))

    combined_dataloader = MultitaskDataLoader([t1_train_loader, t2_train_loader, t3_train_loader])
    combined_dev_dataloader = MultitaskDataLoader([t1_dev_loader, t2_dev_loader, t3_dev_loader])
    
    print(len(combined_dataloader), len(combined_dev_dataloader))
        
    # Create model
    tagger = MultiTaskBERTCRFModel(model_checkpoint, num_labels = num_labels)
    tagger.to(device)
    
    # OPTIMIZER
    optimizer = torch.optim.AdamW(tagger.parameters(), lr = configuration['lr'], eps = 1e-8)
    
    avg_cost = np.zeros([EPOCHS, 3], dtype=np.float32) # ver 3
    lambda_weight = np.ones([3, EPOCHS])

    # Training loop
    for index in range(EPOCHS):
        print(f"{index+1}/{EPOCHS}")
                
        if weighting_strategy == 'dwa':
            if index == 0 or index == 1:
                lambda_weight[:, index] = 1.0
            else:
                w_1 = avg_cost[index - 1, 0] / avg_cost[index - 2, 0]
                w_2 = avg_cost[index - 1, 1] / avg_cost[index - 2, 1]
                w_3 = avg_cost[index - 1, 2] / avg_cost[index - 2, 2]
                lambda_weight[0, index] = 3 * np.exp(w_1 / T) / (np.exp(w_1 / T) + np.exp(w_2 / T) + np.exp(w_3 / T))
                lambda_weight[1, index] = 3 * np.exp(w_2 / T) / (np.exp(w_1 / T) + np.exp(w_2 / T) + np.exp(w_3 / T))
                lambda_weight[2, index] = 3 * np.exp(w_3 / T) / (np.exp(w_1 / T) + np.exp(w_2 / T) + np.exp(w_3 / T))
                
        elif weighting_strategy == 'equal':
            lambda_weight[:, index] = 1.0
        
        train_loss, train_costs = train_model(tagger, combined_dataloader, optimizer, lambda_weight, index)
        avg_cost[index, :] = train_costs
                
        # evaluate model
        eval_loss, eval_costs, eval_f1 = evaluate_model(tagger, combined_dev_dataloader, lambda_weight, index)
        
        training_info.append((nrun, rs, weighting_strategy, index, train_loss, eval_loss, eval_f1)) # nrun, epoch, train_loss, eval_loss, eval_f1
        
        # save best model based on validation loss
        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            best_epoch = index
            best_model_state = copy.deepcopy(tagger.state_dict())
            
        print("Train loss:", train_loss, " - Eval loss:", eval_loss)
            
    print(f"Best epoch: {best_epoch} - Validation loss: {best_eval_loss} [Run: {nrun}]")
        
    avg_costs_dataframe = pd.DataFrame(avg_cost, columns = train_tasks)
    avg_costs_dataframe['epoch'] = [i for i in range(EPOCHS)]
    avg_costs_dataframe['run'] = nrun
    avg_costs_dataframe.to_csv(f'avgcosts-mtl-{weighting_strategy}-{model_name}-{nrun}.csv', index = False)
    lambda_weights_dataframe = pd.DataFrame(lambda_weight, columns = [f'epoch_{i}' for i in range(EPOCHS)])
    lambda_weights_dataframe['task'] = train_tasks
    lambda_weights_dataframe['run'] = nrun
    lambda_weights_dataframe.to_csv(f'weigths-mtl-{weighting_strategy}-{model_name}-{nrun}.csv', index = False)
    
    # Testing
    # Loading best model
    best_tagger = None
    best_tagger = MultiTaskBERTCRFModel(model_checkpoint, num_labels = num_labels)
    best_tagger.load_state_dict(best_model_state)
    best_tagger.to(device)
    eval_results = evaluate_model(best_tagger, combined_dev_dataloader, lambda_weight, best_epoch)
    print(eval_results) # check model
    
    macros = []
        
    for test_i, test_df in enumerate(test_dfs[:3]):
        test_loader = load_testing_data(test_df, tokenizer, TEST_BATCH_SIZE = TEST_BATCH_SIZE)
        path_test_results_file = f'results-mtl-{weighting_strategy}-{test_df}-{model_name}-{nrun}'
        
        testing_info, macro_f1_in = test_model(best_tagger, test_loader, test_i, write_files = True, path_file = path_test_results_file)
        macros.append(macro_f1_in)
        testing_info = [[nrun, test_df, weighting_strategy] + list(l) for l in testing_info]
        testing_results += testing_info
        
    print(f"Test Macros F1: {test_dfs[:3]}: {macros} [Run: {nrun}]")
    
    if SAVE_BEST_MODEL:
        model_path = f"model-mtl-{weighting_strategy}-{model_name}-{nrun}.pt"
        models_info.append((nrun, model_path))
        if best_model_state is not None:
            torch.save(best_model_state, model_path)
        else:
            torch.save(tagger.state_dict(), model_path)
    
# Save data
if SAVE_INFORMATION:
    models_file_name = f"models-mtl-{weighting_strategy}-{model_name}.csv"
    pd.DataFrame(models_info, columns = ['run', 'model_file']).to_csv(models_file_name, index = False)

    train_file_name = f'train-info-mtl-{weighting_strategy}-{model_name}.csv'
    pd.DataFrame(training_info, columns = ['run', 'seed', 'weighting', 'epoch', 'train_loss', 'eval_loss', 'eval_f1']).to_csv(train_file_name, index = False)

    test_file_name = f'test-info-mtl-{weighting_strategy}-{model_name}.csv'
    pd.DataFrame(testing_results, columns = ['run', 'test', 'weighting', 'sequences', 'accuracy', 'macro-f1', 'O-f1', 'B-f1', 'I-f1', 'crf_number']).to_csv(test_file_name, index = False)

print(f"Total time: {((time.time() - start_time)//60)+1} minutes.")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.44550098677476246  - Eval loss: 0.30690746899280286
2/10
Train loss: 0.2806907516469558  - Eval loss: 0.3860560388291358
3/10
Train loss: 0.16045961274765433  - Eval loss: 0.44689356561543214
4/10
Train loss: 0.09400857888860628  - Eval loss: 0.5031002562532699
5/10
Train loss: 0.06445036849996541  - Eval loss: 0.6319211005078008
6/10
Train loss: 0.05173601178297152  - Eval loss: 0.6103593868526028
7/10
Train loss: 0.03513491220030119  - Eval loss: 0.6727559263688616
8/10
Train loss: 0.03450004195192984  - Eval loss: 0.6918535090864881
9/10
Train loss: 0.026216724421828985  - Eval loss: 0.7900057955654726
10/10
Train loss: 0.02321261842303405  - Eval loss: 0.7264539451377156
Best epoch: 0 - Validation loss: 0.30690746899280286 [Run: 0]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.3077634213388794, [0.34187, 0.1713, 0.41013], 0.7769783553714902)
Test Macros F1: ['pe', 'we', 'abam']: [0.8422613134611119, 0.7702394298907072, 0.7435949583191984] [Run: 0]
(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.3126148347556591  - Eval loss: 0.22415619767788383
2/10
Train loss: 0.1909985652503868  - Eval loss: 0.2249113597596685
3/10
Train loss: 0.11216709988812605  - Eval loss: 0.4574094511982467
4/10
Train loss: 0.06454714872253438  - Eval loss: 0.4774934549867693
5/10
Train loss: 0.04678307459418041  - Eval loss: 0.36700395799966323
6/10
Train loss: 0.03520233413573199  - Eval loss: 0.3917095427799116
7/10
Train loss: 0.026660833528536994  - Eval loss: 0.45284169888408443
8/10
Train loss: 0.021338330280632364  - Eval loss: 0.4420227291516817
9/10
Train loss: 0.019200407678241996  - Eval loss: 0.5053682688281292
10/10
Train loss: 0.017358188151993092  - Eval loss: 0.535389780045888
Best epoch: 0 - Validation loss: 0.22415619767788383 [Run: 1]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.2368345713799095, [0.31361, 0.09698, 0.29991], 0.7921509286323133)
Test Macros F1: ['pe', 'we', 'abam']: [0.8080804747562039, 0.8339199059540054, 0.7421149812247871] [Run: 1]
(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.42023673601448536  - Eval loss: 0.3014395504982935
2/10
Train loss: 0.2621694042844077  - Eval loss: 0.29523990743069184
3/10
Train loss: 0.15384310921809324  - Eval loss: 0.32482142829879496
4/10
Train loss: 0.08928708138875663  - Eval loss: 0.48083058984613875
5/10
Train loss: 0.06992537755907202  - Eval loss: 0.5901630664011464
6/10
Train loss: 0.04657700938648001  - Eval loss: 0.5379894433984495
7/10
Train loss: 0.04093290744504581  - Eval loss: 0.6246826580285819
8/10
Train loss: 0.031211521718457032  - Eval loss: 0.6891202016397275
9/10
Train loss: 0.03394281615037471  - Eval loss: 0.700527346563629
10/10
Train loss: 0.029198422427337695  - Eval loss: 0.7729037159006111
Best epoch: 1 - Validation loss: 0.29523990743069184 [Run: 2]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.3493618124889003, [0.53711, 0.14184, 0.36914], 0.8150114982325025)
Test Macros F1: ['pe', 'we', 'abam']: [0.8278354460211319, 0.8185976667282882, 0.7596384741669647] [Run: 2]
(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.3501421345522006  - Eval loss: 0.24860207133719492
2/10
Train loss: 0.21771106708794832  - Eval loss: 0.28005715273320675
3/10
Train loss: 0.13145439279576143  - Eval loss: 0.289727309692858
4/10
Train loss: 0.07796900941214213  - Eval loss: 0.3644715774312822
5/10
Train loss: 0.04820972866378725  - Eval loss: 0.3799394837211973
6/10
Train loss: 0.03857670333003625  - Eval loss: 0.38838657189787934
7/10
Train loss: 0.03372365671636847  - Eval loss: 0.5592354876070103
8/10
Train loss: 0.026279408061139597  - Eval loss: 0.5082901803139571
9/10
Train loss: 0.02351268137562632  - Eval loss: 0.593562066115232
10/10
Train loss: 0.020355944226127274  - Eval loss: 0.581841959950705
Best epoch: 0 - Validation loss: 0.24860207133719492 [Run: 3]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.2721691444102261, [0.31298, 0.12573, 0.3778], 0.8113280531227373)
Test Macros F1: ['pe', 'we', 'abam']: [0.8506694511073448, 0.8020140960627321, 0.7648885492863386] [Run: 3]
(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.3584136017970741  - Eval loss: 0.38599679836382467
2/10
Train loss: 0.21994977737466495  - Eval loss: 0.2919697820778108
3/10
Train loss: 0.13223541631673774  - Eval loss: 0.34949722617036766
4/10
Train loss: 0.07763396651561683  - Eval loss: 0.4526403410264821
5/10
Train loss: 0.05129751736142983  - Eval loss: 0.4831337952689662
6/10
Train loss: 0.03556253355287481  - Eval loss: 0.5117092313755873
7/10
Train loss: 0.030704230119202595  - Eval loss: 0.520859638645359
8/10
Train loss: 0.026979097493070488  - Eval loss: 0.6313486301928101
9/10
Train loss: 0.02181126799934039  - Eval loss: 0.7139804110699212
10/10
Train loss: 0.017753180345898727  - Eval loss: 0.5914129651488716
Best epoch: 1 - Validation loss: 0.2919697820778108 [Run: 4]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.2842998511882292, [0.35442, 0.11825, 0.38023], 0.8012670897786336)
Test Macros F1: ['pe', 'we', 'abam']: [0.8402657884720225, 0.8008840707041269, 0.7059588656607613] [Run: 4]
(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.4126442242413759  - Eval loss: 0.26999193740387756
2/10
Train loss: 0.2403358727445205  - Eval loss: 0.28232528150288594
3/10
Train loss: 0.13568408231871823  - Eval loss: 0.3332904183642111
4/10
Train loss: 0.08340209594442664  - Eval loss: 0.4483463304422912
5/10
Train loss: 0.055148831298574806  - Eval loss: 0.5385327821333905
6/10
Train loss: 0.03973786185165712  - Eval loss: 0.5714738277407984
7/10
Train loss: 0.035112587004162685  - Eval loss: 0.5215531334365046
8/10
Train loss: 0.028750015837722458  - Eval loss: 0.4804880033956983
9/10
Train loss: 0.02442344332506764  - Eval loss: 0.6234064837110256
10/10
Train loss: 0.02301453563491426  - Eval loss: 0.6787409901606023
Best epoch: 0 - Validation loss: 0.26999193740387756 [Run: 5]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.2833687048405409, [0.34157, 0.17038, 0.33816], 0.7844063571698139)
Test Macros F1: ['pe', 'we', 'abam']: [0.8456047363174249, 0.7490021245562289, 0.7387697127109744] [Run: 5]
(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.39561032368491095  - Eval loss: 0.4180910562475522
2/10
Train loss: 0.23978991178795694  - Eval loss: 0.35811405152910286
3/10
Train loss: 0.13343911397581298  - Eval loss: 0.6073508027734028
4/10
Train loss: 0.08411357692598055  - Eval loss: 0.46293182068297434
5/10
Train loss: 0.057800272167660295  - Eval loss: 0.49612084962023395
6/10
Train loss: 0.04407473668882934  - Eval loss: 0.5103008135014938
7/10
Train loss: 0.033699138983502054  - Eval loss: 0.6065571972802799
8/10
Train loss: 0.03328276328276843  - Eval loss: 0.6152946677401714
9/10
Train loss: 0.024222516080966063  - Eval loss: 0.7477710435632616
10/10
Train loss: 0.025116460831074317  - Eval loss: 0.6937936562212094
Best epoch: 1 - Validation loss: 0.35811405152910286 [Run: 6]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.3138275498317348, [0.37352, 0.14986, 0.4181], 0.7988352390598198)
Test Macros F1: ['pe', 'we', 'abam']: [0.8198051795939513, 0.7836124743884357, 0.7421307257596453] [Run: 6]
(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.4203545810189098  - Eval loss: 0.6169564652567109
2/10
Train loss: 0.24906374450773  - Eval loss: 0.331402131731415
3/10
Train loss: 0.15008310082290943  - Eval loss: 0.35586127667273915
4/10
Train loss: 0.08898429700483879  - Eval loss: 0.37865799298096037
5/10
Train loss: 0.05940392278212433  - Eval loss: 0.45028563324982923
6/10
Train loss: 0.040962743694350746  - Eval loss: 0.5612716351428794
7/10
Train loss: 0.037578401261853286  - Eval loss: 0.47605513483964995
8/10
Train loss: 0.0286256238851153  - Eval loss: 0.5431915904987868
9/10
Train loss: 0.024485124061757232  - Eval loss: 0.5593932690822435
10/10
Train loss: 0.019492080453637754  - Eval loss: 0.7818419095511773
Best epoch: 1 - Validation loss: 0.331402131731415 [Run: 7]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.2651485903447287, [0.33923, 0.12155, 0.33466], 0.8012124087034955)
Test Macros F1: ['pe', 'we', 'abam']: [0.8442200966481161, 0.7611305571773176, 0.7575510953926896] [Run: 7]
(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.3774295088897149  - Eval loss: 0.3393050142460399
2/10
Train loss: 0.2393581862250964  - Eval loss: 0.33834828798555666
3/10
Train loss: 0.13528249049559235  - Eval loss: 0.3408630297312306
4/10
Train loss: 0.07906326825652893  - Eval loss: 0.48692731175429393
5/10
Train loss: 0.05427361006693294  - Eval loss: 0.4854768298876782
6/10
Train loss: 0.040277806699159556  - Eval loss: 0.6464886435343133
7/10
Train loss: 0.03548189205709302  - Eval loss: 0.5739856914717367
8/10
Train loss: 0.029478352855270108  - Eval loss: 0.6079479379792853
9/10
Train loss: 0.028138346525180775  - Eval loss: 0.6537551864506289
10/10
Train loss: 0.024350046224426478  - Eval loss: 0.6062682763594138
Best epoch: 1 - Validation loss: 0.33834828798555666 [Run: 8]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.3235190527824064, [0.40255, 0.09898, 0.46902], 0.7707435426116559)
Test Macros F1: ['pe', 'we', 'abam']: [0.8085872118126725, 0.8450052710175969, 0.6847134101597928] [Run: 8]
(3170, 4) (353, 4)
(3170, 4) (353, 4)
(3170, 5) (353, 5)
100 100 100
12 12 12
300 36


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

1/10
Train loss: 0.42461799384405213  - Eval loss: 0.5602964444292916
2/10
Train loss: 0.26658609326928856  - Eval loss: 0.4199461299253421
3/10
Train loss: 0.15629082397557795  - Eval loss: 0.4649106189349873
4/10
Train loss: 0.09544061477819923  - Eval loss: 0.5997527891563045
5/10
Train loss: 0.06366249979007989  - Eval loss: 0.5229512815664444
6/10
Train loss: 0.04813286109196876  - Eval loss: 0.5636011402166332
7/10
Train loss: 0.04002485228552056  - Eval loss: 0.5906209253007546
8/10
Train loss: 0.032284249031993874  - Eval loss: 0.6135444167173572
9/10
Train loss: 0.02568569414220595  - Eval loss: 0.5857987442844509
10/10
Train loss: 0.026567676110280446  - Eval loss: 0.6454030108822432
Best epoch: 1 - Validation loss: 0.4199461299253421 [Run: 9]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(0.44076107292332584, [0.54711, 0.12304, 0.65214], 0.7843450690938529)
Test Macros F1: ['pe', 'we', 'abam']: [0.8256970205277967, 0.8557962753536628, 0.7318342051312775] [Run: 9]
Total time: 310.0 minutes.


In [8]:
train_file_name = f'train-info-mtl-{weighting_strategy}-{model_name}.csv'
pd.DataFrame(training_info, columns = ['run', 'seed', 'weighting', 'epoch', 'train_loss', 'eval_loss', 'eval_f1']).to_csv(train_file_name, index = False)

test_file_name = f'test-info-mtl-{weighting_strategy}-{model_name}.csv'
pd.DataFrame(testing_results, columns = ['run', 'test', 'weighting', 'sequences', 'accuracy', 'macro-f1', 'O-f1', 'B-f1', 'I-f1', 'crf_number']).to_csv(test_file_name, index = False)

In [18]:
avg_cost, lambda_weight

(array([[0.77332, 0.7129 , 1.13553],
        [0.57892, 0.30982, 0.98344]], dtype=float32),
 array([[1., 1.],
        [1., 1.],
        [1., 1.]]))

In [9]:
import os
import zipfile

def zip_files(folder_path, zip_name):
    # Crear un archivo ZIP
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Recorrer todos los archivos en la carpeta
        for foldername, subfolders, filenames in os.walk(folder_path):
            for filename in filenames:
                # Comprobar si el archivo es un archivo TXT o CSV
                if filename.endswith('.txt') or filename.endswith('.csv') or filename.endswith('.pt'):
                    # Ruta completa del archivo
                    file_path = os.path.join(foldername, filename)
                    # Agregar el archivo al archivo ZIP
                    zipf.write(file_path, os.path.relpath(file_path, folder_path))

# Llamar a la función para comprimir los archivos
folder_path = '/kaggle/working/'
zip_name = 'archivos2.zip'
zip_files(folder_path, zip_name)