In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
import json
import copy
import random
import time
import torch
from torch import nn, cuda, optim
from torch.utils.data import DataLoader
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
    # Hand unused cached GPU memory back before any model is created.
    torch.cuda.empty_cache()
else:
    device = 'cpu'
print(device)

try:
    from torchcrf import CRF
except:
    !pip install pytorch-crf
    from torchcrf import CRF

from transformers import (
    BertModel,
    BertForTokenClassification,
    BertTokenizerFast,
    AutoTokenizer,
    AutoModelForTokenClassification,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    precision_recall_fscore_support,
    classification_report
)

import os
import warnings

# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Silence every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

def generate_random_seed():
    """Draw a seed value between 1 and 1000 (both inclusive) from Python's `random` module."""
    lowest, highest = 1, 1000
    seed = random.randint(lowest, highest)
    return seed

def set_random_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: integer passed unchanged to every seeding function.
    """
    for seed_function in (random.seed, np.random.seed, torch.manual_seed):
        seed_function(seed)
    # The CUDA generators are seeded explicitly whenever a GPU is available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

cpu
Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2
[0m



/kaggle/input/unit-segmentation-lstm-transformers/we.csv
/kaggle/input/unit-segmentation-lstm-transformers/mix1.csv
/kaggle/input/unit-segmentation-lstm-transformers/pe.csv
/kaggle/input/unit-segmentation-lstm-transformers/abam.csv
/kaggle/input/unit-segmentation-lstm-transformers/mix2.csv
/kaggle/input/unit-segmentation-lstm-transformers/ug.csv


In [2]:
# Experiment settings; the run cell copies these into module-level names.
configuration = dict(
    train='mix1',                             # CSV (without extension) providing the train/dev rows
    test=['pe', 'we', 'abam', 'ug', 'mix1'],  # CSVs whose 'test' rows are evaluated after each run
    runs=10,                                  # independent training runs, each with its own seed
    epochs=10,                                # epochs per run
    train_batch_size=32,
    dev_batch_size=32,
    test_batch_size=32,
    label_list=['O', 'B', 'I'],               # list position = integer label id
    model_checkpoint='bert-base-uncased',
    crf=False,                                # True selects TaggerWithCRF instead of SimpleTagger
    lr=1e-4,                                  # AdamW learning rate
)

In [3]:
""" Tokenize examples in batch
Since the tokenizer may divide each token into two or more subtokens, we must align the new tokens with the original labels.
The first subtoken of a word keeps the word's label; each following subtoken gets 'O' if the preceding subtoken was labelled 'O', otherwise 'I'.
Complementary tokens, such as PAD, SEP and CLS, are also labelled 'O' (they are not set to -100).
When a mapping is given, labels are converted to integers: 0, 1 or 2 for O, B and I with the label list ['O', 'B', 'I'].
Since no position is labelled -100, nothing in these labels makes a loss function skip the complementary tokens.
"""
def tokenize_and_align_labels(txts, lbls, tokenizer, max_len = 128, mapping = None):
    """Tokenize pre-split texts and build one label per produced (sub)token.

    Args:
        txts: batch of examples, each a list of words (passed with
            `is_split_into_words=True`).
        lbls: one label sequence per example, indexed by word position.
        tokenizer: callable whose result exposes `word_ids(batch_index=...)`.
        max_len: every example is padded or truncated to this many tokens.
        mapping: optional list of label strings; when given, each label is
            replaced by its position in this list.

    Returns:
        A `(tokenized_inputs, labels)` tuple, where `labels` holds one list
        per example with one entry per token position.

    Note:
        Positions without a word id (special and padding tokens) are labelled
        'O', not -100.
    """
    tokenized_inputs = tokenizer(
        txts,
        is_split_into_words=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    labels = []
    for example_index, word_labels in enumerate(lbls):
        aligned = []
        last_word = None
        last_tag = None
        for word in tokenized_inputs.word_ids(batch_index=example_index):
            if word is None:
                # No source word behind this position (special/padding token).
                aligned.append('O')
            elif word != last_word:
                # First subtoken of a word: take the word's own label.
                last_tag = word_labels[word]
                aligned.append(last_tag)
            else:
                # Continuation subtoken: stay 'O' after an 'O', otherwise use
                # 'I' plus whatever follows the first character of the word's label.
                last_tag = 'O' if last_tag == 'O' else 'I' + word_labels[word][1:]
                aligned.append(last_tag)
            last_word = word
        labels.append(aligned)

    if mapping is not None:
        labels = [[mapping.index(tag) for tag in sequence] for sequence in labels]

    return tokenized_inputs, labels

"""
Return text tokens from the tokenizer given the numeric input ids
"""
def get_tokens_from_ids(input_ids):
    """Convert each sequence of token ids to the corresponding token strings.

    Relies on a module-level `tokenizer` being defined before the call.

    Args:
        input_ids: iterable of id sequences.

    Returns:
        A list with one list of tokens per input sequence.
    """
    token_lists = []
    for id_sequence in input_ids:
        token_lists.append(tokenizer.convert_ids_to_tokens(id_sequence))
    return token_lists

"""
Remove part of predicted sequences corresponding to padding tokens.
"""
def remove_padding_from_predictions(predictions, batch_attention_mask):
    """Keep only the entries at attended positions, minus the first and last one.

    Args:
        predictions: one sequence of values per example.
        batch_attention_mask: one mask sequence per example; positions whose
            mask equals 1 are kept.

    Returns:
        A list with one trimmed list per example. The first and last kept
        entries are dropped (presumably the [CLS]/[SEP] positions; confirm
        against the tokenizer in use).
    """
    trimmed = []
    for sequence, mask_row in zip(predictions, batch_attention_mask):
        kept = []
        for value, flag in zip(sequence, mask_row):
            if flag == 1:
                kept.append(value)
        trimmed.append(kept[1:-1])
    return trimmed

def remove_padding_and_get_tokens(batch_ids, batch_attention_mask):
    """Strip masked-out positions from each id sequence and return the token strings.

    Args:
        batch_ids: one sequence of token ids per example.
        batch_attention_mask: one mask sequence per example; ids whose mask
            equals 1 are kept.

    Returns:
        The result of `get_tokens_from_ids` on the kept ids, after dropping
        the first and last kept id of every example.
    """
    unpadded_ids = [
        [token_id for token_id, flag in zip(id_row, mask_row) if flag == 1][1:-1]
        for id_row, mask_row in zip(batch_ids, batch_attention_mask)
    ]
    return get_tokens_from_ids(unpadded_ids)

"""
Maps sequences of integer to sequences of BIO tags
"""
def integer_to_bio(labels, mapping):
    """Translate sequences of integer labels to the tags stored in `mapping`.

    Args:
        labels: iterable of sequences whose items can be converted with `int()`.
        mapping: sequence of tags indexed by integer label.

    Returns:
        A list of tag lists with the same nesting as `labels`.
    """
    tagged = []
    for sequence in labels:
        tagged.append([mapping[int(index)] for index in sequence])
    return tagged

"""
Transforms list of predicted sequences to a flat list of labels.
"""
def flatten_predictions(labels):
    """Concatenate a list of sequences into one flat list, preserving order."""
    flat = []
    for sequence in labels:
        flat.extend(sequence)
    return flat

"""
Generates txt file with tokens, labels and predictions, in the style of Flair NLP.
"""
def generate_results_txt(tokens, labels, predictions, output_file_name):
    """Write one "token label prediction" line per token, with a blank line after each sequence.

    Args:
        tokens: one list of tokens per sequence.
        labels: one list of gold labels per sequence.
        predictions: one list of predicted labels per sequence.
        output_file_name: path of the UTF-8 text file to create or overwrite.
    """
    with open(output_file_name, 'w', encoding = 'utf-8') as results_file:
        for sequence in zip(tokens, labels, predictions):
            rows = [f"{token} {gold} {predicted}\n" for token, gold, predicted in zip(*sequence)]
            results_file.writelines(rows)
            # Blank line separates consecutive sequences.
            results_file.write("\n")

"""
Dataset class for sequence labeling
"""
class SequenceLabelingDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes a dataframe of texts and serves aligned label sequences.

    Args:
        df: dataframe with a `tokens` and a `labels` column, each holding
            whitespace-separated strings.
        tokenizer: tokenizer forwarded to `tokenize_and_align_labels`.
        label_list: label strings; a label's position in the list is its integer id.
        max_len: length every example is padded/truncated to (default 128,
            the value that used to be hard-coded).
    """
    def __init__(self, df, tokenizer, label_list, max_len = 128):
        label_sequences = [tags.split() for tags in df.labels.values.tolist()]
        token_sequences = [text.split() for text in df.tokens.values.tolist()]
        self.encodings, self.labels = tokenize_and_align_labels(token_sequences,
                                                                label_sequences,
                                                                tokenizer,
                                                                max_len = max_len,
                                                                mapping = label_list)

    @staticmethod
    def _as_new_tensor(value):
        """Return an independent tensor copy of `value`.

        `torch.tensor(existing_tensor)` emits a copy-construct UserWarning, so
        tensors are copied with `clone().detach()`; anything else is converted.
        """
        if torch.is_tensor(value):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field plus `labels` for example `idx`."""
        # Key order follows the tokenizer output, with 'labels' appended last.
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples."""
        return len(self.labels)
    
    
"""
Model for sequence labeling
"""
class SimpleTagger(nn.Module):
    """Token-classification model that returns predicted label ids instead of logits.

    Args:
        model_checkpoint: checkpoint name or path given to
            `AutoModelForTokenClassification.from_pretrained`.
        num_labels: number of output classes per token.
    """
    def __init__(self, model_checkpoint, num_labels = 3):
        super().__init__()
        self.model_checkpoint = model_checkpoint
        self.num_labels = num_labels
        self.transf = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels = num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids = None, labels = None):
        """Run the wrapped model and take the arg-max class at every position.

        Args:
            input_ids: token id tensor.
            attention_mask: attention mask tensor.
            token_type_ids: optional; forwarded only when not None.
            labels: optional; when given the wrapped model also computes a loss.

        Returns:
            `(loss, predictions)` when `labels` is given, otherwise just
            `predictions`. `predictions` is a nested Python list of class ids.
        """
        # Optional arguments are left out entirely when absent.
        model_inputs = {'input_ids': input_ids}
        if token_type_ids is not None:
            model_inputs['token_type_ids'] = token_type_ids
        model_inputs['attention_mask'] = attention_mask
        if labels is not None:
            model_inputs['labels'] = labels

        outputs = self.transf(**model_inputs)
        # Arg-max over the last (class) axis of the 3-D logits array.
        predictions = np.argmax(outputs.logits.detach().cpu().numpy(), axis = 2).tolist()

        if labels is None: # inference
            return predictions
        return outputs.loss, predictions # training

# """
# Model for sequence labeling - transformer token classifier (any checkpoint) + CRF
# """
class TaggerWithCRF(nn.Module):
    """Token-classification model whose logits are scored and decoded by a CRF layer.

    Args:
        model_checkpoint: checkpoint name or path given to
            `AutoModelForTokenClassification.from_pretrained`.
        num_labels: number of tags handled by the classifier and the CRF.
    """
    def __init__(self, model_checkpoint, num_labels = 3):
        super().__init__()
        self.model_checkpoint = model_checkpoint
        self.num_labels = num_labels
        self.transf = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels = num_labels)
        self.crf = CRF(num_labels, batch_first = True)

    def forward(self, input_ids, attention_mask, token_type_ids = None, labels = None):
        """Compute emissions with the transformer and decode them with the CRF.

        Args:
            input_ids: token id tensor.
            attention_mask: attention mask tensor; also masks the CRF loss.
            token_type_ids: optional; forwarded only when not None.
            labels: optional gold tag tensor.

        Returns:
            `(loss, decoded)` when `labels` is given, otherwise just `decoded`,
            where `decoded` is whatever `CRF.decode` returns for the emissions.
        """
        encoder_inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if token_type_ids is not None:
            encoder_inputs['token_type_ids'] = token_type_ids
        emissions = self.transf(**encoder_inputs).logits

        if labels is None: # inference
            return self.crf.decode(emissions)

        # training: the CRF call result is negated to obtain a loss to minimise.
        # NOTE(review): the loss is masked with attention_mask but decode() is
        # called without a mask - confirm that decoding padded positions is intended.
        log_likelihood = self.crf(emissions, labels, mask = attention_mask.byte(), reduction = 'token_mean')
        return -log_likelihood, self.crf.decode(emissions)

In [4]:
def load_training_data(df_name, TRAIN_BATCH_SIZE = 32, DEV_BATCH_SIZE = 32):
    """Build the training and validation dataloaders for one dataset.

    Reads the module-level `model_checkpoint` and `label_list`.

    Args:
        df_name: CSV file name (without extension) inside the Kaggle input folder.
        TRAIN_BATCH_SIZE: batch size of the training loader.
        DEV_BATCH_SIZE: batch size of the validation loader.

    Returns:
        `(train_loader, val_loader, tokenizer)`.
    """
    # TOKENIZER
    tokenizer_seq = AutoTokenizer.from_pretrained(model_checkpoint)

    # SEQUENCE LABELING DATASET
    df = pd.read_csv(f'/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv')

    train_seq_df = df.loc[df['set'] == 'train']
    if (df['set'] == 'dev').any():
        # The file ships its own validation rows.
        dev_seq_df = df.loc[df['set'] == 'dev']
    else:
        # No validation rows: hold out 10% of the training rows with a fixed split seed.
        train_seq_df, dev_seq_df = train_test_split(train_seq_df, test_size = 0.1, random_state = 2023)

    # Shuffle the rows of both frames.
    train_seq_df = train_seq_df.sample(frac = 1)
    dev_seq_df = dev_seq_df.sample(frac = 1)

    print(train_seq_df.shape, dev_seq_df.shape)

    # PYTORCH DATASETS
    train_dataset = SequenceLabelingDataset(train_seq_df, tokenizer_seq, label_list)
    val_dataset = SequenceLabelingDataset(dev_seq_df, tokenizer_seq, label_list)

    # DATALOADERS
    loaders = (
        DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True),
        DataLoader(val_dataset, batch_size=DEV_BATCH_SIZE, shuffle=True),
    )
    return loaders[0], loaders[1], tokenizer_seq
    
def load_testing_data(df_name, tokenizer, TEST_BATCH_SIZE = 32):
    """Build the dataloader over the 'test' rows of one dataset.

    Reads the module-level `label_list`.

    Args:
        df_name: CSV file name (without extension) inside the Kaggle input folder.
        tokenizer: tokenizer used to encode the test sequences.
        TEST_BATCH_SIZE: batch size of the returned loader.

    Returns:
        A `DataLoader` that iterates the test examples in file order.
    """
    csv_path = f'/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv'
    all_rows = pd.read_csv(csv_path)
    is_test_row = all_rows['set'] == 'test'

    test_dataset = SequenceLabelingDataset(all_rows.loc[is_test_row], tokenizer, label_list)
    return DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False)

In [5]:
def train_model(model, train_loader, optimizer):
    """Train `model` for one pass over `train_loader`.

    Batch fields are read by key, and `token_type_ids` is forwarded whenever
    the batch contains it. The previous version unpacked the batch
    positionally and chose the arity from the checkpoint name, which failed
    for checkpoints not named "bert*" whose tokenizer emits `token_type_ids`.

    Reads the module-level `device`.

    Args:
        model: callable returning `(loss, outputs)` when given labels.
        train_loader: iterable of dict batches with at least `input_ids`,
            `attention_mask` and `labels`; must support `len()`.
        optimizer: optimizer over the model parameters.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()

    train_loss = 0
    for batch in train_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward token_type_ids only when the tokenizer produced them.
        optional_inputs = {}
        if 'token_type_ids' in batch:
            optional_inputs['token_type_ids'] = batch['token_type_ids']

        loss, _ = model(batch['input_ids'],
                        attention_mask = batch['attention_mask'],
                        labels = batch['labels'],
                        **optional_inputs)

        train_loss += loss.item()

        # backprop
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)

        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)
    return avg_train_loss

def evaluate_model(model, dataloader):
    """Compute the mean loss and the macro F1 of `model` over `dataloader`.

    Batch fields are read by key, and `token_type_ids` is forwarded whenever
    the batch contains it, instead of relying on the checkpoint name and the
    positional order of the batch dict.

    Reads the module-level `device`.

    Args:
        model: callable returning `(loss, predictions)` when given labels.
        dataloader: iterable of dict batches with at least `input_ids`,
            `attention_mask` and `labels`; must support `len()`.

    Returns:
        `(mean loss per batch, macro F1 over all kept token positions)`.
    """
    model.eval()

    eval_loss = 0
    eval_labels, eval_predictions = [], []

    with torch.no_grad():
        for batch in dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}

            # Forward token_type_ids only when the tokenizer produced them.
            optional_inputs = {}
            if 'token_type_ids' in batch:
                optional_inputs['token_type_ids'] = batch['token_type_ids']

            loss, outputs = model(batch['input_ids'],
                                  attention_mask = batch['attention_mask'],
                                  labels = batch['labels'],
                                  **optional_inputs)

            eval_loss += loss.item()

            # Score only attended positions, without the first and last one.
            attention_mask = batch['attention_mask'].detach().cpu().numpy()
            eval_labels += remove_padding_from_predictions(batch['labels'].detach().cpu().numpy(), attention_mask)
            eval_predictions += remove_padding_from_predictions(outputs, attention_mask)

    flattened_labels = flatten_predictions(eval_labels)
    flattened_predictions = flatten_predictions(eval_predictions)

    eval_f1 = f1_score(flattened_labels, flattened_predictions, average = 'macro')
    return eval_loss / len(dataloader), eval_f1


def test_model(model, dataloader, write_file = False, path_file = None):

    model.eval()

    eval_tokens, eval_labels, eval_predictions = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(v.to(device) for t, v in batch.items())
            loss, outputs = None, None
            
            if model_checkpoint.startswith('bert'):
                batch_input_ids, batch_token_type_ids, batch_attention_mask, batch_labels = batch
                _, outputs = model(batch_input_ids, 
                                      token_type_ids = batch_token_type_ids,
                                      attention_mask = batch_attention_mask, 
                                      labels = batch_labels)
            else:
                batch_input_ids, batch_attention_mask, batch_labels = batch
                _, outputs = model(batch_input_ids, 
                                      attention_mask = batch_attention_mask, 
                                      labels = batch_labels)

            valid_labels = remove_padding_from_predictions(batch_labels.detach().cpu().numpy(), batch_attention_mask.detach().cpu().numpy())
            eval_labels += valid_labels
            
            valid_predictions = remove_padding_from_predictions(outputs, batch_attention_mask.detach().cpu().numpy())
            eval_predictions += valid_predictions
            
            valid_tokens = remove_padding_and_get_tokens(batch_input_ids.detach().cpu().numpy(), 
                                                         batch_attention_mask.detach().cpu().numpy())
            eval_tokens += valid_tokens
            
    if write_file:
        generate_results_txt(eval_tokens, eval_labels, eval_predictions, path_file)
    
    return eval_labels, eval_predictions

In [6]:
# MODEL CONFIGURATION
TAGGER_USING_CRF = configuration['crf']
model_checkpoint = configuration['model_checkpoint']
# Short tag used in output file names: the checkpoint name up to the first '-'.
model_name = model_checkpoint.split('-')[0]
if TAGGER_USING_CRF:
    model_name += '-crf'
label_list = configuration['label_list']
num_labels = len(label_list)
# Integer ids matching label_list positions. Passing them to classification_report
# keeps the report aligned with target_names even if a class is absent from a split.
report_label_ids = list(range(num_labels))

# TRAINING CONFIGURATION
RUNS = configuration['runs']
EPOCHS = configuration['epochs']
TRAIN_BATCH_SIZE = configuration['train_batch_size']
TEST_BATCH_SIZE = configuration['test_batch_size']
DEV_BATCH_SIZE = configuration['dev_batch_size']
train_df = configuration['train']
test_dfs = configuration['test']

# MISCELLANEOUS
SAVE_INFORMATION = True
SAVE_BEST_MODEL = True

# ======================================== #
# Rows collected across all runs; they are written to CSV after the last run.
training_info, testing_info, models_info = [], [], []
start_time = time.time()
for nrun in range(RUNS):
    # Initialize: every run draws and applies its own seed.
    rs = generate_random_seed()
    set_random_seed(rs)
    
    best_eval_loss = float('inf')
    best_epoch = 0
    best_model_state = None
    
    # Dataloaders
    train_loader, val_loader, tokenizer = load_training_data(train_df, TRAIN_BATCH_SIZE=TRAIN_BATCH_SIZE, DEV_BATCH_SIZE=DEV_BATCH_SIZE)
    
    # Create model
    tagger = None
    if TAGGER_USING_CRF:
        tagger = TaggerWithCRF(model_checkpoint, num_labels = num_labels)
    else:
        tagger = SimpleTagger(model_checkpoint, num_labels = num_labels)

    tagger.to(device)
    
    # OPTIMIZER
    optimizer = torch.optim.AdamW(tagger.parameters(), lr = configuration['lr'], eps = 1e-8)
    
    # Training loop
    for epoch in range(EPOCHS):
        print(f"{epoch+1}/{EPOCHS}")
        # train one epoch
        train_loss = train_model(tagger, train_loader, optimizer)
        
        # evaluate model
        eval_loss, eval_f1 = evaluate_model(tagger, val_loader)
        
        training_info.append((nrun, rs, epoch, train_loss, eval_loss, eval_f1)) # nrun, seed, epoch, train_loss, eval_loss, eval_f1
        
        # save best model based on validation loss
        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            best_epoch = epoch
            # Deep copy so later optimizer steps do not alter the snapshot.
            best_model_state = copy.deepcopy(tagger.state_dict())
            
    print(f"Best epoch: {best_epoch} - Validation loss: {best_eval_loss} [Run: {nrun}]")
    
    # Testing
    # Loading best model: a fresh instance receives the best snapshot's weights.
    best_tagger = None
    if TAGGER_USING_CRF: 
        best_tagger = TaggerWithCRF(model_checkpoint, num_labels = num_labels)
    else:
        best_tagger = SimpleTagger(model_checkpoint, num_labels = num_labels)
    
    best_tagger.load_state_dict(best_model_state)
    best_tagger.to(device)
    eval_results = evaluate_model(best_tagger, val_loader)
    print(eval_results) # check model
    
    macros = []
    for test_df in test_dfs:
        test_loader = load_testing_data(test_df, tokenizer, TEST_BATCH_SIZE = TEST_BATCH_SIZE)
        path_test_results_file = f'results-{train_df}-{test_df}-{model_name}-{nrun}.txt'
        tlabels, tpredictions = test_model(best_tagger, test_loader, write_file = True, path_file = path_test_results_file)
        
        flattened_labels = flatten_predictions(tlabels)
        flattened_predictions = flatten_predictions(tpredictions)
        report_info = classification_report(flattened_labels, flattened_predictions,
                                            labels = report_label_ids, target_names = label_list,
                                            output_dict = True)
        accuracy, macro_f1 = report_info['accuracy'], report_info['macro avg']['f1-score']
        o_f1, b_f1, i_f1 =  report_info['O']['f1-score'], report_info['B']['f1-score'], report_info['I']['f1-score']
        macros.append(macro_f1)
    
        testing_info.append((nrun, train_df, test_df, len(tpredictions), accuracy, macro_f1, o_f1, b_f1, i_f1)) # nrun, train, test, sequences, acc, macrof1, Of1, Bf1, If1
    
    print(f"Test Macros F1: {test_dfs}: {macros} [Run: {nrun}]")
    
    if SAVE_BEST_MODEL:
        model_path = f"model-{train_df}-{model_name}-{nrun}.pt"
        models_info.append((nrun, model_path))
        if best_model_state is not None:
            torch.save(best_model_state, model_path)
        else:
            torch.save(tagger.state_dict(), model_path)
    
# Save data    
if SAVE_INFORMATION:
    models_file_name = f"models-{train_df}-{model_name}.csv"
    pd.DataFrame(models_info, columns = ['run', 'model_file']).to_csv(models_file_name, index = False)
    
    train_file_name = f'train-info-{train_df}-{model_name}.csv'
    pd.DataFrame(training_info, columns = ['run', 'seed', 'epoch', 'train_loss', 'eval_loss', 'eval_f1']).to_csv(train_file_name, index = False)
    
    test_file_name = f'test-info-{train_df}-{model_name}.csv'
    pd.DataFrame(testing_info, columns = ['run', 'train', 'test', 'sequences', 'accuracy', 'macro-f1', 'O-f1', 'B-f1', 'I-f1']).to_csv(test_file_name, index = False)

print(f"Total time: {((time.time() - start_time)//60)+1} minutes.")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(9512, 5) (1057, 5)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 0 - Validation loss: 0.06624407608829 [Run: 0]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.06667538617244538, 0.791605593443887)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.8179544440324714, 0.7955431848724809, 0.7506768717946691, 0.34389489363066567, 0.7876492790919171] [Run: 0]
(9512, 5) (1057, 5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 1 - Validation loss: 0.0671441684518119 [Run: 1]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.06743207157534711, 0.8043540086471728)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.8357913948516447, 0.8004074768218441, 0.7686506430233013, 0.37216013151465904, 0.8124080967390258] [Run: 1]
(9512, 5) (1057, 5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 0 - Validation loss: 0.07001978279474903 [Run: 2]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.065122339874506, 0.7947726847853165)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.8355752274687965, 0.8153447154136938, 0.7534128344993952, 0.33830641202637196, 0.803676401016991] [Run: 2]
(9512, 5) (1057, 5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 0 - Validation loss: 0.06663684667471577 [Run: 3]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.07034964026773677, 0.7850084825608432)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.8330925798398096, 0.7608256884194621, 0.761029830542741, 0.37105860686517156, 0.7936323186651065] [Run: 3]
(9512, 5) (1057, 5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 0 - Validation loss: 0.0648506529190961 [Run: 4]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.06423328652539674, 0.7851011516323486)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.8188088512650502, 0.8230614326334083, 0.7529412843277393, 0.3206436354020677, 0.7984110237927325] [Run: 4]
(9512, 5) (1057, 5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 1 - Validation loss: 0.067900034112801 [Run: 5]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.06842481821556301, 0.8034078391103395)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.8285143673724806, 0.8446692433119529, 0.7515518579606085, 0.34836321517216357, 0.8097579002964747] [Run: 5]
(9512, 5) (1057, 5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 0 - Validation loss: 0.07142499658991308 [Run: 6]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.06869065016508102, 0.772843745856278)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.808666020281927, 0.8392581593855785, 0.7277282250001882, 0.35926670097949387, 0.7875332760319315] [Run: 6]
(9512, 5) (1057, 5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 0 - Validation loss: 0.06650329151136034 [Run: 7]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.0660928628195132, 0.7918143867658599)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.8458001827358, 0.7705357760879418, 0.7662125023014617, 0.3690619158147617, 0.804535010467656] [Run: 7]
(9512, 5) (1057, 5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 0 - Validation loss: 0.07464511550086386 [Run: 8]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.06665432212107322, 0.7834977438737813)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.8264350889591148, 0.7653256641748918, 0.7446498261402393, 0.3752323415483974, 0.7888035214366732] [Run: 8]
(9512, 5) (1057, 5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
Best epoch: 1 - Validation loss: 0.06907985107082983 [Run: 9]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

(0.06910185283049941, 0.7956322222417445)
Test Macros F1: ['pe', 'we', 'abam', 'ug', 'mix1']: [0.8378480651606516, 0.8544485903899175, 0.7359409612234321, 0.3464948903620755, 0.8100520780848605] [Run: 9]
Total time: 198.0 minutes.


In [7]:
import os
import zipfile

def zip_files(folder_path, zip_name, extensions=('.txt', '.csv', '.pt')):
    """Compress selected files found under ``folder_path`` into a ZIP archive.

    The directory tree rooted at ``folder_path`` is walked recursively and every
    file whose name ends with one of ``extensions`` is stored in the archive
    under its path relative to ``folder_path``.

    Args:
        folder_path: Root directory to scan.
        zip_name: Path of the archive to create (an existing file is overwritten).
        extensions: A suffix string or an iterable of suffix strings to include.
            Matching is case-sensitive. Defaults to ``('.txt', '.csv', '.pt')``.

    Returns:
        None. The archive is written to ``zip_name`` as a side effect.
    """
    # str.endswith accepts a tuple of suffixes; normalise a lone string or any
    # other iterable into that form once, outside the loops.
    suffixes = (extensions,) if isinstance(extensions, str) else tuple(extensions)

    # Resolve the output location up front so it can be recognised during the walk.
    archive_path = os.path.realpath(zip_name)

    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for foldername, _subfolders, filenames in os.walk(folder_path):
            for filename in filenames:
                if not filename.endswith(suffixes):
                    continue
                file_path = os.path.join(foldername, filename)
                # The archive may live inside the folder being scanned; never
                # add the (still open) archive to itself.
                if os.path.realpath(file_path) == archive_path:
                    continue
                # Store paths relative to the scanned root so the archive does
                # not embed the absolute directory layout.
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

# Bundle the matching files under the Kaggle working directory into a single archive.
folder_path, zip_name = '/kaggle/working/', 'archivos2.zip'
zip_files(folder_path=folder_path, zip_name=zip_name)

In [8]:
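# NOTE(review): every line of this cell is commented out, so nothing here executes.
# It reads like a single-run training loop kept for reference; confirm whether it is
# still needed, otherwise delete the cell.
# best_eval_loss = float('inf')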
# best_epoch = 0
# best_model_state = None

# PATH = None

# training_info = []

# # DATALOADERS
# train_loader, val_loader, tokenizer = load_training_data(train_df, TRAIN_BATCH_SIZE=TRAIN_BATCH_SIZE, DEV_BATCH_SIZE=DEV_BATCH_SIZE)

# # TAGGER
# tagger = None
# if TAGGER_USING_CRF:
#     tagger = TaggerWithCRF(model_checkpoint, num_labels = num_labels)
# else:
#     tagger = SimpleTagger(model_checkpoint, num_labels = num_labels)
    
# tagger.to(device)

# # OPTIMIZER
# optimizer = torch.optim.AdamW(tagger.parameters(), lr = 1e-4, eps = 1e-8)

# # TRAINING LOOP
# for epoch in range(EPOCHS):
    
#     # train one epoch
#     train_loss = train_model(tagger, train_loader, optimizer)

#     # evaluate model
#     eval_results = evaluate_model(tagger, val_loader)

#     info_dict = {'epoch': epoch, 'train_loss': train_loss, 'eval_loss': eval_results[0], 'eval_f1': eval_results[1]}
    
#     training_info.append(tuple(info_dict.values()))

#     # save best model based on validation loss
#     eval_loss = eval_results[0]
#     if eval_loss < best_eval_loss:
#         best_eval_loss = eval_loss
#         best_epoch = epoch
#         best_model_state = copy.deepcopy(tagger.state_dict())
            
#     print(info_dict)

# print(best_eval_loss, "in epoch", best_epoch)

# df_cols = ['epoch', 'train_loss', 'eval_loss', 'eval_f1']
# info_df = pd.DataFrame(training_info, columns = df_cols)
# if SAVE_INFORMATION:
#     file_name = f'train-info-{train_df}-{model_name}-crf.csv' if TAGGER_USING_CRF else f'train-info-{train_df}-{model_name}-nocrf.csv'
#     info_df.to_csv(file_name, index = False)

# if SAVE_BEST_MODEL:
#     PATH = f"model-{train_df}-{model_name}-crf.pt" if TAGGER_USING_CRF else f"model-{train_df}-{model_name}-nocrf.pt"
    
#     if best_model_state is not None:
#         torch.save(best_model_state, PATH)
#     else:
#         torch.save(tagger.state_dict(), PATH)

In [9]:
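# NOTE(review): this cell is fully commented out and does not execute. It appears to
# rely on names (PATH, tokenizer, val_loader) assigned in the disabled training cell
# before it; confirm whether it is still needed, otherwise delete the cell.
# # Loading best model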
# if TAGGER_USING_CRF:
#     best_tagger = TaggerWithCRF(model_checkpoint, num_labels = num_labels)
# else:
#     best_tagger = SimpleTagger(model_checkpoint, num_labels = num_labels)

# # best_tagger.load_state_dict(best_model_state)
# # if PATH is not None:
# best_tagger.load_state_dict(torch.load(PATH))
# best_tagger.to(device)

# eval_results = evaluate_model(best_tagger, val_loader)

# print(eval_results)
# print("-"*30)
# print()

# # Evaluation on testing sets.
# testing_info = []

# for test_df in test_dfs:
    
#     run_info = [train_df, test_df]
    
#     print(f"MODEL TRAINED USING {train_df}\nEVALUATED IN {test_df}")
    
#     test_loader = load_testing_data(test_df, tokenizer, TEST_BATCH_SIZE = 32)
    
#     path_results_file = f'results-{train_df}-{test_df}-{model_name}-crf.txt' if TAGGER_USING_CRF else f'results-{train_df}-{test_df}-{model_name}-nocrf.txt'
#     tlabels, tpredictions = test_model(best_tagger, test_loader, write_file = True, path_file = path_results_file)
    
#     print(f"Total sequences in predictions: {len(tpredictions)}")

#     flattened_labels = flatten_predictions(tlabels)
#     flattened_predictions = flatten_predictions(tpredictions)
    
#     report_info = classification_report(flattened_labels, flattened_predictions, target_names = label_list, output_dict = True)
    
#     run_info += [len(tpredictions), report_info['accuracy'], report_info['macro avg']['f1-score']]
    
#     for lb in label_list:
#         run_info.append(report_info[lb]['f1-score'])
    
#     print(classification_report(flattened_labels, flattened_predictions, target_names = label_list))
#     print()
    
#     testing_info.append(run_info)
    
# #     break

# info_df = pd.DataFrame(testing_info, 
#                        columns = ['training', 'testing', 'sequences', 'accuracy', 'macro-f1', 'O-f1', 'b-f1', 'I-f1'])
# if SAVE_INFORMATION:
#     test_file_name = f'test-info-{train_df}-{model_name}-crf.csv' if TAGGER_USING_CRF else f'test-info-{train_df}-{model_name}-nocrf.csv'
#     info_df.to_csv(test_file_name, index = False)