# Fine-Tuning & Evaluating LaBSE with 5-Fold Cross-Validation

##### This notebook was executed in Google Colab

In [None]:
!pip install datasets transformers seqeval -q

[K     |████████████████████████████████| 245kB 26.7MB/s 
[K     |████████████████████████████████| 2.5MB 49.9MB/s 
[K     |████████████████████████████████| 51kB 8.5MB/s 
[K     |████████████████████████████████| 245kB 51.1MB/s 
[K     |████████████████████████████████| 122kB 58.9MB/s 
[K     |████████████████████████████████| 3.3MB 51.7MB/s 
[K     |████████████████████████████████| 901kB 54.0MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
# Mount Google Drive at /content/drive so that files stored on it (such as the
# utils folder that a later cell adds to sys.path) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys

# Make the modules in the Drive-hosted utils folder importable.
# Position 1 leaves the existing first sys.path entry in front.
# NOTE(review): presumably this is where `cross_validation_LaBSE` (imported
# further down) lives — confirm. The path only exists once Drive is mounted.
sys.path.insert(1, '/content/drive/MyDrive/Thesis/utils')

## Training / Fine-tuning Process

In [None]:
# Run-wide settings read by the cells below.
# `task` selects the f"{task}_tags" label column and is part of the per-fold output directory name.
task = "ner"
model_checkpoint = "sentence-transformers/LaBSE" # LaBSE pre-trained from HuggingFace Hub
# Used as both the per-device training and per-device evaluation batch size.
batch_size = 16

### Loading the dataset

In [None]:
from datasets import load_dataset, load_metric, concatenate_datasets

# Fetch CoNLL-2003 and pool its three splits into a single dataset; the
# train/eval partitioning is redone later by the K-fold splitter.
datasets = load_dataset("conll2003")

all_data = concatenate_datasets(
    [datasets[split_name] for split_name in ('train', 'validation', 'test')]
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2603.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1781.0, style=ProgressStyle(description…


Downloading and preparing dataset conll2003/conll2003 (download: 4.63 MiB, generated: 9.78 MiB, post-processed: Unknown size, total: 14.41 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=649539.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=162714.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=145897.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6. Subsequent calls will reuse this data.


In [None]:
# Tag names of the label column, in id order: an integer tag id found in the
# dataset is an index into this list (compute_metrics relies on that).
label_list = all_data.features[f"{task}_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

### Processing the data

In [None]:
from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to the chosen checkpoint.
# NOTE(review): the alignment code below calls `word_ids()` on the encoding,
# which presumably requires a "fast" tokenizer — confirm one is returned here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=804.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5220781.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9621556.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




In [None]:
# When True, every sub-token of a word carries the word's label; when False,
# only the first sub-token does and the remaining ones are masked with -100.
label_all_tokens = True

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Args:
        examples: batch with a "tokens" entry (one word list per sentence) and
            an f"{task}_tags" entry (one label-id list per sentence, one id per word).

    Returns:
        The tokenizer output for the batch, extended with a "labels" entry that
        holds one label id per produced token. Tokens that do not map to a word
        (special tokens) get -100 so the loss function ignores them.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples[f"{task}_tags"]):
        seen_word = None
        row = []
        for word_id in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_id is None:
                # Special token: no source word, never scored.
                tag = -100
            elif word_id == seen_word and not label_all_tokens:
                # Continuation piece of the previous word, masked on request.
                tag = -100
            else:
                # First piece of a word, or a continuation piece that inherits
                # the word's label because label_all_tokens is set.
                tag = word_tags[word_id]
            row.append(tag)
            seen_word = word_id
        aligned_labels.append(row)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
# Tokenize the pooled dataset and attach the token-aligned "labels" column.
# batched=True hands the function a batch of examples per call, which is what
# tokenize_and_align_labels expects (it iterates over the batch itself).
tokenized_dataset = all_data.map(tokenize_and_align_labels, batched=True)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [None]:
from sklearn.model_selection import KFold

In [None]:
# Number of cross-validation folds.
n = 5
# Seed for the shuffling done by the splitter below.
seed = 40
# Shuffled K-fold splitter; the training cell iterates over kf.split(tokenized_dataset).
kf = KFold(n_splits=n, random_state=seed, shuffle=True)

In [None]:
import numpy as np

def compute_metrics(p):
    """Turn raw model outputs into seqeval scores for the Trainer.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-label
            scores and is reduced with an argmax over axis 2; ``labels`` holds
            the reference label ids, with -100 marking positions to ignore.

    Returns:
        dict with overall precision / recall / f1 / accuracy plus the
        f1 / precision / recall reported for the ``LOC`` entity type.

    Relies on the module-level ``metric`` and ``label_list``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (label id -100) and map ids to tag names,
    # keeping predictions and references aligned position by position.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    loc_scores = results['LOC']
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
        "LOC-f1": loc_scores["f1"],
        "LOC-precision": loc_scores["precision"],
        "LOC-recall": loc_scores["recall"],
    }

In [None]:
from cross_validation_LaBSE import cross_validation_LaBSE


In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback

# Fold-independent helpers: the seqeval metric read by compute_metrics and the
# collator that pads each batch of token-classification examples.
metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

# 1-based fold counter; used in the output directory name and passed to the
# external evaluation helper.
fold = 1

for train_index, eval_index in kf.split(tokenized_dataset):

    # The arguments are built INSIDE the loop so that output_dir is evaluated
    # with the current fold number. Building them once before the loop froze
    # the directory at "fold-1-ner", which made every fold save into (and
    # rotate checkpoints out of) the same folder.
    # NOTE: each fold now keeps its own checkpoints on disk — watch disk usage.
    args = TrainingArguments(
        output_dir=f"fold-{fold}-{task}",
        evaluation_strategy="epoch",
        # Save on the same schedule as evaluation so the best checkpoint can be
        # restored by load_best_model_at_end.
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        save_total_limit=1,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    train_data = tokenized_dataset.select(train_index)
    eval_data = tokenized_dataset.select(eval_index)

    # Start every fold from the pre-trained checkpoint so folds stay independent.
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

    trainer = Trainer(model,
                      args,
                      train_dataset=train_data,
                      eval_dataset=eval_data,
                      data_collator=data_collator,
                      tokenizer=tokenizer,
                      compute_metrics=compute_metrics)

    # Stop a fold early once the tracked metric has not improved for 3 evaluations.
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

    trainer.train()

    # Evaluate the fold's fine-tuned model with the project helper.
    cross_validation_LaBSE(model, tokenizer, label_list, fold)

    fold += 1


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2482.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1883785969.0, style=ProgressStyle(descr…




Some weights of BertForTokenClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running training *****
  Num examples = 16595
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5190


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall
1,0.062,0.059483,0.93509,0.931061,0.933071,0.985804,0.954386,0.942065,0.967033
2,0.0336,0.051351,0.937913,0.950306,0.944069,0.988267,0.965729,0.966041,0.965417
3,0.0187,0.05058,0.94504,0.953275,0.94914,0.989319,0.967335,0.967961,0.96671
4,0.0147,0.052494,0.945887,0.955668,0.950753,0.989519,0.968264,0.970149,0.966387
5,0.0097,0.05415,0.94639,0.954998,0.950674,0.989466,0.968387,0.966516,0.970265


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num examples = 4149
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-1038
Configuration saved in fold-1-ner/checkpoint-1038/config.json
Model weights saved in fold-1-ner/checkpoint-1038/pytorch_model.bin
tokenizer config file saved in fold-1-ner/checkpoint-1038/tokenizer_config.json
Special tokens file saved in fold-1-ner/checkpoint-1038/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num examples = 4149
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-2076
Configuration saved in fold-1-ner/checkpoint-2076/config.json
Model weights saved in

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 1044
  Batch size = 8





Evaluation mode: strict
fp: 1833 | tp: 4007 | fn: 1536
precision: 0.686 | recall: 0.723 | f-score: 0.704 | accuracy: 0.723
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 258 | tp: 5582 | fn: 525
precision: 0.956 | recall: 0.914 | f-score: 0.934 | accuracy: 1.007
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 174
  Batch size = 8





Evaluation mode: strict
fp: 282 | tp: 963 | fn: 355
precision: 0.773 | recall: 0.731 | f-score: 0.751 | accuracy: 0.731
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 74 | tp: 1171 | fn: 266
precision: 0.941 | recall: 0.815 | f-score: 0.873 | accuracy: 0.888
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 887
  Batch size = 8





Evaluation mode: strict
fp: 1665 | tp: 3222 | fn: 1866
precision: 0.659 | recall: 0.633 | f-score: 0.646 | accuracy: 0.633
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 772 | tp: 4115 | fn: 1374
precision: 0.842 | recall: 0.750 | f-score: 0.793 | accuracy: 0.809
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 346
  Batch size = 8





Evaluation mode: strict
fp: 478 | tp: 1601 | fn: 997
precision: 0.770 | recall: 0.616 | f-score: 0.685 | accuracy: 0.616
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 79 | tp: 2000 | fn: 837
precision: 0.962 | recall: 0.705 | f-score: 0.814 | accuracy: 0.770
------------------------------------------------------------------------



loading configuration file https://huggingface.co/sentence-transformers/LaBSE/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/0dacea19c849b8846d621bf58a1aaeabc1d0d7cf63e337c0539b22e068672217.8c37167d9ff226182e8d39b551a1ee5d071e918f35157e07977ee19a100d6182
Model config BertConfig {
  "_name_or_path": "old_models/LaBSE/0_Transformer",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall
1,0.0663,0.04783,0.935349,0.938533,0.936939,0.98762,0.957457,0.95302,0.961935
2,0.0366,0.043191,0.947804,0.946606,0.947205,0.989326,0.963734,0.971794,0.955806
3,0.0218,0.040542,0.950716,0.95497,0.952838,0.990398,0.966845,0.96951,0.964194
4,0.0147,0.044459,0.949371,0.953803,0.951582,0.990331,0.966941,0.966785,0.967097
5,0.0089,0.044103,0.952256,0.956331,0.95429,0.990821,0.969589,0.972421,0.966774


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num examples = 4149
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-1038
Configuration saved in fold-1-ner/checkpoint-1038/config.json
Model weights saved in fold-1-ner/checkpoint-1038/pytorch_model.bin
tokenizer config file saved in fold-1-ner/checkpoint-1038/tokenizer_config.json
Special tokens file saved in fold-1-ner/checkpoint-1038/special_tokens_map.json
Deleting older checkpoint [fold-1-ner/checkpoint-4152] due to args.save_total_limit
Deleting older checkpoint [fold-1-ner/checkpoint-5190] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num exam

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 1044
  Batch size = 8





Evaluation mode: strict
fp: 1666 | tp: 3962 | fn: 1581
precision: 0.704 | recall: 0.715 | f-score: 0.709 | accuracy: 0.715
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 243 | tp: 5385 | fn: 724
precision: 0.957 | recall: 0.881 | f-score: 0.918 | accuracy: 0.971
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 174
  Batch size = 8





Evaluation mode: strict
fp: 289 | tp: 958 | fn: 360
precision: 0.768 | recall: 0.727 | f-score: 0.747 | accuracy: 0.727
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 73 | tp: 1174 | fn: 267
precision: 0.941 | recall: 0.815 | f-score: 0.874 | accuracy: 0.891
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 887
  Batch size = 8





Evaluation mode: strict
fp: 1680 | tp: 3284 | fn: 1804
precision: 0.662 | recall: 0.645 | f-score: 0.653 | accuracy: 0.645
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 712 | tp: 4252 | fn: 1280
precision: 0.857 | recall: 0.769 | f-score: 0.810 | accuracy: 0.836
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 346
  Batch size = 8





Evaluation mode: strict
fp: 479 | tp: 1590 | fn: 1008
precision: 0.768 | recall: 0.612 | f-score: 0.681 | accuracy: 0.612
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 70 | tp: 1999 | fn: 843
precision: 0.966 | recall: 0.703 | f-score: 0.814 | accuracy: 0.769
------------------------------------------------------------------------



loading configuration file https://huggingface.co/sentence-transformers/LaBSE/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/0dacea19c849b8846d621bf58a1aaeabc1d0d7cf63e337c0539b22e068672217.8c37167d9ff226182e8d39b551a1ee5d071e918f35157e07977ee19a100d6182
Model config BertConfig {
  "_name_or_path": "old_models/LaBSE/0_Transformer",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall
1,0.0669,0.049602,0.935198,0.942553,0.938861,0.987778,0.962067,0.96459,0.959556
2,0.0391,0.042401,0.9456,0.955212,0.950382,0.989833,0.965673,0.954444,0.977169
3,0.0243,0.042832,0.947912,0.959912,0.953874,0.9905,0.97005,0.963034,0.977169
4,0.0147,0.043257,0.953852,0.959432,0.956634,0.990927,0.969432,0.961501,0.977495
5,0.0083,0.044547,0.95507,0.960199,0.957628,0.991234,0.970011,0.964228,0.975864


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num examples = 4149
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-1038
Configuration saved in fold-1-ner/checkpoint-1038/config.json
Model weights saved in fold-1-ner/checkpoint-1038/pytorch_model.bin
tokenizer config file saved in fold-1-ner/checkpoint-1038/tokenizer_config.json
Special tokens file saved in fold-1-ner/checkpoint-1038/special_tokens_map.json
Deleting older checkpoint [fold-1-ner/checkpoint-5190] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num examples = 4149
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-2076


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 1044
  Batch size = 8





Evaluation mode: strict
fp: 1887 | tp: 4158 | fn: 1385
precision: 0.688 | recall: 0.750 | f-score: 0.718 | accuracy: 0.750
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 331 | tp: 5714 | fn: 435
precision: 0.945 | recall: 0.929 | f-score: 0.937 | accuracy: 1.031
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 174
  Batch size = 8





Evaluation mode: strict
fp: 289 | tp: 961 | fn: 357
precision: 0.769 | recall: 0.729 | f-score: 0.748 | accuracy: 0.729
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 84 | tp: 1166 | fn: 264
precision: 0.933 | recall: 0.815 | f-score: 0.870 | accuracy: 0.885
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 887
  Batch size = 8





Evaluation mode: strict
fp: 1801 | tp: 3359 | fn: 1729
precision: 0.651 | recall: 0.660 | f-score: 0.656 | accuracy: 0.660
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 828 | tp: 4332 | fn: 1202
precision: 0.840 | recall: 0.783 | f-score: 0.810 | accuracy: 0.851
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 346
  Batch size = 8





Evaluation mode: strict
fp: 487 | tp: 1593 | fn: 1005
precision: 0.766 | recall: 0.613 | f-score: 0.681 | accuracy: 0.613
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 80 | tp: 2000 | fn: 842
precision: 0.962 | recall: 0.704 | f-score: 0.813 | accuracy: 0.770
------------------------------------------------------------------------



loading configuration file https://huggingface.co/sentence-transformers/LaBSE/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/0dacea19c849b8846d621bf58a1aaeabc1d0d7cf63e337c0539b22e068672217.8c37167d9ff226182e8d39b551a1ee5d071e918f35157e07977ee19a100d6182
Model config BertConfig {
  "_name_or_path": "old_models/LaBSE/0_Transformer",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall
1,0.0635,0.05601,0.931204,0.930756,0.93098,0.984831,0.949992,0.971744,0.929193
2,0.0359,0.048683,0.940217,0.942297,0.941256,0.987867,0.962486,0.973024,0.952174
3,0.0218,0.047192,0.948215,0.952683,0.950444,0.989547,0.96741,0.9715,0.963354
4,0.0136,0.04868,0.950154,0.949606,0.94988,0.989533,0.966755,0.97176,0.961801
5,0.0083,0.050133,0.949112,0.950664,0.949887,0.989708,0.966474,0.970561,0.962422


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num examples = 4149
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-1038
Configuration saved in fold-1-ner/checkpoint-1038/config.json
Model weights saved in fold-1-ner/checkpoint-1038/pytorch_model.bin
tokenizer config file saved in fold-1-ner/checkpoint-1038/tokenizer_config.json
Special tokens file saved in fold-1-ner/checkpoint-1038/special_tokens_map.json
Deleting older checkpoint [fold-1-ner/checkpoint-5190] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num examples = 4149
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-2076


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 1044
  Batch size = 8





Evaluation mode: strict
fp: 1921 | tp: 3918 | fn: 1625
precision: 0.671 | recall: 0.707 | f-score: 0.688 | accuracy: 0.707
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 291 | tp: 5548 | fn: 564
precision: 0.950 | recall: 0.908 | f-score: 0.928 | accuracy: 1.001
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 174
  Batch size = 8





Evaluation mode: strict
fp: 285 | tp: 964 | fn: 354
precision: 0.772 | recall: 0.731 | f-score: 0.751 | accuracy: 0.731
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 73 | tp: 1176 | fn: 263
precision: 0.942 | recall: 0.817 | f-score: 0.875 | accuracy: 0.892
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 887
  Batch size = 8





Evaluation mode: strict
fp: 1634 | tp: 3207 | fn: 1881
precision: 0.662 | recall: 0.630 | f-score: 0.646 | accuracy: 0.630
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 690 | tp: 4151 | fn: 1357
precision: 0.857 | recall: 0.754 | f-score: 0.802 | accuracy: 0.816
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 346
  Batch size = 8





Evaluation mode: strict
fp: 467 | tp: 1588 | fn: 1010
precision: 0.773 | recall: 0.611 | f-score: 0.683 | accuracy: 0.611
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 75 | tp: 1980 | fn: 860
precision: 0.964 | recall: 0.697 | f-score: 0.809 | accuracy: 0.762
------------------------------------------------------------------------



loading configuration file https://huggingface.co/sentence-transformers/LaBSE/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/0dacea19c849b8846d621bf58a1aaeabc1d0d7cf63e337c0539b22e068672217.8c37167d9ff226182e8d39b551a1ee5d071e918f35157e07977ee19a100d6182
Model config BertConfig {
  "_name_or_path": "old_models/LaBSE/0_Transformer",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall
1,0.062,0.054492,0.945629,0.937417,0.941505,0.98695,0.963251,0.964152,0.962352
2,0.037,0.053077,0.950194,0.945441,0.947812,0.988212,0.968365,0.970028,0.966708


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num examples = 4148
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-1038
Configuration saved in fold-1-ner/checkpoint-1038/config.json
Model weights saved in fold-1-ner/checkpoint-1038/pytorch_model.bin
tokenizer config file saved in fold-1-ner/checkpoint-1038/tokenizer_config.json
Special tokens file saved in fold-1-ner/checkpoint-1038/special_tokens_map.json
Deleting older checkpoint [fold-1-ner/checkpoint-3114] due to args.save_total_limit
Deleting older checkpoint [fold-1-ner/checkpoint-5190] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, id, tokens, pos_tags.
***** Running Evaluation *****
  Num exam

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall
1,0.062,0.054492,0.945629,0.937417,0.941505,0.98695,0.963251,0.964152,0.962352
2,0.037,0.053077,0.950194,0.945441,0.947812,0.988212,0.968365,0.970028,0.966708


In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback

# Fine-tune and evaluate the token-classification model on selected folds of
# the k-fold split. `kf`, `tokenized_dataset`, `tokenizer`, `label_list`,
# `compute_metrics` and `cross_validation_LaBSE` come from earlier cells / the
# utils folder.

# 1-based fold numbers to train in this run. Every fold is still printed, but
# only the ones listed here are trained (one fold per session keeps a single
# run short). Use e.g. {1, 2, 3, 4, 5} to train all folds in one go.
folds_to_run = {5}

metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

fold = 1  # 1-based fold counter, advanced at the end of every iteration

for train_index, eval_index in kf.split(tokenized_dataset):
    print(fold)
    if fold in folds_to_run:
        # Build the arguments per fold: the f-string in `output_dir` is
        # evaluated when TrainingArguments is constructed, so creating it once
        # before the loop would send every fold's checkpoints to "fold-1-<task>"
        # and let save_total_limit rotate checkpoints across different folds.
        args = TrainingArguments(
            output_dir=f"fold-{fold}-{task}",
            evaluation_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=5,
            weight_decay=0.01,
            load_best_model_at_end=True,
            save_total_limit=1,
            metric_for_best_model="f1",
            greater_is_better=True,
        )

        train_data = tokenized_dataset.select(train_index)
        eval_data = tokenized_dataset.select(eval_index)

        # Start every fold from the pre-trained checkpoint so folds stay independent.
        model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

        trainer = Trainer(model,
                          args,
                          train_dataset=train_data,
                          eval_dataset=eval_data,
                          data_collator=data_collator,
                          tokenizer=tokenizer,
                          compute_metrics=compute_metrics)

        # Stop when the tracked metric ("f1") has not improved for 3 evaluations.
        trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

        trainer.train()

        # NOTE(review): in this notebook's recorded output the "forgiving" mode
        # reports accuracy above 1.0 (e.g. 1.010) for one test set; presumably
        # printed by this helper -- verify its accuracy denominator.
        cross_validation_LaBSE(model, tokenizer, label_list, fold)

    fold += 1


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2482.0, style=ProgressStyle(description…


1
2
3
4
5


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1883785969.0, style=ProgressStyle(descr…




Some weights of BertForTokenClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, tokens, ner_tags, id, pos_tags.
***** Running training *****
  Num examples = 16596
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5190


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall
1,0.0626,0.053931,0.945672,0.941476,0.943569,0.987123,0.966432,0.969915,0.962974
2,0.0365,0.052465,0.951497,0.948084,0.949787,0.988731,0.969669,0.969518,0.96982
3,0.0189,0.049113,0.955988,0.955446,0.955717,0.989648,0.970657,0.968702,0.97262
4,0.0126,0.052573,0.955562,0.956013,0.955787,0.989794,0.969193,0.959732,0.978843
5,0.0092,0.05328,0.959883,0.959883,0.959883,0.990525,0.972395,0.969388,0.97542


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, tokens, ner_tags, id, pos_tags.
***** Running Evaluation *****
  Num examples = 4148
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-1038
Configuration saved in fold-1-ner/checkpoint-1038/config.json
Model weights saved in fold-1-ner/checkpoint-1038/pytorch_model.bin
tokenizer config file saved in fold-1-ner/checkpoint-1038/tokenizer_config.json
Special tokens file saved in fold-1-ner/checkpoint-1038/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, tokens, ner_tags, id, pos_tags.
***** Running Evaluation *****
  Num examples = 4148
  Batch size = 16
Saving model checkpoint to fold-1-ner/checkpoint-2076
Configuration saved in fold-1-ner/checkpoint-2076/config.json
Model weights saved in

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 1044
  Batch size = 8





Evaluation mode: strict
fp: 1830 | tp: 4059 | fn: 1484
precision: 0.689 | recall: 0.732 | f-score: 0.710 | accuracy: 0.732
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 291 | tp: 5598 | fn: 542
precision: 0.951 | recall: 0.912 | f-score: 0.931 | accuracy: 1.010
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 174
  Batch size = 8





Evaluation mode: strict
fp: 305 | tp: 975 | fn: 343
precision: 0.762 | recall: 0.740 | f-score: 0.751 | accuracy: 0.740
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 92 | tp: 1188 | fn: 254
precision: 0.928 | recall: 0.824 | f-score: 0.873 | accuracy: 0.901
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 887
  Batch size = 8





Evaluation mode: strict
fp: 1699 | tp: 3359 | fn: 1729
precision: 0.664 | recall: 0.660 | f-score: 0.662 | accuracy: 0.660
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 752 | tp: 4306 | fn: 1219
precision: 0.851 | recall: 0.779 | f-score: 0.814 | accuracy: 0.846
------------------------------------------------------------------------



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens.
***** Running Prediction *****
  Num examples = 346
  Batch size = 8





Evaluation mode: strict
fp: 498 | tp: 1605 | fn: 993
precision: 0.763 | recall: 0.618 | f-score: 0.683 | accuracy: 0.618
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 93 | tp: 2010 | fn: 835
precision: 0.956 | recall: 0.707 | f-score: 0.812 | accuracy: 0.774
------------------------------------------------------------------------

