In [1]:
import sys

# Make the helper modules in ../utils importable from this notebook
# (the cross_validation_mBERT import further down presumably resolves from there - TODO confirm).
# The path is relative, so it is resolved against the kernel's current working directory.
sys.path.insert(1, '../utils')

## Training / Fine-tuning Process

In [2]:
# Run configuration shared by the cells below.
task = "ner"  # selects the "<task>_tags" label column and is embedded in the per-fold output directory name
model_checkpoint = "bert-base-multilingual-cased" # mBERT pre-trained from HuggingFace Hub
batch_size = 16  # used as the per-device batch size for both training and evaluation

### Loading the dataset

In [3]:
from datasets import load_dataset, load_metric, concatenate_datasets

# Fetch CoNLL-2003 (a local cached copy is reused when one exists).
datasets = load_dataset("conll2003")

# Pool the three predefined splits into a single dataset; the K-fold splitter
# defined later re-partitions this pooled data into train/eval folds.
all_data = concatenate_datasets(
    [datasets[split] for split in ('train', 'validation', 'test')]
)

Reusing dataset conll2003 (C:\Users\Bernard\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [4]:
# Tag names of the "<task>_tags" column. The position in this list is the integer
# class id, which is how compute_metrics maps predicted ids back to tag strings.
label_list = all_data.features[f"{task}_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

### Processing the data

In [5]:
from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to the same checkpoint the models are initialised from.
# NOTE(review): tokenize_and_align_labels calls word_ids() on the encoding, which presumably
# needs a "fast" tokenizer - confirm AutoTokenizer returns one for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
# True  -> every sub-token of a word carries the word's tag.
# False -> only the first sub-token is tagged; the remaining ones are masked with -100.
label_all_tokens = True

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: batch mapping with a "tokens" entry (one list of words per
            sentence) and a "<task>_tags" entry (one list of integer tags per
            sentence, indexed by word position).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry holding
        one list of label ids per sentence, one id per produced token.
        Positions whose word id is None (special tokens) are set to -100 so
        that they are ignored by the loss function.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_tags, word_ids):
        # Walk the tokens of one sentence and decide the label of each of them.
        aligned = []
        last_word = None
        for current_word in word_ids:
            if current_word is None:
                # Special token: masked out of the loss.
                aligned.append(-100)
            elif current_word == last_word and not label_all_tokens:
                # Continuation sub-token while only first sub-tokens are labelled.
                aligned.append(-100)
            else:
                # First sub-token of a word, or any sub-token when all are labelled.
                aligned.append(word_tags[current_word])
            last_word = current_word
        return aligned

    encoded["labels"] = [
        align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples[f"{task}_tags"])
    ]
    return encoded

In [7]:
# Run tokenize_and_align_labels over the pooled dataset; batched=True hands the
# function a batch of examples per call rather than a single example.
tokenized_dataset = all_data.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/21 [00:00<?, ?ba/s]

In [8]:
from sklearn.model_selection import KFold

In [9]:
# Cross-validation setup: shuffled K-fold splitter used by the training loop below.
n = 5  # number of folds
seed = 40  # passed as random_state so the shuffled fold assignment is repeatable
kf = KFold(n_splits=n, random_state=seed, shuffle=True)

In [10]:
import numpy as np

def compute_metrics(p):
    """Turn raw model outputs into overall and LOC-specific scores.

    Relies on two module-level names being defined before it is called:
    `label_list` (id -> tag string) and `metric` (object whose `compute`
    method receives the tag sequences).

    Args:
        p: pair `(predictions, labels)`. `predictions` is reduced with an
            argmax over axis 2; `labels` holds the reference ids, where -100
            marks positions that must be ignored.

    Returns:
        dict with the overall precision, recall, f1 and accuracy plus the
        f1, precision and recall reported for the 'LOC' entry.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only the positions that carry a real label (drop the -100 mask).
        kept = [
            (predicted_id, reference_id)
            for predicted_id, reference_id in zip(predicted_row, reference_row)
            if reference_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[reference_id] for _, reference_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    loc_scores = results['LOC']
    summary["LOC-f1"] = loc_scores["f1"]
    summary["LOC-precision"] = loc_scores["precision"]
    summary["LOC-recall"] = loc_scores["recall"]
    return summary

In [11]:
from cross_validation_mBERT import cross_validation_mBERT

In [12]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback

# 1-based index of the fold currently being trained; it is embedded in the
# output directory name and passed on to cross_validation_mBERT.
fold = 1

# Shared across folds: the metric used by compute_metrics and the collator.
metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

for train_index, eval_index in kf.split(tokenized_dataset):

    # Build the arguments inside the loop: output_dir embeds the fold number, so
    # creating them once before the loop sent every fold's checkpoints to the
    # same "fold-1-<task>" directory.
    # NOTE(review): recent transformers releases require save_strategy to match
    # evaluation_strategy when load_best_model_at_end=True - confirm against the
    # installed version before adding save_strategy="epoch".
    args = TrainingArguments(
        output_dir=f"fold-{fold}-{task}",
        evaluation_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    # Rows selected by the splitter for this fold.
    train_data = tokenized_dataset.select(train_index)
    eval_data = tokenized_dataset.select(eval_index)

    # Start every fold from the pre-trained checkpoint so folds do not share fine-tuned weights.
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

    trainer = Trainer(model,
                      args,
                      train_dataset=train_data,
                      eval_dataset=eval_data,
                      data_collator=data_collator,
                      tokenizer=tokenizer,
                      compute_metrics=compute_metrics)

    # Stop a fold early after 3 evaluations without improvement of the tracked metric ("f1").
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

    trainer.train()

    # Project helper from ../utils that evaluates the fine-tuned model for this fold.
    # NOTE(review): assumes the Trainer restored the best weights into this same
    # `model` object (load_best_model_at_end=True) - confirm for the installed version.
    cross_validation_mBERT(model, tokenizer, label_list, fold)

    fold += 1


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall,Runtime,Samples Per Second
1,0.0844,0.084179,0.899461,0.921194,0.910198,0.979094,0.941332,0.936762,0.945946,6.3694,651.397
2,0.0515,0.079931,0.922732,0.939195,0.930891,0.982944,0.958111,0.964804,0.95151,6.2848,660.163
3,0.0316,0.071331,0.937112,0.943035,0.940065,0.985363,0.958131,0.949285,0.967144,6.3586,652.506
4,0.0223,0.076697,0.93581,0.941275,0.938535,0.98503,0.96249,0.966346,0.958665,6.3625,652.105
5,0.015,0.086339,0.934252,0.943595,0.938901,0.985092,0.955003,0.953993,0.956015,6.3998,648.299
6,0.0102,0.087011,0.936207,0.945196,0.94068,0.985178,0.956774,0.951757,0.961844,6.2861,660.023
7,0.0079,0.091336,0.937554,0.946556,0.942034,0.985709,0.960649,0.963981,0.95734,6.3743,650.891
8,0.0036,0.091812,0.937406,0.947756,0.942553,0.985635,0.959616,0.965406,0.953895,6.4215,646.107
9,0.003,0.098552,0.938524,0.949036,0.94375,0.986067,0.962234,0.96583,0.958665,6.4676,641.509
10,0.0018,0.097202,0.940241,0.950396,0.945291,0.986449,0.960404,0.96322,0.957605,6.2988,658.699


  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1523 | tp: 3872 | fn: 1671
precision: 0.718 | recall: 0.699 | f-score: 0.708 | accuracy: 0.699
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 238 | tp: 5157 | fn: 992
precision: 0.956 | recall: 0.839 | f-score: 0.893 | accuracy: 0.930
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 309 | tp: 934 | fn: 384
precision: 0.751 | recall: 0.709 | f-score: 0.729 | accuracy: 0.709
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 98 | tp: 1145 | fn: 295
precision: 0.921 | recall: 0.795 | f-score: 0.854 | accuracy: 0.869
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1694 | tp: 3154 | fn: 1934
precision: 0.651 | recall: 0.620 | f-score: 0.635 | accuracy: 0.620
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 763 | tp: 4085 | fn: 1427
precision: 0.843 | recall: 0.741 | f-score: 0.789 | accuracy: 0.803
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 481 | tp: 1552 | fn: 1046
precision: 0.763 | recall: 0.597 | f-score: 0.670 | accuracy: 0.597
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 101 | tp: 1932 | fn: 908
precision: 0.950 | recall: 0.680 | f-score: 0.793 | accuracy: 0.744
------------------------------------------------------------------------



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall,Runtime,Samples Per Second
1,0.0856,0.070064,0.92234,0.920918,0.921628,0.981305,0.947648,0.950173,0.945136,6.282,660.457
2,0.051,0.057977,0.935153,0.939249,0.937197,0.985413,0.956059,0.960664,0.951497,6.4596,642.295
3,0.0368,0.057734,0.938172,0.947684,0.942904,0.986053,0.954356,0.939409,0.969785,6.3503,653.357
4,0.0231,0.060254,0.946844,0.947765,0.947304,0.987393,0.962297,0.960644,0.963954,6.6211,626.637
5,0.0151,0.064711,0.949638,0.946711,0.948172,0.987762,0.96251,0.965591,0.959449,6.4201,646.252
6,0.0098,0.073354,0.945052,0.952794,0.948907,0.987688,0.960474,0.954938,0.966075,6.388,649.499
7,0.0076,0.070025,0.946309,0.953524,0.949903,0.987775,0.96354,0.960495,0.966605,6.334,655.033
8,0.0056,0.074419,0.950469,0.952551,0.951509,0.988377,0.964656,0.967226,0.962099,6.3943,648.861
9,0.0038,0.075889,0.949273,0.9532,0.951232,0.988254,0.963817,0.963945,0.963689,6.413,646.967
10,0.0026,0.077334,0.950231,0.952389,0.951308,0.9885,0.963689,0.963689,0.963689,6.3447,653.927


  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1562 | tp: 3832 | fn: 1711
precision: 0.710 | recall: 0.691 | f-score: 0.701 | accuracy: 0.691
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 229 | tp: 5165 | fn: 952
precision: 0.958 | recall: 0.844 | f-score: 0.897 | accuracy: 0.932
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 309 | tp: 935 | fn: 383
precision: 0.752 | recall: 0.709 | f-score: 0.730 | accuracy: 0.709
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 89 | tp: 1155 | fn: 291
precision: 0.928 | recall: 0.799 | f-score: 0.859 | accuracy: 0.876
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1504 | tp: 3140 | fn: 1948
precision: 0.676 | recall: 0.617 | f-score: 0.645 | accuracy: 0.617
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 673 | tp: 3971 | fn: 1492
precision: 0.855 | recall: 0.727 | f-score: 0.786 | accuracy: 0.780
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 471 | tp: 1525 | fn: 1073
precision: 0.764 | recall: 0.587 | f-score: 0.664 | accuracy: 0.587
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 79 | tp: 1917 | fn: 921
precision: 0.960 | recall: 0.675 | f-score: 0.793 | accuracy: 0.738
------------------------------------------------------------------------



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall,Runtime,Samples Per Second
1,0.0914,0.067239,0.917605,0.922212,0.919903,0.981085,0.932446,0.903098,0.963766,6.4047,647.803
2,0.0558,0.062211,0.94162,0.944847,0.943231,0.985143,0.960787,0.965794,0.955832,6.263,662.462
3,0.0331,0.064573,0.939934,0.946601,0.943255,0.985959,0.954716,0.937086,0.973023,6.6067,627.997
4,0.0227,0.058285,0.953352,0.957759,0.95555,0.988891,0.967472,0.95586,0.979371,6.7075,618.56
5,0.0181,0.064807,0.951111,0.958237,0.954661,0.988359,0.968335,0.962141,0.97461,6.3179,656.704
6,0.0095,0.063948,0.955566,0.956404,0.955985,0.988631,0.967801,0.95802,0.977784,6.4174,646.525
7,0.0068,0.068787,0.95465,0.957998,0.956321,0.989052,0.970974,0.973167,0.968791,6.401,648.18
8,0.0041,0.072341,0.956311,0.957759,0.957034,0.989064,0.972303,0.969745,0.974874,6.4908,639.216
9,0.0033,0.076048,0.952577,0.958954,0.955755,0.988953,0.970654,0.965951,0.975403,6.475,640.772
10,0.0026,0.075387,0.955758,0.962461,0.959098,0.989497,0.971729,0.966266,0.977255,6.3476,653.632


  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1664 | tp: 4108 | fn: 1435
precision: 0.712 | recall: 0.741 | f-score: 0.726 | accuracy: 0.741
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 300 | tp: 5472 | fn: 686
precision: 0.948 | recall: 0.889 | f-score: 0.917 | accuracy: 0.987
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 269 | tp: 945 | fn: 373
precision: 0.778 | recall: 0.717 | f-score: 0.746 | accuracy: 0.717
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 80 | tp: 1134 | fn: 291
precision: 0.934 | recall: 0.796 | f-score: 0.859 | accuracy: 0.860
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1567 | tp: 3142 | fn: 1946
precision: 0.667 | recall: 0.618 | f-score: 0.641 | accuracy: 0.618
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 693 | tp: 4016 | fn: 1493
precision: 0.853 | recall: 0.729 | f-score: 0.786 | accuracy: 0.789
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 471 | tp: 1526 | fn: 1072
precision: 0.764 | recall: 0.587 | f-score: 0.664 | accuracy: 0.587
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 80 | tp: 1917 | fn: 926
precision: 0.960 | recall: 0.674 | f-score: 0.792 | accuracy: 0.738
------------------------------------------------------------------------



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall,Runtime,Samples Per Second
1,0.0919,0.070227,0.931263,0.922815,0.92702,0.981625,0.94488,0.957802,0.932302,6.1435,675.344
2,0.0507,0.067123,0.936449,0.938042,0.937245,0.984702,0.952991,0.944915,0.961207,6.168,672.665
3,0.0332,0.065747,0.940987,0.946627,0.943798,0.985537,0.959051,0.944663,0.973884,6.1513,674.486
4,0.0223,0.066737,0.951394,0.948004,0.949696,0.987269,0.968825,0.964564,0.973124,6.309,657.632
5,0.0135,0.071736,0.942147,0.948328,0.945227,0.986758,0.962636,0.958522,0.966785,6.107,679.384
6,0.0103,0.075651,0.952929,0.95262,0.952774,0.988203,0.969858,0.968877,0.970842,6.0367,687.3
7,0.0062,0.082167,0.949035,0.951648,0.95034,0.987356,0.966168,0.958572,0.973884,6.1271,677.157
8,0.004,0.080311,0.950886,0.95181,0.951348,0.987754,0.970157,0.967709,0.972617,6.2017,669.015
9,0.0022,0.084803,0.952976,0.951972,0.952474,0.988215,0.969383,0.967424,0.971349,6.1329,676.516


  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1533 | tp: 3910 | fn: 1633
precision: 0.718 | recall: 0.705 | f-score: 0.712 | accuracy: 0.705
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 274 | tp: 5169 | fn: 934
precision: 0.950 | recall: 0.847 | f-score: 0.895 | accuracy: 0.933
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 278 | tp: 930 | fn: 388
precision: 0.770 | recall: 0.706 | f-score: 0.736 | accuracy: 0.706
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 73 | tp: 1135 | fn: 299
precision: 0.940 | recall: 0.791 | f-score: 0.859 | accuracy: 0.861
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1536 | tp: 3107 | fn: 1981
precision: 0.669 | recall: 0.611 | f-score: 0.639 | accuracy: 0.611
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 667 | tp: 3976 | fn: 1507
precision: 0.856 | recall: 0.725 | f-score: 0.785 | accuracy: 0.781
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 452 | tp: 1509 | fn: 1089
precision: 0.770 | recall: 0.581 | f-score: 0.662 | accuracy: 0.581
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 66 | tp: 1895 | fn: 935
precision: 0.966 | recall: 0.670 | f-score: 0.791 | accuracy: 0.729
------------------------------------------------------------------------



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall,Runtime,Samples Per Second
1,0.0859,0.074931,0.926069,0.928854,0.92746,0.980779,0.948754,0.932819,0.965242,6.4852,639.615
2,0.0534,0.068275,0.936419,0.942941,0.939669,0.984332,0.961653,0.971121,0.952369,6.4003,648.095
3,0.0303,0.068522,0.949813,0.946581,0.948195,0.98638,0.967258,0.972417,0.962152,6.4208,646.026
4,0.0204,0.075673,0.94819,0.951567,0.949876,0.986195,0.96535,0.958852,0.971936,6.5008,638.078
5,0.0154,0.078049,0.955439,0.953625,0.954531,0.987712,0.971236,0.977569,0.964985,6.3943,648.707
6,0.0096,0.079842,0.959215,0.954812,0.957008,0.988082,0.970287,0.978028,0.962667,6.5743,630.943
7,0.006,0.081327,0.956581,0.951963,0.954266,0.9877,0.967825,0.967576,0.968074,6.2598,662.637
8,0.0046,0.081537,0.956088,0.956315,0.956202,0.987996,0.967089,0.962038,0.972194,6.5103,637.142
9,0.0037,0.083912,0.960981,0.955049,0.958006,0.988638,0.970368,0.971119,0.969619,6.4655,641.557
10,0.0021,0.084574,0.957845,0.956632,0.957238,0.988416,0.969939,0.972071,0.967817,6.555,632.799


  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1540 | tp: 4169 | fn: 1374
precision: 0.730 | recall: 0.752 | f-score: 0.741 | accuracy: 0.752
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 264 | tp: 5445 | fn: 683
precision: 0.954 | recall: 0.889 | f-score: 0.920 | accuracy: 0.982
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 288 | tp: 935 | fn: 383
precision: 0.765 | recall: 0.709 | f-score: 0.736 | accuracy: 0.709
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 82 | tp: 1141 | fn: 295
precision: 0.933 | recall: 0.795 | f-score: 0.858 | accuracy: 0.866
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 1604 | tp: 3174 | fn: 1914
precision: 0.664 | recall: 0.624 | f-score: 0.643 | accuracy: 0.624
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 723 | tp: 4055 | fn: 1430
precision: 0.849 | recall: 0.739 | f-score: 0.790 | accuracy: 0.797
------------------------------------------------------------------------



  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation mode: strict
fp: 486 | tp: 1526 | fn: 1072
precision: 0.758 | recall: 0.587 | f-score: 0.662 | accuracy: 0.587
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 92 | tp: 1920 | fn: 918
precision: 0.954 | recall: 0.677 | f-score: 0.792 | accuracy: 0.739
------------------------------------------------------------------------

