In [1]:
import itertools

import numpy as np
import torch
from transformers import BertForTokenClassification, BertConfig, IntervalStrategy, TrainingArguments, Trainer
import datasets
from datasets.arrow_dataset import Dataset

import ruamel.yaml

import abctk.obj.comparative as aoc

import abct_comp_ner_utils.models.NER_with_root as nwr

# Batch size handed to every `Dataset.map(..., batched = True)` call below.
BATCH_SIZE = 32
# Directory used for checkpoints, the final model and (under `logs/`) the
# training logs; the evaluation section reloads the model from here.
OUTPUT_PATH = "../../results_2022-12-27"

# Tokenizer as provided by the project's NER-with-root module.
tokenizer = nwr.get_tokenizer()

In [4]:
# Hub repository of the corpus and the exact commit this experiment is pinned to.
DATASET_REPO = "abctreebank/comparative-NER-BCCWJ"
DATASET_REVISION = "b91b7660af70ab231111a61354463cd402c32034"

# `use_auth_token = True` makes `datasets` authenticate with the locally
# stored Hugging Face credentials.
dataset_raw = datasets.load_dataset(
    DATASET_REPO,
    revision = DATASET_REVISION,
    use_auth_token = True,
)

Downloading metadata:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/632 [00:00<?, ?B/s]

Using custom data configuration default-8bab31363ef54ad0


Downloading and preparing dataset None/None to /home/owner/.cache/huggingface/datasets/abctreebank___parquet/default-8bab31363ef54ad0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/395k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

  

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split:   0%|          | 0/3115 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/350 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/owner/.cache/huggingface/datasets/abctreebank___parquet/default-8bab31363ef54ad0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

## Training

In [5]:
def _encode_train_batch(batch):
    """Pass one batch through `nwr.convert_annotation_entries_to_matrices` with `return_type = "pt"`."""
    return nwr.convert_annotation_entries_to_matrices(
        batch,
        return_type = "pt",
    )


ds_train_source: Dataset = dataset_raw["train"]

# All columns of the source split are removed, so the mapped dataset keeps
# only what the converter returns.
ds_train: Dataset = ds_train_source.map(
    _encode_train_batch,
    batch_size = BATCH_SIZE,
    batched = True,
    remove_columns = ds_train_source.column_names,
)

  0%|          | 0/98 [00:00<?, ?ba/s]

In [6]:
# train/eval split
# A fixed seed keeps the held-out 10% identical across kernel restarts;
# without one, the shuffle produces a different partition on every run,
# so the model would be trained on a different subset each time.
ds_train_split = ds_train.train_test_split(
    test_size = 0.1,
    shuffle = True,
    seed = 2630987289,  # same value as the `seed` given to `TrainingArguments`
)

In [7]:
# Configuration of the pretrained checkpoint, extended with this task's
# label <-> id mappings.
config = BertConfig.from_pretrained(
    nwr.BERT_MODEL,
    label2id = nwr.LABEL2ID,
    id2label = nwr.ID2LABEL,
)

# Token-classification model initialised from the same checkpoint.
model = BertForTokenClassification.from_pretrained(
    nwr.BERT_MODEL,
    config = config,
)

# Hyperparameters of the fine-tuning run.
training_args = TrainingArguments(
    # where things are written
    output_dir = OUTPUT_PATH,
    logging_dir = f"{OUTPUT_PATH}/logs",
    logging_steps = 10,
    save_strategy = IntervalStrategy.STEPS,
    save_steps = 1000,
    # optimisation
    num_train_epochs = 27,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 128,
    learning_rate = 5e-5,
    warmup_steps = 200,
    weight_decay = 0,
    # reproducibility
    seed = 2630987289,
)


def _provide_model():
    """Return the `model` object built above (no new model is created)."""
    return model


# NOTE(review): only the "train" part of `ds_train_split` is handed over;
# no `eval_dataset` is given, so the held-out part is not used by this Trainer.
trainer = Trainer(
    model_init = _provide_model,
    args = training_args,
    train_dataset = ds_train_split["train"],
)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the m

In [8]:
# Run the fine-tuning loop configured in `training_args`.
trainer.train()
# Persist the trainer state and the final model; the recorded log shows the
# model being written to the Trainer's output directory.
trainer.save_state()
trainer.save_model()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: token_subwords. If token_subwords are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2803
  Num Epochs = 27
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1188
  Number of trainable parameters = 110031366


  0%|          | 0/1188 [00:00<?, ?it/s]

{'loss': 1.7391, 'learning_rate': 2.5e-06, 'epoch': 0.23}
{'loss': 1.48, 'learning_rate': 5e-06, 'epoch': 0.45}
{'loss': 0.9458, 'learning_rate': 7.5e-06, 'epoch': 0.68}
{'loss': 0.3415, 'learning_rate': 1e-05, 'epoch': 0.91}
{'loss': 0.2201, 'learning_rate': 1.25e-05, 'epoch': 1.14}
{'loss': 0.1517, 'learning_rate': 1.5e-05, 'epoch': 1.36}
{'loss': 0.1165, 'learning_rate': 1.75e-05, 'epoch': 1.59}
{'loss': 0.0927, 'learning_rate': 2e-05, 'epoch': 1.82}
{'loss': 0.0753, 'learning_rate': 2.25e-05, 'epoch': 2.05}
{'loss': 0.0613, 'learning_rate': 2.5e-05, 'epoch': 2.27}
{'loss': 0.0509, 'learning_rate': 2.7500000000000004e-05, 'epoch': 2.5}
{'loss': 0.05, 'learning_rate': 3e-05, 'epoch': 2.73}
{'loss': 0.0481, 'learning_rate': 3.2500000000000004e-05, 'epoch': 2.95}
{'loss': 0.0402, 'learning_rate': 3.5e-05, 'epoch': 3.18}
{'loss': 0.0351, 'learning_rate': 3.7500000000000003e-05, 'epoch': 3.41}
{'loss': 0.0304, 'learning_rate': 4e-05, 'epoch': 3.64}
{'loss': 0.0337, 'learning_rate': 4.25e

Saving model checkpoint to ../../results_2022-12-27/checkpoint-1000
Configuration saved in ../../results_2022-12-27/checkpoint-1000/config.json


{'loss': 0.0015, 'learning_rate': 9.51417004048583e-06, 'epoch': 22.73}


Model weights saved in ../../results_2022-12-27/checkpoint-1000/pytorch_model.bin


{'loss': 0.0017, 'learning_rate': 9.008097165991904e-06, 'epoch': 22.95}
{'loss': 0.0014, 'learning_rate': 8.502024291497976e-06, 'epoch': 23.18}
{'loss': 0.0016, 'learning_rate': 7.99595141700405e-06, 'epoch': 23.41}
{'loss': 0.0015, 'learning_rate': 7.489878542510122e-06, 'epoch': 23.64}
{'loss': 0.0015, 'learning_rate': 6.983805668016195e-06, 'epoch': 23.86}
{'loss': 0.0014, 'learning_rate': 6.4777327935222675e-06, 'epoch': 24.09}
{'loss': 0.0016, 'learning_rate': 5.971659919028341e-06, 'epoch': 24.32}
{'loss': 0.0012, 'learning_rate': 5.465587044534413e-06, 'epoch': 24.55}
{'loss': 0.0013, 'learning_rate': 4.9595141700404865e-06, 'epoch': 24.77}
{'loss': 0.0014, 'learning_rate': 4.453441295546559e-06, 'epoch': 25.0}
{'loss': 0.0013, 'learning_rate': 3.9473684210526315e-06, 'epoch': 25.23}
{'loss': 0.0012, 'learning_rate': 3.4412955465587043e-06, 'epoch': 25.45}
{'loss': 0.0013, 'learning_rate': 2.9352226720647772e-06, 'epoch': 25.68}
{'loss': 0.0014, 'learning_rate': 2.429149797570



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ../../results_2022-12-27
Configuration saved in ../../results_2022-12-27/config.json


{'train_runtime': 643.5029, 'train_samples_per_second': 117.608, 'train_steps_per_second': 1.846, 'train_loss': 0.05119953726603327, 'epoch': 27.0}


Model weights saved in ../../results_2022-12-27/pytorch_model.bin


In [23]:
# Set the model's `name_or_path` attribute to the base checkpoint id.
# NOTE(review): presumably done so that a pushed model refers to the base
# checkpoint rather than a local directory — confirm.
model.name_or_path = "cl-tohoku/bert-base-japanese-whole-word-masking"

# The push calls are kept commented out so that re-running the notebook does
# not upload anything; the recorded output of this cell stems from a run in
# which they were enabled.
# # To push:
# model.push_to_hub(
#     "abctreebank/comparative-NER-with-root",
#     private = True,
#     use_auth_token = True,
# )
# tokenizer.push_to_hub(
#     "abctreebank/comparative-NER-with-root",
#     private = True,
#     use_auth_token = True,
# )

Configuration saved in /tmp/tmpifit4vw4/config.json
Model weights saved in /tmp/tmpifit4vw4/pytorch_model.bin
Uploading the following files to abctreebank/comparative-NER-with-root: config.json,pytorch_model.bin
tokenizer config file saved in /tmp/tmp8v3h2m_g/tokenizer_config.json
Special tokens file saved in /tmp/tmp8v3h2m_g/special_tokens_map.json
Uploading the following files to abctreebank/comparative-NER-with-root: vocab.txt,special_tokens_map.json,tokenizer_config.json


CommitInfo(commit_url='https://huggingface.co/abctreebank/comparative-NER-with-root/commit/9cc8fa8cd2b69169be881020fc630acecf57ed3f', commit_message='Upload tokenizer', commit_description='', oid='9cc8fa8cd2b69169be881020fc630acecf57ed3f', pr_url=None, pr_revision=None, pr_num=None)

## Evaluating

In [9]:
# Evaluate the model that the training section saved.
SAVED_PATH = OUTPUT_PATH

# Use the GPU when one is available; otherwise stay on the CPU instead of
# raising inside `.cuda()` on a CUDA-less machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForTokenClassification.from_pretrained(
    SAVED_PATH,
).to(device)

loading configuration file ../../results_2022-12-27/config.json
Model config BertConfig {
  "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "deg",
    "2": "prej",
    "3": "cont",
    "4": "diff",
    "5": "root"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "IGNORE": -100,
    "O": 0,
    "cont": 3,
    "deg": 1,
    "diff": 4,
    "prej": 2,
    "root": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 

In [10]:
def _encode_test_batch(batch):
    """Pass one batch through `nwr.convert_annotation_entries_to_matrices` with `return_type = "pt"`."""
    return nwr.convert_annotation_entries_to_matrices(
        batch,
        return_type = "pt",
    )


# No columns are removed here: the original ones (e.g. "ID") are read again
# further down.
ds_test = dataset_raw["test"].map(
    _encode_test_batch,
    batch_size = BATCH_SIZE,
    batched = True,
)

  0%|          | 0/11 [00:00<?, ?ba/s]

In [11]:
def _predict(
    examples: datasets.arrow_dataset.Batch
):
    """
    Run the notebook-level `model` on one batch and attach the predicted label ids.

    Arguments
    ---------
    examples
        A batch providing "input_ids", "attention_mask" and "token_type_ids".
        Each is passed to `torch.tensor` as-is, so the sequences are assumed
        to be rectangular (equal length within the batch) — TODO confirm
        against the converter that produces them.

    Returns
    -------
    The same `examples` object, extended with "label_ids_predicted":
    a numpy array holding the argmax of the logits along axis 2.
    """
    # Follow whatever device the model lives on instead of hard-coding CUDA.
    device = model.device

    # Pure inference: without `no_grad` every batch would build an autograd
    # graph that is never used.
    with torch.no_grad():
        # Call the module itself (not `.forward`) so that registered hooks run.
        predictions_raw = model(
            input_ids = torch.tensor(examples["input_ids"], device = device),
            attention_mask = torch.tensor(examples["attention_mask"], device = device),
            token_type_ids = torch.tensor(examples["token_type_ids"], device = device),
            return_dict = True,
        )

    examples["label_ids_predicted"] = (
        predictions_raw.logits
        .argmax(dim = 2)
        .detach()
        .cpu()
        .numpy()
    )

    return examples
# === END ===

def _predict_and_annotate(batch):
    """
    Predict label ids for `batch`, then run the project's
    `convert_predictions_to_annotations` twice: once on the predicted ids
    (result stored under "comp_predicted") and once on the reference ids
    (result stored under "comp_subword_aligned").
    """
    with_predicted_spans = nwr.convert_predictions_to_annotations(
        _predict(batch),
        label_ids_key = "label_ids_predicted",
        comp_key = "comp_predicted",
    )
    return nwr.convert_predictions_to_annotations(
        with_predicted_spans,
        label_ids_key = "label_ids",
        comp_key = "comp_subword_aligned",
    )


ds_test = ds_test.map(
    _predict_and_annotate,
    batch_size = BATCH_SIZE,
    batched = True,
)

  0%|          | 0/11 [00:00<?, ?ba/s]

In [12]:
# Feed the whole test set to the project's metric in a single batch.
metric = nwr.NERWithRootMetrics()
label_ids_predicted = ds_test["label_ids_predicted"]
label_ids_reference = ds_test["label_ids"]
metric.add_batch(
    references = label_ids_reference,
    predictions = label_ids_predicted,
)
metric_result = metric.compute()

# Keep the "alignments" entry of the metric result next to the entries it
# belongs to, as a new dataset column.
alignments = metric_result["alignments"]
ds_test_with_alignments = ds_test.add_column("alignments", alignments)

In [13]:
def _linearize_comp(
    batch: datasets.arrow_dataset.Batch
) -> datasets.arrow_dataset.Batch:
    """
    Add linearised reference and predicted annotations to a batch.

    For every entry, the subword tokens are cut off at the first "[SEP]" or
    "[PAD]" (that token and everything after it is dropped). The remaining
    tokens are passed to `aoc.linearize_annotations` together with the
    entry's "comp_subword_aligned" (reference) and "comp_predicted"
    (prediction) annotations.

    Arguments
    ---------
    batch
        A batch with the columns "token_subwords", "comp_subword_aligned"
        and "comp_predicted". All columns are assumed to have the same
        number of entries.

    Returns
    -------
    The same `batch` object, extended with the columns "reference_linear"
    and "prediction_linear".
    """
    stop_tokens = ("[SEP]", "[PAD]")

    ls_reference_linear = []
    ls_prediction_linear = []

    # Walk the parallel columns together instead of indexing each one by position.
    for subwords, comp_reference, comp_predicted in zip(
        batch["token_subwords"],
        batch["comp_subword_aligned"],
        batch["comp_predicted"],
    ):
        tokens = tuple(
            itertools.takewhile(
                lambda t: t not in stop_tokens,
                subwords,
            )
        )

        ls_reference_linear.append(
            aoc.linearize_annotations(tokens, comp_reference)
        )
        ls_prediction_linear.append(
            aoc.linearize_annotations(tokens, comp_predicted)
        )

    batch["reference_linear"] = ls_reference_linear
    batch["prediction_linear"] = ls_prediction_linear

    return batch

# Attach the linearised reference / prediction to every test entry.
ds_test_with_alignments = ds_test_with_alignments.map(
    _linearize_comp,
    batch_size = BATCH_SIZE,
    batched = True,
)

# Only these columns go into the dump; every other column is dropped.
columns_kept = {
    "ID",
    "prediction_linear",
    "reference_linear",
    "alignments",
}
columns_dropped = [
    name
    for name in ds_test_with_alignments.column_names
    if name not in columns_kept
]
ds_test_dump = ds_test_with_alignments.remove_columns(columns_dropped)

  0%|          | 0/11 [00:00<?, ?ba/s]

In [14]:
yaml = ruamel.yaml.YAML()
# The dumped entries presumably contain Japanese text (the corpus is
# processed with a Japanese BERT tokenizer). Pin UTF-8 so the result does not
# depend on the locale's default encoding.
with open("./result.yaml", "w", encoding = "utf-8") as f:
    yaml.dump(
        list(ds_test_dump),
        f
    )

In [15]:
# Replace every key of the span-wise scores by its `str()` form; the values
# are carried over unchanged.
# NOTE(review): presumably needed so the scores can be dumped to YAML — confirm.
metric_result["scores_spanwise"] = dict(
    (str(span_label), span_scores)
    for span_label, span_scores in metric_result["scores_spanwise"].items()
)

In [16]:
# Display the value stored under "F1_partial_average" in the metric result.
metric_result["F1_partial_average"]

0.7828014030294053

In [17]:
# Display the value stored under "F1_strict_average" in the metric result.
metric_result["F1_strict_average"]


0.6797199253987477

In [18]:
# One stanza per label: the label, its "F1_partial" value, its "F1_strict"
# value, then an empty line.
for span_label, span_scores in metric_result["scores_spanwise"].items():
    print(
        span_label,
        span_scores["F1_partial"],
        span_scores["F1_strict"],
        "",
        sep = "\n",
    )

root
0.6705882352941176
0.4823529411764706

cont
0.6948356807511737
0.5586854460093897

prej
0.8886956521739129
0.8521739130434782

diff
0.8362573099415204
0.7485380116959065

deg
0.8236301369863015
0.7568493150684932



In [19]:
# Everything in the metric result except the "alignments" entry goes into
# the score file.
scores_without_alignments = {
    key : value
    for key, value in metric_result.items()
    if key != "alignments"
}

yaml = ruamel.yaml.YAML()
with open("scores.yaml", "w") as g:
    yaml.dump(scores_without_alignments, stream = g)