In [2]:
import itertools

import numpy as np
import torch
from transformers import BertForTokenClassification, BertConfig, IntervalStrategy, TrainingArguments, Trainer
import datasets
from datasets.arrow_dataset import Dataset

import ruamel.yaml

import abctk.obj.comparative as aoc

import abct_comp_ner_utils.models.NER_with_root as nwr

tokenizer = nwr.get_tokenizer()

BATCH_SIZE = 20
OUTPUT_PATH = "../../results_2023-01-09"

In [4]:
dataset_raw = datasets.load_dataset(
    "abctreebank/comparative-NER-BCCWJ",
    use_auth_token = True,
    revision = "6c51f916ecd23c32e546a3d4f695c69d8c47e21e",
)

Using custom data configuration default-9ccbc70a477221d0
Found cached dataset parquet (/home/twotrees12/.cache/huggingface/datasets/abctreebank___parquet/default-9ccbc70a477221d0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

## Training

In [5]:
ds_train: Dataset = dataset_raw["train"]
ds_train = ds_train.map(
    lambda E: nwr.convert_annotation_entries_to_matrices(
        E,
        return_type = "pt",
    ),
    batched = True,
    batch_size = BATCH_SIZE,
    remove_columns = ds_train.column_names,
)

  0%|          | 0/156 [00:00<?, ?ba/s]

In [6]:
# train/eval split
ds_train_split = ds_train.train_test_split(test_size = 0.1, shuffle = True)

In [7]:
config = BertConfig.from_pretrained(
    nwr.BERT_MODEL,
    id2label = nwr.ID2LABEL,
    label2id = nwr.LABEL2ID,
)

model = BertForTokenClassification.from_pretrained(
    nwr.BERT_MODEL,
    config = config,
)

training_args = TrainingArguments(
        output_dir = OUTPUT_PATH,
        num_train_epochs = 27,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        learning_rate = 5e-5,
        warmup_steps = 200,
        weight_decay = 0,
        save_strategy = IntervalStrategy.STEPS,
        save_steps = 1000,
        seed = 2630987289,
        logging_dir = f"{OUTPUT_PATH}/logs",
        logging_steps= 10,
    )

trainer = Trainer(
    model_init = lambda: model,
    args = training_args,
    train_dataset = ds_train_split["train"],
)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the m

In [None]:
trainer.train()
trainer.save_state()
trainer.save_model()

In [9]:
model.name_or_path = "cl-tohoku/bert-base-japanese-whole-word-masking"

# To push:
# model.push_to_hub(
#     "abctreebank/comparative-NER-with-root",
#     private = True,
#     use_auth_token = True,
# )
# tokenizer.push_to_hub(
#     "abctreebank/comparative-NER-with-root",
#     private = True,
#     use_auth_token = True,
# )

Configuration saved in /tmp/tmpiq57h0w9/config.json
Model weights saved in /tmp/tmpiq57h0w9/pytorch_model.bin
Uploading the following files to abctreebank/comparative-NER-with-root: config.json,pytorch_model.bin
tokenizer config file saved in /tmp/tmpypgve4j1/tokenizer_config.json
Special tokens file saved in /tmp/tmpypgve4j1/special_tokens_map.json
Uploading the following files to abctreebank/comparative-NER-with-root: tokenizer_config.json,vocab.txt,special_tokens_map.json


CommitInfo(commit_url='https://huggingface.co/abctreebank/comparative-NER-with-root/commit/ed1b1834de445a5fc998839677d6c40872c8ad3c', commit_message='Upload tokenizer', commit_description='', oid='ed1b1834de445a5fc998839677d6c40872c8ad3c', pr_url=None, pr_revision=None, pr_num=None)

## Evaluating

In [10]:
SAVED_PATH = OUTPUT_PATH

model = BertForTokenClassification.from_pretrained(
    SAVED_PATH,
).cuda()

loading configuration file ../../results_2023-01-09/config.json
Model config BertConfig {
  "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "deg",
    "2": "prej",
    "3": "cont",
    "4": "diff",
    "5": "root"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "IGNORE": -100,
    "O": 0,
    "cont": 3,
    "deg": 1,
    "diff": 4,
    "prej": 2,
    "root": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 

In [11]:
ds_test = dataset_raw["test"].map(
    lambda E: nwr.convert_annotation_entries_to_matrices(
        E,
        return_type = "pt",
    ),
    batched = True,
    batch_size = BATCH_SIZE,
)

  0%|          | 0/18 [00:00<?, ?ba/s]

In [12]:
def _predict(
    examples: datasets.arrow_dataset.Batch
):
    predictions_raw = model.forward(
        input_ids = torch.tensor(examples["input_ids"]).cuda(),
        attention_mask = torch.tensor(examples["attention_mask"]).cuda(),
        token_type_ids  = torch.tensor(examples["token_type_ids"]).cuda(),
        return_dict = True,
    )

    examples["label_ids_predicted"] = (
        predictions_raw.logits
        .argmax(dim = 2,)
        .detach()
        .cpu()
        .numpy()
    )

    return examples
# === END ===

ds_test = ds_test.map(
    lambda e: (
        nwr.convert_predictions_to_annotations(
            nwr.convert_predictions_to_annotations(
                _predict(e),
                label_ids_key = "label_ids_predicted",
                comp_key = "comp_predicted",
            ),
            label_ids_key = "label_ids",
            comp_key = "comp_subword_aligned",
        )
    ),
    batched = True,
    batch_size = BATCH_SIZE,
)

  0%|          | 0/18 [00:00<?, ?ba/s]

In [13]:
metric = nwr.NERWithRootMetrics()
metric.add_batch(
    predictions = ds_test["label_ids_predicted"],
    references = ds_test["label_ids"],
)
metric_result = metric.compute()
ds_test_with_alignments = ds_test.add_column(
    "errors",
    [
        [
            aoc.MatchSpanResult(jdg).name for _, jdg in itertools.chain(
                res.map_pred_to_ref,
                res.map_ref_to_pred
            )
            if jdg != aoc.MatchSpanResult.CORRECT
        ]
        for res in metric_result["alignments"]
    ]
)

In [14]:
def _linearize_comp(
    batch: datasets.arrow_dataset.Batch
) -> datasets.arrow_dataset.Batch:
    ls_reference_linear = []
    ls_prediction_linear = []
    batch_size = len(batch["ID"])

    for i in range(batch_size):
        ID = batch["ID"][i]
        tokens = tuple(
                    itertools.takewhile(
                lambda t: t not in ("[SEP]", "[PAD]"),
                batch["token_subwords"][i]
            )
        )

        ls_reference_linear.append(
            aoc.linearize_annotations(
                tokens,
                batch["comp_subword_aligned"][i],
            )
        )
        ls_prediction_linear.append(
            aoc.linearize_annotations(
                tokens,
                batch["comp_predicted"][i],
            )
        )

    batch["reference_linear"] = ls_reference_linear
    batch["prediction_linear"] = ls_prediction_linear

    return batch

ds_test_with_alignments = ds_test_with_alignments.map(
    _linearize_comp,
    batched = True,
    batch_size = BATCH_SIZE,
)

ds_test_dump = ds_test_with_alignments.remove_columns(
    [
        col for col in ds_test_with_alignments.column_names
        if col not in (
            "ID",
            "prediction_linear",
            "reference_linear",
            "alignments",
            "errors",
        )
    ]
)

  0%|          | 0/18 [00:00<?, ?ba/s]

In [15]:
yaml = ruamel.yaml.YAML()
with open("./result.yaml", "w") as f:
    yaml.dump(list(ds_test_dump), f)

In [16]:
metric_result["scores_spanwise"] = { 
    str(k) : v
    for k, v in metric_result["scores_spanwise"].items()
}

In [17]:
metric_result["F1_partial_average"]

0.8025059415015681

In [18]:
metric_result["F1_strict_average"]


0.702534994867713

In [19]:
for label, res in metric_result["scores_spanwise"].items():
    print(label)
    print(res["F1_partial"])
    print(res["F1_strict"])
    print()

root
0.7051070840197694
0.5205930807248764

cont
0.7308707124010554
0.6015831134564643

prej
0.905123339658444
0.8690702087286528

diff
0.8392857142857143
0.7499999999999999

deg
0.8321428571428571
0.7714285714285715



In [20]:
yaml = ruamel.yaml.YAML()
with open("scores.yaml", "w") as g:
    yaml.dump(
        {
            k : v for k, v in metric_result.items()
            if k != "alignments"
        },
        stream = g,
    )