In [41]:
import csv
import re
import json
import itertools

import datasets

import abctk.obj.comparative as aoc

In [17]:
# Location of the linearized comparative annotation file that this notebook reads.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
ANNOT_FILE_PATH = "/home/owner/ABCT/comp-proto/comparative-annotation_linearized_2023-02-14.txt"

In [14]:
# Fetch the previously published dataset from the HF repo, at a fixed revision.
dataset_raw = datasets.load_dataset(
    "abctreebank/comparative-NER-BCCWJ",
    revision="e3cdaf016f1fba88d10194500c313f951b0d2df3",
    use_auth_token=True,
)

# Handles on both splits. The previous dataset is only needed further down
# to carry over the existing comments and to reproduce the train/test split.
dataset_train, dataset_test = dataset_raw["train"], dataset_raw["test"]

Using custom data configuration default-935290dee194d9be
Found cached dataset parquet (/home/owner/.cache/huggingface/datasets/abctreebank___parquet/default-935290dee194d9be/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [30]:
# Build an ID -> record lookup over the whole previous dataset
# (train and test concatenated). If an ID occurred twice, the later record wins.
dataset_indexed: dict[str, dict] = {
    item["ID"]: item
    for item in datasets.concatenate_datasets([dataset_train, dataset_test])
}

In [47]:
# Load the current annotation file.
# The encoding is given explicitly: without it, open() falls back to the
# locale's preferred encoding, so the same file could be decoded differently
# (or fail to decode) depending on the machine.
# NOTE(review): assumes the annotation file is UTF-8 encoded; confirm.
with open(ANNOT_FILE_PATH, encoding="utf-8") as annot_file:
    # Materialize all records while the file is still open.
    annots = tuple(aoc.read_bracket_annotation_file(annot_file))

In [48]:
# Incorporate the comments of the previous dataset into the current annotations.
for annot_record in annots:
    # The previous dataset is indexed by the original (v1) ID. Once the
    # "Change IDs" cell has run, "ID" holds the new-style ID and the original
    # one lives in "ID_v1", so prefer that key to keep this cell re-runnable.
    current_ID = annot_record.get("ID_v1", annot_record["ID"])
    record_in_dataset = dataset_indexed[current_ID]

    # Merge both comment lists and drop duplicates. dict.fromkeys keeps the
    # first-seen order, so the result is the same on every run (a set would
    # yield an order that varies with string hash randomization).
    # `or []` tolerates a comments field that is missing or None.
    annot_record["comments"] = list(
        dict.fromkeys(
            itertools.chain(
                annot_record.get("comments") or [],
                record_in_dataset.get("comments") or [],
            )
        )
    )

In [49]:
# Load the newly found IDs.
# newline="" is what the csv module asks for when opening files, so that
# line endings embedded in quoted fields are handled by the reader itself.
# NOTE(review): assumes the mapping file is UTF-8 (or plain ASCII); confirm.
with open("./ID-mapping.csv", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(
        f,
        fieldnames = [
            "ID_current",
            "found_bccwj_file",
            "found_bccwj_start",
            "correct_bccwj_file",
            "correct_bccwj_start"
        ],
        dialect="excel",
    )

    # The column names are supplied above, so the first row of the file is
    # read and discarded. The None default keeps an empty file from raising
    # StopIteration; it simply yields an empty mapping.
    next(reader, None)

    # Index the rows by the current (v1) ID; a repeated ID keeps its last row.
    mapping: dict[str, dict] = {
        row["ID_current"]: row
        for row in reader
    }

In [50]:
_RE_kurabe = re.compile(r"kurabe")

def generate_new_ID(ID_current: str, bccwj_file, bccwj_start)-> str:
    """
    Build a new-style ID string from an old ID and a BCCWJ location.

    Parameters
    ----------
    ID_current
        The existing ID. It is only inspected for the substring ``kurabe``,
        which selects the kind ``kurabe``; any other ID gets the kind ``yori``.
    bccwj_file
        The BCCWJ file part of the location.
    bccwj_start
        The start part of the location.

    Returns
    -------
    str
        ``ABCT-COMP-BCCWJ;<kind>;<file>,<start>``. If either location part is
        falsy (e.g. empty string or ``None``), both parts are written as
        ``UNKNOWN``.
    """
    kind = "yori"
    if _RE_kurabe.search(ID_current):
        kind = "kurabe"

    # A location is only usable when both of its parts are present.
    if not (bccwj_file and bccwj_start):
        return f"ABCT-COMP-BCCWJ;{kind};UNKNOWN,UNKNOWN"

    return f"ABCT-COMP-BCCWJ;{kind};{bccwj_file},{bccwj_start}"

In [51]:
# Change IDs: replace each record's "ID" with a new-style ID and keep the
# original one under "ID_v1".
for annot_record in annots:
    # Take the original ID from "ID_v1" when it is already there. On a second
    # run of this cell "ID" holds the new-style ID, which is not a key of
    # `mapping`, so reading "ID" again would overwrite "ID_v1" and then fail.
    current_ID = annot_record.get("ID_v1", annot_record["ID"])
    annot_record["ID_v1"] = current_ID
    mapped_row = mapping[current_ID]

    # Use the "correct_*" columns when both of them are filled in;
    # otherwise fall back to the "found_*" columns.
    column_prefix = (
        "correct"
        if mapped_row["correct_bccwj_file"] and mapped_row["correct_bccwj_start"]
        else "found"
    )

    annot_record["ID"] = generate_new_ID(
        current_ID,
        mapped_row[f"{column_prefix}_bccwj_file"],
        mapped_row[f"{column_prefix}_bccwj_start"],
    )

In [54]:
# IDs of the sentences that belong to the test split of the previous dataset
test_IDs = {record["ID"] for record in dataset_test}

def _in_test_split(annot_record) -> bool:
    """Tell whether a record's original (v1) ID is among the test IDs."""
    return annot_record["ID_v1"] in test_IDs

# Partition the annotations: everything outside the test split is training data
annots_train = tuple(
    annot_record
    for annot_record in annots
    if not _in_test_split(annot_record)
)

annots_test = tuple(
    annot_record
    for annot_record in annots
    if _in_test_split(annot_record)
)

In [55]:
def _write_jsonl(records, path: str) -> None:
    """
    Write records to a file in JSON Lines form, one JSON document per line.

    Parameters
    ----------
    records
        An iterable of JSON-serializable records.
    path
        The file to create or overwrite.
    """
    # Non-ASCII characters are written as-is (ensure_ascii=False), so the
    # encoding is fixed to UTF-8 instead of depending on the machine's locale.
    with open(path, "w", encoding="utf-8") as f_out:
        for record in records:
            json.dump(record, f_out, ensure_ascii = False)
            f_out.write("\n")

_write_jsonl(annots_train, "train.jsonl")
_write_jsonl(annots_test, "test.jsonl")

Move the two generated JSONL files into the dataset folder, then push the dataset to the Hugging Face Hub.

```sh
mv train.jsonl test.jsonl ../comparative-NER-BCCWJ/
abct-comp-ner-utils upload-data --private ./comparative-NER-BCCWJ
```

WARNING: do not stage any annotation data in this repo; the data must be kept private.