In [10]:
"""Various helpers."""

__author__ = "Matteo Romanello"
__email__ = "matteo.romanello@unil.ch"
__organisation__ = "UNIL, ASA"
__status__ = "development"

import logging
from pathlib import Path
from collections import OrderedDict
from typing import NamedTuple, List, Dict
from cassis import Cas, load_cas_from_xmi, load_typesystem

# Entity labels that count as bibliographic references. `read_xmi` copies an
# entity's value into its "entity_biblio" field only when the value is listed
# here; otherwise that field is None.
BIBLIO_ENTITIES = [
    "primary-full",
    "primary-partial",
    "secondary-full",
    "secondary-partial",
    "secondary-meta"
]

# Hyphen-like characters (not referenced by the code visible in this file).
# NOTE(review): the last two entries render identically here — confirm whether
# they are meant to be distinct code points or are an accidental duplicate.
HYPHENS = ["—", "⸗", "-", "-"]

class AjmcDocument(NamedTuple):
    """Immutable record holding everything `read_xmi` extracts from one document.

    Field names and order are part of the public interface: instances are
    built positionally as well as by keyword.
    """

    # Document identifier: the file name up to its first ".".
    id: str
    # Base name of the XMI file.
    filename: str
    # Path of the XMI file, as a string.
    filepath: str
    # Gold sentences keyed by the sentence annotation's xmiID.
    sentences: dict
    # Named-entity mentions keyed by the entity annotation's xmiID.
    mentions: dict
    # One dict per hyphenation annotation (id, offsets, surface).
    hyphenated_words: list
    # Entity links keyed by the entity annotation's xmiID. This is a dict
    # (not a list): `read_xmi` stores one link record per entity id.
    links: dict
    # Full document text (the CAS sofa string).
    text: str
    # One dict per image-link annotation (offsets, surface, bbox).
    images_links: List[dict]


def _parse_image_bbox(link: str) -> List[int]:
    """Return the integers found in the fourth-from-last path segment of ``link``.

    Leading/trailing slashes are ignored; the selected segment is split on
    commas and every piece is converted with ``int``.

    NOTE(review): this looks like the region component of an IIIF image URL
    (x,y,w,h) — confirm against the annotation guidelines.

    :raises AttributeError: if ``link`` is not a string (e.g. ``None``).
    :raises IndexError: if ``link`` has fewer than four path segments.
    :raises ValueError: if a piece of the segment is not an integer.
    """
    region = link.strip("/").split("/")[-4]
    return [int(value) for value in region.split(",")]


def read_xmi(xmi_file: str, xml_file: str, sanity_check: bool = True) -> AjmcDocument:
    """Parse CAS/XMI document.

    Reads hyphenation annotations, gold sentences with their tokens, named
    entities (with their links) and image links. Annotations that cannot be
    read are logged at error level and skipped, so a single bad annotation
    does not abort the whole document.

    :param str xmi_file: path to xmi_file.
    :param str xml_file: path to xml schema file.
    :param bool sanity_check: Perform annotation-independent sanity check.
        Currently ignored: the check is disabled in this version; the
        parameter is kept so existing callers keep working.
    :return: A namedtuple with all the annotation information.
    :rtype: AjmcDocument

    """

    ne_type = "webanno.custom.AjMCNamedEntity"
    sentence_type = "webanno.custom.GoldSentences"
    hyphenation_type = "webanno.custom.GoldHyphenation"
    token_type = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"
    image_link_type = "webanno.custom.AjMCImages"

    f_xmi = Path(xmi_file)
    filename = f_xmi.name
    filepath = str(f_xmi)
    # Everything before the first "." of the file name.
    docid = filename.split(".")[0]

    with open(xml_file, "rb") as f:
        typesystem = load_typesystem(f)

    with open(xmi_file, "rb") as f:
        cas = load_cas_from_xmi(f, typesystem=typesystem)

    # Spaces are removed from the covered text of each hyphenated word.
    hyphenated_words = [
        {
            "id": hyphenation.xmiID,
            "start_offset": hyphenation.begin,
            "end_offset": hyphenation.end,
            "surface": hyphenation.get_covered_text().replace(" ", ""),
        }
        for hyphenation in cas.select(hyphenation_type)
    ]

    # read in the tokens from golden sentences
    segments = OrderedDict()
    for seg in cas.select(sentence_type):
        tokens = []
        for tok in cas.select_covered(token_type, seg):
            surface = tok.get_covered_text()
            # ignore empty tokens
            if not surface:
                continue
            try:
                tokens.append({
                    "id": tok.xmiID,
                    "ann_layer": token_type,
                    "start_offset": tok.begin,
                    "end_offset": tok.end,
                    "surface": surface,
                    "segment_id": seg.xmiID,
                })
            except Exception:
                msg = f"Problem with token annotation {tok.xmiID} in {xmi_file}"
                logging.error(msg)

        segments[seg.xmiID] = {
            "segment_id": seg.xmiID,
            "start_offset": seg.begin,
            "end_offset": seg.end,
            "tokens": tokens,
            "corrupted": seg.corrupted,
            "incomplete_continuing": seg.incomplete_continuing,
            "incomplete_truncated": seg.incomplete_truncated,
        }

    # read in the named entities
    mentions = OrderedDict()
    links = {}
    for i, ent in enumerate(cas.select(ne_type)):
        try:
            # Explicit check instead of `assert`, which is stripped under -O.
            if ent.value is None:
                raise ValueError(f"Entity annotation {ent.xmiID} has no value")

            entity = {
                "id": ent.xmiID,
                "id_cont": i,
                "ann_layer": ne_type,
                "entity_fine": ent.value,
                # Part of the label before the first "." (whole label if none).
                "entity_coarse": ent.value.split(".")[0],
                "entity_biblio": ent.value if ent.value in BIBLIO_ENTITIES else None,
                "start_offset": ent.begin,
                "end_offset": ent.end,
                "literal": "true",  # we don't have metonymy, so...
                "surface": ent.get_covered_text().replace("\n", ""),
                "noisy_ocr": ent.noisy_ocr,
                "transcript": ent.transcript,
            }

            # NOTE: the Levenshtein computation is disabled in this version,
            # so "levenshtein_norm" is left unset whenever a transcript is
            # present (noisy or not), even though the second log message
            # below still says it is computed.
            if entity["noisy_ocr"]:
                if not entity["transcript"]:
                    msg = f"Transcript for noisy entity {entity['surface']} is missing in {xmi_file}. Levenshtein distance cannot be computed and is set to 0."
                    logging.error(msg)
                    entity["levenshtein_norm"] = 0
            elif entity["transcript"]:
                msg = f"Transcript for entity {entity['surface']} is present in {xmi_file}, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless."
                logging.error(msg)
            else:
                entity["levenshtein_norm"] = 0

            mentions[ent.xmiID] = entity

            # read in the impresso links of named entity
            links[ent.xmiID] = {
                "entity_id": ent.xmiID,
                "is_NIL": ent.is_NIL == "true",
                "wikidata_id": ent.wikidata_id,
            }

        except Exception as e:
            msg = f"Problem with entity annotation {ent.xmiID} in {xmi_file}"
            logging.error(e)
            logging.error(msg)

    # read in the image links; a malformed link is logged and skipped, in
    # line with how problematic tokens and entities are handled above.
    bboxes = []
    for img_link in cas.select(image_link_type):
        try:
            bbox = _parse_image_bbox(img_link.link)
        except (AttributeError, IndexError, ValueError) as e:
            msg = f"Problem with image link annotation {img_link.xmiID} in {xmi_file}"
            logging.error(e)
            logging.error(msg)
            continue

        bboxes.append({
            'start_offset': img_link.begin,
            'end_offset': img_link.end,
            'surface': img_link.get_covered_text(),
            'bbox': bbox,
        })

    return AjmcDocument(
        id=docid,
        filename=filename,
        filepath=filepath,
        sentences=segments,
        mentions=mentions,
        hyphenated_words=hyphenated_words,
        links=links,
        text=cas.sofa_string,
        images_links=bboxes,
    )


In [11]:
# Parse one curated page of the corpus together with the project type system.
# The paths are machine-specific; adjust them to the local checkout.
doc = read_xmi(
    xmi_file='/Users/sven/packages/AjMC-NE-corpus/data/preparation/corpus/de/curated/Wecklein1894_0006.xmi',
    xml_file='/Users/sven/packages/AjMC-NE-corpus/data/preparation/TypeSystem.xml',
)


In [12]:
# Display the image-link records of the parsed document: one dict per
# annotation with its character offsets, covered text and parsed bbox.
doc.images_links

[{'start_offset': 0,
  'end_offset': 3,
  'surface': 'Der',
  'bbox': [479, 1015, 126, 67]},
 {'start_offset': 0,
  'end_offset': 51,
  'surface': 'Der gewaltige Aias, der Sohn des Telamon, Enkel des',
  'bbox': [479, 1009, 2043, 90]},
 {'start_offset': 4,
  'end_offset': 13,
  'surface': 'gewaltige',
  'bbox': [656, 1016, 313, 83]},
 {'start_offset': 14,
  'end_offset': 19,
  'surface': 'Aias,',
  'bbox': [1017, 1011, 180, 80]},
 {'start_offset': 20,
  'end_offset': 23,
  'surface': 'der',
  'bbox': [1248, 1015, 100, 64]},
 {'start_offset': 24,
  'end_offset': 28,
  'surface': 'Sohn',
  'bbox': [1397, 1012, 190, 85]},
 {'start_offset': 29,
  'end_offset': 32,
  'surface': 'des',
  'bbox': [1638, 1013, 105, 65]},
 {'start_offset': 33,
  'end_offset': 41,
  'surface': 'Telamon,',
  'bbox': [1791, 1009, 335, 80]},
 {'start_offset': 42,
  'end_offset': 47,
  'surface': 'Enkel',
  'bbox': [2180, 1011, 188, 66]},
 {'start_offset': 48,
  'end_offset': 51,
  'surface': 'des',
  'bbox': [2417,