## Imports

In [46]:
import sys
import pandas as pd
sys.path.append('lib/')
from ajmc_utils import AjmcDocument, read_xmi
from convert_xmi2clef_format import index_inception_files

## Functions

In [67]:
def extract_entity_info(doc: AjmcDocument, lang: str):
    entities = []
    for mention in doc.mentions.values():
        if (
            "-full" not in mention['entity_fine'] and
            "-partial" not in mention['entity_fine'] and
            mention['entity_fine'] != "scope"
            ):
            entity_linking_info = doc.links[mention['id']]
            entities.append({
                "surface": mention['transcript'] if mention['transcript'] else mention['surface'],
                "ocr_transcript": mention['surface'],
                "gold_transcript": mention['transcript'],
                "noisy_entity": False if mention['levenshtein_norm'] == 0 else True,
                "document_id": doc.id,
                "language": lang,
                "entity_fine_type": mention['entity_fine'],
                "wikidata_id": entity_linking_info['wikidata_id'] if not entity_linking_info['is_NIL'] else "NIL"
            })
    return entities

## Paths

In [27]:
paths = {
    "de": "data/preparation/corpus/de/retokenized/",
    "en": "data/preparation/corpus/en/retokenized/",
    "fr": "data/preparation/corpus/fr/retokenized/"
}

## Logic

- go to data/preparation and index inception files
- read files into `AjmcDocument` with `read_xmi`
- use a function to extract info about entities
- load entities into a dataframe
- filter only the types we are interested in
- do a groupby on surface + QID to find out variations in the assigned entity links

In [68]:
all_entities = []

for language in paths:
    entities_batch = []
    path = paths[language]
    xmi_files = index_inception_files(path)
    print(f"{len(xmi_files)} files in {language} dataset")
    
    for xmi_file in xmi_files:
        doc = read_xmi(xmi_file, xml_file="data/preparation/TypeSystem.xml", sanity_check=False)
        entities_batch += extract_entity_info(doc, lang=language)
    
    print(f"Found {len(entities_batch)} entities in {len(files)} files")
    all_entities += entities_batch
    

106 files in de dataset


ERROR:root:Transcript for entity Καὶ 385 is present in data/preparation/corpus/de/retokenized/Wecklein1894_0036.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity Od. is present in data/preparation/corpus/de/retokenized/Wecklein1894_0050.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity 1 8 48, 13, 10 is present in data/preparation/corpus/de/retokenized/Wecklein1894_0052.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity Od. T. is present in data/preparation/corpus/de/retokenized/Wecklein1894_0059.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity ΕἸ. is present in data/preparation/corpus/de/retokenized/sophokle1v3soph_0057.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERRO

Found 1452 entities in 106 files
86 files in en dataset


ERROR:root:Transcript for entity Electra, Il. 201, 690-5, 1379 foll. is present in data/preparation/corpus/en/retokenized/cu31924087948174_0011.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity Ajax, 11. 855, 966-71, 994, 5 is present in data/preparation/corpus/en/retokenized/cu31924087948174_0011.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity E.onL.§ 13. Ρ. 10 6 is present in data/preparation/corpus/en/retokenized/cu31924087948174_0047.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for noisy entity Hdt. 9 37 is missing in data/preparation/corpus/en/retokenized/cu31924087948174_0055.xmi. Levenshtein distance cannot be computed and is set to 0.
ERROR:root:Transcript for entity L. and 5. is present in data/preparation/corpus/en/retokenized/cu31924087948174_0084.xmi, yet entity is not ma

Found 1620 entities in 106 files
104 files in fr dataset


ERROR:root:Transcript for entity Cy- γορέάϊε is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0057.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity OEdipe Roi is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0061.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity vers 4052 is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0062.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity 4269 is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0063.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity 536 εἰ oo7 is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0063.xmi, yet entity is

ERROR:root:Transcript for entity T. Liv. is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0138.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity TLI, 99 is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0140.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity Ἑατίρίάε is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0143.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity Ale. is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0145.xmi, yet entity is not marked as noisy. Levenshtein distance is computed nevertheless.
ERROR:root:Transcript for entity 11, 8 is present in data/preparation/corpus/fr/retokenized/lestragdiesdeso00tourgoog_0147.xmi, yet entity is not marked a

Found 1446 entities in 106 files


In [69]:
all_entities[:5]

[{'surface': 'Aias',
  'ocr_transcript': 'Aias',
  'gold_transcript': None,
  'noisy_entity': False,
  'document_id': 'Wecklein1894_0006',
  'language': 'de',
  'entity_fine_type': 'pers.myth',
  'wikidata_id': 'http://www.wikidata.org/entity/Q172725'},
 {'surface': 'Telamon',
  'ocr_transcript': 'Telamon',
  'gold_transcript': None,
  'noisy_entity': False,
  'document_id': 'Wecklein1894_0006',
  'language': 'de',
  'entity_fine_type': 'pers.myth',
  'wikidata_id': 'http://www.wikidata.org/entity/Q331327'},
 {'surface': 'Aakos',
  'ocr_transcript': 'Aakos',
  'gold_transcript': None,
  'noisy_entity': False,
  'document_id': 'Wecklein1894_0006',
  'language': 'de',
  'entity_fine_type': 'pers.myth',
  'wikidata_id': 'http://www.wikidata.org/entity/Q206187'},
 {'surface': 'Zeus',
  'ocr_transcript': 'Zeus',
  'gold_transcript': None,
  'noisy_entity': False,
  'document_id': 'Wecklein1894_0006',
  'language': 'de',
  'entity_fine_type': 'pers.myth',
  'wikidata_id': 'http://www.wikidat

In [70]:
len(all_entities)

4518

In [71]:
entities_df = pd.DataFrame(all_entities)

In [72]:
entities_df.shape

(4518, 8)

In [73]:

entities_df

Unnamed: 0,surface,ocr_transcript,gold_transcript,noisy_entity,document_id,language,entity_fine_type,wikidata_id
0,Aias,Aias,,False,Wecklein1894_0006,de,pers.myth,http://www.wikidata.org/entity/Q172725
1,Telamon,Telamon,,False,Wecklein1894_0006,de,pers.myth,http://www.wikidata.org/entity/Q331327
2,Aakos,Aakos,,False,Wecklein1894_0006,de,pers.myth,http://www.wikidata.org/entity/Q206187
3,Zeus,Zeus,,False,Wecklein1894_0006,de,pers.myth,http://www.wikidata.org/entity/Q34201
4,Achilleus,Achilleus,,False,Wecklein1894_0006,de,pers.myth,http://www.wikidata.org/entity/Q41746
...,...,...,...,...,...,...,...,...
4513,Le même,Le même,,False,lestragdiesdeso00tourgoog_0155,fr,pers.author,http://www.wikidata.org/entity/Q18411800
4514,Teucer,Teucer,,False,lestragdiesdeso00tourgoog_0155,fr,pers.myth,http://www.wikidata.org/entity/Q878184
4515,Eschyle,Eschyle,,False,lestragdiesdeso00tourgoog_0155,fr,pers.author,http://www.wikidata.org/entity/Q40939
4516,Agamemnon,Agamem- πον,Agamemnon,True,lestragdiesdeso00tourgoog_0155,fr,work.primlit,http://www.wikidata.org/entity/Q128176


In [74]:
entities_df.entity_fine_type.unique()

array(['pers.myth', 'pers.author', 'work.primlit', 'loc', 'date',
       'pers.other', 'work.fragm', 'pers.editor', 'work.seclit',
       'object.manuscr', 'work.journal', 'work.other', 'secondary-meta'],
      dtype=object)

In [117]:
entities_df.to_csv("all_entities_info.csv", sep="\t")

In [82]:
entities_grouped = entities_df.groupby(['surface', 'entity_fine_type', 'wikidata_id'])

In [115]:
entities_grouped.agg(
    {
        "document_id": lambda x: ", ".join(x.unique()),
        "entity_fine_type": lambda x: ", ".join(x.unique()),
    }
).sort_values(by="surface").to_csv('data_cleaning.csv', sep="\t")