In [26]:
import pandas as pd
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
import os
from tqdm import tqdm

from ddp_util import decompose_chatomid
from ddp_util import chatomid_to_url

# Helpers

In [None]:
def explode_columns(dataframe, columns=None):
    """Explode list-like cells so that every row holds a single value per column.

    Each requested column is passed through ``DataFrame.explode`` one after
    the other, so rows are multiplied for every list-like cell encountered.

    Args:
        dataframe: The pandas DataFrame to explode.
        columns: Column labels to explode, in order. When ``None`` every
            column of ``dataframe`` is exploded.

    Returns:
        The exploded DataFrame. The original row labels are kept, so the
        index may contain repeated values.
    """
    targets = dataframe.keys().to_list() if columns is None else columns
    exploded = dataframe
    for name in targets:
        exploded = exploded.explode(name)
    return exploded

def get_cei_date_value(row):
    """Pick the first available date value of a charter row.

    The lookup order is ``cei_date_ATTRIBUTE_value``, then
    ``cei_dateRange_ATTRIBUTE_from``, then ``cei_dateRange_ATTRIBUTE_to``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing all three keys.

    Returns:
        The first of the three values that is not missing according to
        ``pd.notna``, or ``None`` when all three are missing.
    """
    # All three keys are read up front, so a missing key raises regardless
    # of which value ends up being selected.
    candidates = [
        row["cei_date_ATTRIBUTE_value"],
        row["cei_dateRange_ATTRIBUTE_from"],
        row["cei_dateRange_ATTRIBUTE_to"],
    ]
    for candidate in candidates:
        if pd.notna(candidate):
            return candidate
    return None

def is_valid_date(date_value):
    """Tell whether ``date_value`` is a string free of the known placeholder patterns.

    Args:
        date_value: Candidate date; anything that is not a ``str`` is rejected.

    Returns:
        ``True`` for strings containing neither ``"9999"`` nor ``"010101"``,
        ``False`` otherwise.
    """
    if not isinstance(date_value, str):
        return False
    # NOTE(review): these are substring checks, so a value such as "10101015"
    # or "12010101" is rejected as well -- confirm that this is intended.
    placeholder_markers = ("9999", "010101")
    return not any(marker in date_value for marker in placeholder_markers)

# Load

In [68]:
# Load the charter export (JSON) into the raw dataframe that the cells below build on.
df_raw = pd.read_json("../../data/output/charters_full_2022-11-22-1044.json")

In [69]:
# Transcriptions produced by an XQuery export; every row loaded here is
# flagged with google_ocr=True.
df_transcriptions = (
    pd.read_xml("../../data/in/latin-ocr-transcriptions/latin_tenor_result.xml")
    .set_axis(["atom_id", "tenor"], axis="columns")
    .assign(google_ocr=True)
)

# Prepare main

In [71]:
# Flatten the id and date columns to one value per row, then drop every row
# whose atom id contains the substring "cheim".
df_prep = explode_columns(
    df_raw,
    columns=[
        "atom_id",
        "cei_date_ATTRIBUTE_value",
        "cei_dateRange_ATTRIBUTE_from",
        "cei_dateRange_ATTRIBUTE_to",
    ],
).loc[lambda frame: ~frame["atom_id"].astype(str).str.contains("cheim")]

# Which spellings of "Latin" occur in the CEI language field?

In [None]:
# Count how often each language label occurs (one label per row after exploding).
lang_list_prep = explode_columns(df_raw, ["cei_lang_MOM"])
# value_counts() already orders the labels by descending frequency, so no
# additional sort is required.
lang_list = lang_list_prep["cei_lang_MOM"].value_counts().to_frame()
# The labels live in the index; reading them from there avoids depending on
# the column names produced by reset_index(), which differ between pandas
# versions ("index" before 2.0, the series name afterwards).
lang_list.index.to_list()

In [None]:
# Language labels that are treated as Latin. Matching is exact, so case and
# punctuation variants are listed individually.
# NOTE(review): not referenced by any other cell visible in this notebook.
included_languages = [
    "Latein", "latein",
    "Lat.", "lat.",
    "latinsky",
    "latinski",
    "Latin", "latin",
    "latinščina",
]

['Deutsch', 'Latin', 'dt.', 'Latein', 'lat.', 'latinsky', 'latinský', 'německy', 'deutsch', 'česky', 'nem.', 'ger', 'nemški', 'lateinisch', 'latein.', 'nemščina', 'lat. ', 'němčina', 'český', 'Német', 'nem. ', 'lat', 'latina', 'německý', 'latinščina', 'latinski', 'niederdeutsch', 'latein', 'Niederdeutsch', 'čeština', 'russisch', 'schwedisch', 'Ndt.', 'Latino', 'Lateinisch', 'Latin.', 'Hochdeutsch', 'Hdt.', 'čes.', 'dt', 'D', 'französisch', 'LAT', 'nemecký', 'j. łaciński', 'latinsky/česky', 'dt. und lat.', 'Deutsch [?]', 'česky, německy', 'talijanski i latinski', ' Latein', 'plattdeutsch', 'Französisch', 'Altserbisch.', 'francouzsky', 'lat. u. dt.', 'latinšcina', 'latinski i talijanski', 'Slavic.', 'German', 'Olasz', 'CZ', 'Latină', 'lateinisch und deutsch', 'italsky', 'latinsky, německy', 'fre', 'nemšcina', 'lat., dt.', 'něm.', 'dt. u. lat.', 'česky, latinsky', 'englisch', 'slov.', 'Deutsch, Latein', 'latinsky, (italsky)', 'německy, česky', 'Perg.', 'Français', 'F', 'română (grafie chi

# Which collections contain OCR material?

In [72]:
# XML namespace prefixes used by the XPath queries below.
namespaces = dict(
    atom="http://www.w3.org/2005/Atom",
    cei="http://www.monasterium.net/NS/cei",
    xrx="http://www.monasterium.net/NS/xrx",
)
# Root directory that is scanned recursively for collection metadata files.
directoryPath = "../../data/db/mom-data/metadata.collection.public"
# Only files whose name ends with this suffix are picked up.
fileExtension = ".cei.xml"

In [73]:
def get_file_paths(directory, extension=None):
    """Recursively yield every file below ``directory`` whose name ends with ``extension``.

    Args:
        directory: Directory to scan (anything ``os.scandir`` accepts).
        extension: File-name suffix to match. When omitted, the notebook-level
            ``fileExtension`` setting is used.

    Yields:
        pathlib.Path: One path per matching file, in the order reported by
        ``os.scandir``.
    """
    if extension is None:
        extension = fileExtension
    # The context manager releases the directory handle as soon as the scan
    # finishes or the generator is closed, instead of waiting for garbage
    # collection.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(extension):
                yield Path(entry.path)
            elif entry.is_dir():
                yield from get_file_paths(entry.path, extension)

In [74]:
# Collect all metadata file paths as plain strings.
paths = [str(PurePosixPath(found)) for found in get_file_paths(directoryPath)]

In [75]:
# Names of the columns collected per file; reused later when exploding.
lists = ["atom_id", "sourceDesc_p"]
# One independent, empty accumulator per column.
atom_id, sourceDesc_p = [], []

In [76]:
# Start from empty accumulators so that re-running this cell does not append
# every file's results a second time.
atom_id, sourceDesc_p = [], []

for file in tqdm(paths):
    with open(file, "r", encoding="utf-8") as f:
        tree = etree.parse(f)
    # The document is fully parsed at this point, so the queries can run
    # after the file has been closed. Each query returns a list of text
    # nodes (possibly empty), which is stored as-is and exploded later.
    atom_id.append(tree.xpath("/atom:entry/atom:id/text()", namespaces = namespaces, smart_strings = False))
    sourceDesc_p.append(tree.xpath("/atom:entry/atom:content/cei:cei/cei:teiHeader/cei:fileDesc/cei:sourceDesc/cei:p/text()", namespaces = namespaces, smart_strings = False))

100%|██████████| 195/195 [00:04<00:00, 44.76it/s]


In [77]:
# Pair up the per-file results: one (atom_id, sourceDesc_p) tuple per file.
contents = list(zip(atom_id, sourceDesc_p))
# Naming the columns at construction keeps the expected schema even when no
# file was found; renaming the positional columns 0 and 1 would leave an
# empty frame without any columns.
contents_full = pd.DataFrame(contents, columns=["atom_id", "sourceDesc_p"])

In [78]:
# Turn the list-valued cells into one scalar value per row.
contents_full_exploded = explode_columns(dataframe=contents_full, columns=lists)

In [79]:
# sourceDesc texts that mark a collection as a Google export (matched exactly via isin below).
export_inclusion_list = ["Export aus Google Daten"]

In [80]:
# Atom ids of all collections whose sourceDesc text is on the inclusion list.
collections_from_google = contents_full_exploded.loc[
    contents_full_exploded["sourceDesc_p"].isin(export_inclusion_list), "atom_id"
].to_list()

# Which charters are contained in these collections?

In [81]:
# Accumulator for the atom ids of charters that belong to a Google-export collection.
charter_atomids_with_google_ocr = []

In [82]:
# All charter atom ids of the prepared frame as a plain Python list.
charter_atomids_list = df_prep["atom_id"].to_list()

In [83]:
# Reset the accumulator here so that re-running this cell does not append
# the same charters a second time.
charter_atomids_with_google_ocr = []

for atomid in charter_atomids_list:
    # Only the third component returned by decompose_chatomid is used: it is
    # compared against the collection atom ids gathered above.
    _, _, collection_atomid = decompose_chatomid(atomid)
    if collection_atomid in collections_from_google:
        charter_atomids_with_google_ocr.append(atomid)

In [84]:
# Number of charters that were matched to a Google-export collection.
len(charter_atomids_with_google_ocr)

40534

# List charters with transcriptions

In [88]:
# Second XQuery export: <tenor> elements that carry the charter id in their
# "atomid" attribute. Every row loaded here is flagged with google_ocr=False.
with open("../../data/in/latin-ocr-transcriptions/charters_varieties_latin_results.xml", "r", encoding="utf-8") as f:
    tree = etree.parse(f)

# NOTE(review): .text only holds the text before the first child element --
# confirm that the exported <tenor> elements contain no nested markup.
data = [(tenor.get("atomid"), tenor.text) for tenor in tree.xpath("//tenor")]

charters_with_latin_transcriptions = pd.DataFrame(
    data, columns=["atom_id", "tenor"]
).assign(google_ocr=False)

In [89]:
# Check for overlap: show the rows of charters_with_latin_transcriptions whose
# atom_id also appears in charter_atomids_with_google_ocr. An empty frame
# means the two sets are disjoint.
charters_with_latin_transcriptions[charters_with_latin_transcriptions["atom_id"].isin(charter_atomids_with_google_ocr)]

Unnamed: 0,atom_id,tenor,google_ocr


In [109]:
# Stack both transcription sources and check for duplicate atom ids: the two
# printed numbers (total rows, distinct atom ids) are equal when there are none.
df_full = pd.concat([df_transcriptions, charters_with_latin_transcriptions])
lentest = df_full["atom_id"].drop_duplicates()
print(len(df_full))
print(len(lentest))

49784
49784


In [132]:
# Attach the transcriptions to the charter metadata; only charters present in
# both frames are kept (inner join on atom_id).
df_merged = pd.merge(df_prep, df_full, how="inner", on="atom_id")

In [133]:
# Work on a copy so that the columns added below do not modify df_merged.
df_prep_applied = df_merged.copy()

In [134]:
# Derive one date per charter: the first non-missing value among the exact
# date, the range start and the range end.
df_prep_applied = df_prep_applied.assign(
    date_joined=df_prep_applied.apply(get_cei_date_value, axis=1)
)
# The is_valid_date filter is currently disabled; to enable it:
#df_prep_filtered = df_prep_applied.loc[df_prep_applied["date_joined"].apply(is_valid_date)]

In [135]:
# Display the merged frame including the new date_joined column.
df_prep_applied

Unnamed: 0,atom_id,cei_abstract_joined,cei_abstract_foreign,cei_tenor_joined,cei_pTenor,cei_placeName,cei_lang_MOM,cei_date,cei_dateRange,cei_date_ATTRIBUTE_value,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy,tenor,google_ocr,date_joined
0,"tag:www.monasterium.net,2011:/charter/7b78733a...",Kaiser Ferdinand II weist seine zu Wien hinter...,[],"[fol.1r, rechts (Seite 1)] Ferdinand der Ander...",[],[Prag (Cz)],"[Deutsch, Latein]",[],[1628 Mai 16. - Mai 24.],,16280516,16280524,[https://www.dropbox.com/s/s72sqsfwpzonnam/Fer...,[],"[fol.1r, rechts (Seite 1)] Ferdinand der Ander...",False,16280516
1,"tag:www.monasterium.net,2011:/charter/AbbayeDe...",,[],"EXEMPLAR CARTE GERARDI, MORINOHL'M EPISCOPI, ...","[ EXEMPLAR CARTE GERARDI, MORINOHL'M EPISCOPI,...",[],[],[18 oct. 1097.],[],10971018,,,[],"[00000352.png, 00000353.png, 00000354.png]","EXEMPLAR CARTE GERARDI, MORINOHL'M EPISCOPI, D...",True,10971018
2,"tag:www.monasterium.net,2011:/charter/AbbayeDe...",,[],"Exterius vero feoda, que antecessores ejus mi...","[ Exterius vero feoda, que antecessores ejus m...",[],[],[],[],99999999,,,[],"[00000386.png, 00000386.png]","Exterius vero feoda, que antecessores ejus min...",True,99999999
3,"tag:www.monasterium.net,2011:/charter/AbbayeDe...","DE EMPTIONE HARDRADI, ABBATIS, DE LONINGAHEM.",[],"XL. Emit quoque isdem abbas Hardradus, anno ...","[ XL. , Emit quoque isdem abbas Hardradus, an...",[],[],[],[776],,7760101,7761231,[],[00000170.png],"XL. Emit quoque isdem abbas Hardradus, anno v...",True,7760101
4,"tag:www.monasterium.net,2011:/charter/AbbayeDe...",,[],"LV. DE OB1TU HILDOINI ABBATIS, ET DE EXEQUII...","[ LV. , DE OB1TU HILDOINI ABBATIS, ET DE EXEQ...",[],[],[99999999],[],99999999,,,[],"[00000232.png, 00000233.png]","LV. DE OB1TU HILDOINI ABBATIS, ET DE EXEQUIIS...",True,99999999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49773,"tag:www.monasterium.net,2011:/charter/Wirtembe...",König Konrad III. bestätigt der Kirche in Spe...,[],CCCXIV. In nomine sánete et individué trinit...,"[ CCCXIV. , In nomine sánete et individué tri...",[],[],[99999999],[],99999999,,,[],"[00000042.png, 00000043.png]",CCCXIV. In nomine sánete et individué trinita...,True,99999999
49774,"tag:www.monasterium.net,2011:/charter/Wirtembe...",Kaiser Heinrich VI. nimmt das Kloster Herrena...,[],CDXCV. Chr. I In nomine sánete et individué ...,"[ CDXCV. , Chr. I In nomine sánete et individ...",[],[],[99999999],[],99999999,,,[],"[00000338.png, 00000339.png, 00000340.png]",CDXCV. Chr. I In nomine sánete et individué t...,True,99999999
49775,"tag:www.monasterium.net,2011:/charter/Wirtembe...",Bischof Günther von Speier erwirbt und übergi...,[],CCCLXVII. 1159. ! In nominç sanctç et indiv...,"[ CCCLXVII. , 1159. , ! In nominç sanctç et ...",[],[],[99999999],[],99999999,,,[],"[00000149.png, 00000150.png, 00000151.png]",CCCLXVII. 1159. ! In nominç sanctç et indivi...,True,99999999
49776,"tag:www.monasterium.net,2011:/charter/Wirtembe...","Bischof Günther von Speier verordnet, dass da...",[],CCCXXXV. 1152. j In nomine sanctç et indivi...,"[ CCCXXXV. , 1152. , j In nomine sanctç et i...",[],[],[99999999],[],99999999,,,[],"[00000085.png, 00000086.png]",CCCXXXV. 1152. j In nomine sanctç et individ...,True,99999999


In [136]:
# Add the URL that chatomid_to_url derives from each charter atom id.
df_prep_applied["url"] = df_prep_applied["atom_id"].apply(chatomid_to_url)

In [137]:
# Keep only the export columns and order the rows by date.
# date_joined holds digit strings without consistent zero padding (e.g.
# "7760101" next to "10971018"), so a plain string sort would not be
# chronological. Sorting on the numeric value fixes the order while leaving
# the stored values untouched; missing or non-numeric dates sort last.
df_final = df_prep_applied[["atom_id", "url", "date_joined", "google_ocr", "tenor"]].sort_values(
    "date_joined",
    key=lambda dates: pd.to_numeric(dates, errors="coerce"),
)

In [141]:
# Display the final, date-sorted export table.
df_final

Unnamed: 0,atom_id,url,date_joined,google_ocr,tenor
44160,"tag:www.monasterium.net,2011:/charter/Urkunden...",https://www.monasterium.net/mom/UrkundenBehrII...,00010101,True,"In nomine sánete et indiuidue trinitatis, Amen..."
44097,"tag:www.monasterium.net,2011:/charter/Urkunden...",https://www.monasterium.net/mom/UrkundenBehrII...,00010101,True,"In nomine domini, Amen. Otto, dei gracia dux S..."
34244,"tag:www.monasterium.net,2011:/charter/QuellenK...",https://www.monasterium.net/mom/QuellenKoelnII...,00010101,True,Acta sunt hec anno domini Incarnationis mccxxx...
34245,"tag:www.monasterium.net,2011:/charter/QuellenK...",https://www.monasterium.net/mom/QuellenKoelnII...,00010101,True,Aus einem Fascikel eines Schreinsbuches im Arc...
34246,"tag:www.monasterium.net,2011:/charter/QuellenK...",https://www.monasterium.net/mom/QuellenKoelnII...,00010101,True,•) Das voraufgehende wie das folgende Notum ha...
...,...,...,...,...,...
22421,"tag:www.monasterium.net,2011:/charter/Henneber...",https://www.monasterium.net/mom/Henneberg/4e8c...,99999999,True,GrOtfridus dei gracia episcopus Herbipolcnsis ...
22422,"tag:www.monasterium.net,2011:/charter/Henneber...",https://www.monasterium.net/mom/Henneberg/4f0f...,99999999,True,•10. nos Bertoldus dei gracia comes de Hennenb...
22415,"tag:www.monasterium.net,2011:/charter/Henneber...",https://www.monasterium.net/mom/Henneberg/4b7f...,99999999,True,S5. Wir Heinrich vnd Günther von gotis gnadin ...
49777,"tag:www.monasterium.net,2011:/charter/Wirtembe...",https://www.monasterium.net/mom/Wirtembergisch...,99999999,True,"CDVI. ! Alexander episcopus, servus servorum ..."


In [149]:
# Write the final table to disk, once as JSON and once as XML.
df_final.to_json("../../data/out/latin-transcriptions-ocr-bool.json")
df_final.to_xml("../../data/out/latin-transcriptions-ocr-bool.xml")