In [1]:
import pandas as pd
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
import os
from tqdm import tqdm

from ddp_util import decompose_chatomid
from ddp_util import chatomid_to_url

# Helpers

In [2]:
def explode_columns(dataframe, columns=None):
    """Explodes one or more pandas columns in a DataFrame so each row contains only one object.
    """
    all_columns = dataframe.keys().to_list()
    if columns is None:
        columns = all_columns
    for column in columns:
        dataframe = dataframe.explode(column)
    return dataframe

def get_cei_date_value(row):
    date = row["cei_date_ATTRIBUTE_value"]
    date_bot = row["cei_dateRange_ATTRIBUTE_from"]
    date_top = row["cei_dateRange_ATTRIBUTE_to"]
    value = next((d for d in (date, date_bot, date_top) if pd.notna(d)), None)
    return value

def is_valid_date(date_value):
    return isinstance(date_value, str) and ("9999" not in date_value and "010101" not in date_value)

# Load

In [3]:
df_raw = pd.read_json("../../data/output/charters_full_2022-11-22-1044.json")

In [4]:
# from xquery
df_transcriptions = pd.read_xml("../../data/in/latin-ocr-transcriptions/latin_tenor_result.xml")
df_transcriptions.columns = ["atom_id", "tenor"]
df_transcriptions["google_ocr"] = True

# Prepare main

In [5]:
df_prep = explode_columns(
    df_raw, ["atom_id","cei_date_ATTRIBUTE_value", "cei_dateRange_ATTRIBUTE_from", "cei_dateRange_ATTRIBUTE_to"]
)

df_prep = df_prep[~df_prep["atom_id"].astype(str).str.contains("cheim")]

# Which forms of Latin are there in the cei?

In [None]:
lang_list_prep = explode_columns(df_raw, ["cei_lang_MOM"])
lang_list = lang_list_prep.cei_lang_MOM.value_counts().to_frame().sort_values(by="cei_lang_MOM", ascending=False)
lang_list.reset_index()["index"].to_list()

In [None]:
included_languages = [
    "Latein",
    "latein",
    "Lat.",
    "lat.",
    "latinsky",
    "latinski",
    "Latin",
    "latin",
    "latinščina"
    ]

['Deutsch', 'Latin', 'dt.', 'Latein', 'lat.', 'latinsky', 'latinský', 'německy', 'deutsch', 'česky', 'nem.', 'ger', 'nemški', 'lateinisch', 'latein.', 'nemščina', 'lat. ', 'němčina', 'český', 'Német', 'nem. ', 'lat', 'latina', 'německý', 'latinščina', 'latinski', 'niederdeutsch', 'latein', 'Niederdeutsch', 'čeština', 'russisch', 'schwedisch', 'Ndt.', 'Latino', 'Lateinisch', 'Latin.', 'Hochdeutsch', 'Hdt.', 'čes.', 'dt', 'D', 'französisch', 'LAT', 'nemecký', 'j. łaciński', 'latinsky/česky', 'dt. und lat.', 'Deutsch [?]', 'česky, německy', 'talijanski i latinski', ' Latein', 'plattdeutsch', 'Französisch', 'Altserbisch.', 'francouzsky', 'lat. u. dt.', 'latinšcina', 'latinski i talijanski', 'Slavic.', 'German', 'Olasz', 'CZ', 'Latină', 'lateinisch und deutsch', 'italsky', 'latinsky, německy', 'fre', 'nemšcina', 'lat., dt.', 'něm.', 'dt. u. lat.', 'česky, latinsky', 'englisch', 'slov.', 'Deutsch, Latein', 'latinsky, (italsky)', 'německy, česky', 'Perg.', 'Français', 'F', 'română (grafie chi

# Which collections contain OCR material?

In [48]:
namespaces = {"atom": "http://www.w3.org/2005/Atom", "cei": "http://www.monasterium.net/NS/cei", "xrx": "http://www.monasterium.net/NS/xrx"}
directoryPath = "../../data/db/mom-data/metadata.collection.public"
fileExtension = ".cei.xml"

In [49]:
def get_file_paths(directory):
    for entry in os.scandir(directory):
        if entry.is_file() and entry.name.endswith(fileExtension):
            yield Path(entry.path)
        elif entry.is_dir():
            yield from get_file_paths(entry.path)
        else:
            continue

In [50]:
paths = [f"{PurePosixPath(path)}" for path in get_file_paths(directoryPath)]

In [51]:
lists = ["atom_id", "sourceDesc_p"]
atom_id, sourceDesc_p = ([] for i in range(len(lists)))

In [52]:
for file in tqdm(paths):
    with open(file, "r", encoding="utf-8") as f:
        tree = etree.parse(f)
        atom_id.append(tree.xpath("/atom:entry/atom:id/text()", namespaces = namespaces, smart_strings = False))
        sourceDesc_p.append(tree.xpath("/atom:entry/atom:content/cei:cei/cei:teiHeader/cei:fileDesc/cei:sourceDesc/cei:p/text()", namespaces = namespaces, smart_strings = False))

100%|██████████| 195/195 [00:02<00:00, 71.50it/s]


In [53]:
contents = list(zip(atom_id, sourceDesc_p))
contents_full = pd.DataFrame(contents).rename(columns={0: "atom_id", 1: "sourceDesc_p"})

In [54]:
contents_full_exploded = explode_columns(contents_full, lists)

In [64]:
list(set(contents_full_exploded.sourceDesc_p.to_list()))

[nan,
 'Export aus Augias-DB mit Hilfe von MS-VBA (22.04.2008 11:36:10)',
 'Daten aus dem Projekt ALIM (http://www.alim.dfll.univr.it/)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (09.05.2008 14:50:45)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (10.03.2009 17:31:50)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (03.07.2008 11:13:42)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (10.03.2009 17:32:16)',
 'Export aus Google Daten',
 'Export aus Augias-DB mit Hilfe von MS-VBA (10.03.2009 17:32:02)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (10.03.2009 17:31:33)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (22.04.2008 11:46:51)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (22.04.2008 11:44:39)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (22.04.2008 11:55:34)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (30.10.2009 12:55:55)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (09.12.2008 17:39:15)',
 'Export aus Augias-DB mit Hilfe von MS-VBA (10.03.2009 17:32:41)',
 'Export aus Augia

In [78]:
contents_full_exploded[contents_full_exploded["sourceDesc_p"] != ("Export aus Google Daten" or pd.np.nan())]

Unnamed: 0,atom_id,sourceDesc_p
5,"tag:www.monasterium.net,2011:/collection/AFM",
6,"tag:www.monasterium.net,2011:/collection/AggOCart",Export aus Augias-DB mit Hilfe von MS-VBA (22....
7,"tag:www.monasterium.net,2011:/collection/ArdCan",Export aus Augias-DB mit Hilfe von MS-VBA (22....
9,"tag:www.monasterium.net,2011:/collection/AVSPS",
10,"tag:www.monasterium.net,2011:/collection/BaumO...",Export aus Augias-DB mit Hilfe von MS-VBA (09....
13,"tag:www.monasterium.net,2011:/collection/BISANU",Export aus Augias-DB mit Hilfe von MS-VBA (03....
17,"tag:www.monasterium.net,2011:/collection/CAO",
18,"tag:www.monasterium.net,2011:/collection/CDV",
20,"tag:www.monasterium.net,2011:/collection/Cluny...",
27,"tag:www.monasterium.net,2011:/collection/Codex...",Daten aus dem Projekt ALIM (http://www.alim.df...


In [19]:
export_inclusion_list = ["Export aus Google Daten"]

In [20]:
collections_from_google = contents_full_exploded[contents_full_exploded["sourceDesc_p"].isin(export_inclusion_list)].atom_id.to_list()

In [21]:
collections_from_google

['tag:www.monasterium.net,2011:/collection/AbbayeDeSaintBertin',
 'tag:www.monasterium.net,2011:/collection/AbteiEberbach',
 'tag:www.monasterium.net,2011:/collection/AbteiSanctGallen',
 'tag:www.monasterium.net,2011:/collection/AeviSaxonici',
 'tag:www.monasterium.net,2011:/collection/AffairesPaysBas',
 'tag:www.monasterium.net,2011:/collection/AustriacoFrisingensis',
 'tag:www.monasterium.net,2011:/collection/BenedictinerAbteiWien',
 'tag:www.monasterium.net,2011:/collection/BenedictinerStiftesSeitenstetten',
 'tag:www.monasterium.net,2011:/collection/BischoefeSpeyer',
 'tag:www.monasterium.net,2011:/collection/BistumBreslau',
 'tag:www.monasterium.net,2011:/collection/BraunschweigLueneburg',
 'tag:www.monasterium.net,2011:/collection/ChroniconBenedictobur',
 'tag:www.monasterium.net,2011:/collection/CoblenzTrierI',
 'tag:www.monasterium.net,2011:/collection/CoblenzTrierII',
 'tag:www.monasterium.net,2011:/collection/CoblenzTrierIII',
 'tag:www.monasterium.net,2011:/collection/CodexB

# Which charters are contained in these collections?

In [22]:
charter_atomids_with_google_ocr = []

In [23]:
charter_atomids_list = df_prep["atom_id"].to_list()

In [24]:
for atomid in charter_atomids_list:
    _, _, collection_atomid = decompose_chatomid(atomid)
    if collection_atomid in collections_from_google:
        charter_atomids_with_google_ocr.append(atomid)

In [25]:
len(charter_atomids_with_google_ocr)

40534

In [26]:
charter_atomids_with_google_ocr

['tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/002b9581-0460-4278-8bbf-6696a1467183',
 'tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/00740c11-2ad6-4676-bb49-adddc6c9358f',
 'tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/01259d0d-c5a1-4572-89b8-dff3147dc882',
 'tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/01d92481-d620-4ca7-8523-5775401e9403',
 'tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/020a7b8e-790c-46aa-9dad-678d199a3bc7',
 'tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/02417bc7-4848-4ffe-9ae4-36115d7d5f0b',
 'tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/02b0cb74-bd0e-4259-be86-32ac58b9127c',
 'tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/0314b455-6258-4050-8560-1b0aed75f797',
 'tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/03403a78-302d-421d-94a5-872a47ab7daf',
 'tag:www.monasterium.net,2011:/charter/AbbayeDeSaintBertin/047d3f58-6a71-4f8d-bb63-eaa55326ae09',
 'tag:www.

# List charters with transcriptions

In [27]:
# from xquery
with open("../../data/in/latin-ocr-transcriptions/charters_varieties_latin_results.xml", "r", encoding="utf-8") as f:
    tree = etree.parse(f)

data = []

for tenor in tree.xpath("//tenor"):
    atomid = tenor.get("atomid")
    text = tenor.text
    data.append((atomid, text))

charters_with_latin_transcriptions = pd.DataFrame(data, columns=["atom_id", "tenor"])
charters_with_latin_transcriptions["google_ocr"] = False

In [28]:
charters_with_latin_transcriptions

Unnamed: 0,atom_id,tenor,google_ocr
0,"tag:www.monasterium.net,2011:/charter/AT-StiAG...","In n. s.\n et i. T. Quoniam, vt ait Apostol...",False
1,"tag:www.monasterium.net,2011:/charter/AT-StiAG...",Ego\n Hermannus prepositus ecclesie S. Petr...,False
2,"tag:www.monasterium.net,2011:/charter/AT-StiAG...","In n. s. et\n i. T. Ego Zmilo de Bilcowe, e...",False
3,"tag:www.monasterium.net,2011:/charter/AT-StiAG...",Nos\n Johannes dei gracia Olumucensis episc...,False
4,"tag:www.monasterium.net,2011:/charter/AT-StiAG...",Nos\n Fridericus dei gracia dux Austrie et ...,False
...,...,...,...
9280,"tag:www.monasterium.net,2011:/charter/HR-HDA/M...",† Anno ab\n incarnatione domini nostri Iesu...,False
9281,"tag:www.monasterium.net,2011:/charter/HR-HDA/M...",[In]\n Christi nom[ine] et eiusdem anno inc...,False
9282,"tag:www.monasterium.net,2011:/charter/HR-HDA/M...","† In\n Christi nomine. Stephanus ego, nutu ...",False
9283,"tag:www.monasterium.net,2011:/charter/HR-HDA/M...",† In nomine\n dei eterni. Future recordatio...,False


In [38]:
# check if there is overlap
charters_with_latin_transcriptions[charters_with_latin_transcriptions["atom_id"].isin(charter_atomids_with_google_ocr)]

# overlap_list = []
# for i in charters_with_latin_transcriptions.atom_id:
#     if i in charter_atomids_with_google_ocr:
#         overlap_list.append(i)
# len(overlap_list)

Unnamed: 0,atom_id,tenor,google_ocr


In [39]:
# check duplicates
df_full = pd.concat([df_transcriptions, charters_with_latin_transcriptions])
print(len(df_full))
lentest = df_full.atom_id.drop_duplicates(keep="first")
print(len(lentest))

49784
49784


In [40]:
# merge with full
df_merged = df_prep.merge(df_full, on="atom_id")

In [41]:
df_prep_applied = df_merged.copy()

In [42]:
df_prep_applied["date_joined"] = df_prep_applied.apply(get_cei_date_value, axis=1)
#df_prep_filtered = df_prep_applied.loc[df_prep_applied["date_joined"].apply(is_valid_date)]

In [47]:
for i in df_prep_applied[df_prep_applied["google_ocr"] == False].atom_id.sample(n=5).to_list():
    print(chatomid_to_url(i))

https://www.monasterium.net/mom/CSGIII/Nr_921_S_38-40/charter
https://www.monasterium.net/mom/CSGIV/1275_I_02/charter
https://www.monasterium.net/mom/DE-AKR/Urkunden/12550516/charter
https://www.monasterium.net/mom/CodexDiplomaticusCavensis/0299/charter
https://www.monasterium.net/mom/AT-StiAK/KlosterneuburgCanReg/1397_VIII_28/charter


In [136]:
df_prep_applied["url"] = df_prep_applied["atom_id"].apply(lambda x: chatomid_to_url(x))

In [137]:
df_final = df_prep_applied[["atom_id", "url", "date_joined", "google_ocr", "tenor"]].sort_values("date_joined")

In [141]:
df_final

Unnamed: 0,atom_id,url,date_joined,google_ocr,tenor
44160,"tag:www.monasterium.net,2011:/charter/Urkunden...",https://www.monasterium.net/mom/UrkundenBehrII...,00010101,True,"In nomine sánete et indiuidue trinitatis, Amen..."
44097,"tag:www.monasterium.net,2011:/charter/Urkunden...",https://www.monasterium.net/mom/UrkundenBehrII...,00010101,True,"In nomine domini, Amen. Otto, dei gracia dux S..."
34244,"tag:www.monasterium.net,2011:/charter/QuellenK...",https://www.monasterium.net/mom/QuellenKoelnII...,00010101,True,Acta sunt hec anno domini Incarnationis mccxxx...
34245,"tag:www.monasterium.net,2011:/charter/QuellenK...",https://www.monasterium.net/mom/QuellenKoelnII...,00010101,True,Aus einem Fascikel eines Schreinsbuches im Arc...
34246,"tag:www.monasterium.net,2011:/charter/QuellenK...",https://www.monasterium.net/mom/QuellenKoelnII...,00010101,True,•) Das voraufgehende wie das folgende Notum ha...
...,...,...,...,...,...
22421,"tag:www.monasterium.net,2011:/charter/Henneber...",https://www.monasterium.net/mom/Henneberg/4e8c...,99999999,True,GrOtfridus dei gracia episcopus Herbipolcnsis ...
22422,"tag:www.monasterium.net,2011:/charter/Henneber...",https://www.monasterium.net/mom/Henneberg/4f0f...,99999999,True,•10. nos Bertoldus dei gracia comes de Hennenb...
22415,"tag:www.monasterium.net,2011:/charter/Henneber...",https://www.monasterium.net/mom/Henneberg/4b7f...,99999999,True,S5. Wir Heinrich vnd Günther von gotis gnadin ...
49777,"tag:www.monasterium.net,2011:/charter/Wirtembe...",https://www.monasterium.net/mom/Wirtembergisch...,99999999,True,"CDVI. ! Alexander episcopus, servus servorum ..."


In [149]:
df_final.to_json("../../data/out/latin-transcriptions-ocr-bool.json")
df_final.to_xml("../../data/out/latin-transcriptions-ocr-bool.xml")