# General

## Imports

In [2]:
from collections import deque
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import pandas as pd
import pickle
import pprint
import random

## Parameters

In [3]:
# Show up to 100 characters per cell when rendering DataFrames.
pd.options.display.max_colwidth = 100

In [4]:
# Prefix -> namespace URI table handed to every XPath query.
namespaces = {
    'atom': 'http://www.w3.org/2005/Atom',
    'cei': 'http://www.monasterium.net/NS/cei',
}

# Root directory that is scanned recursively for charter files.
directoryPath = "../data/db/mom-data/metadata.charter.public"

# File-name suffix a file must have to be picked up by the scan.
fileExtension = ".cei.xml"

# Create .xml mapping

In [7]:
# TODO: create the .xml mapping here; until then, see mapping.ipynb

# Load .xml mapping

In [5]:
# Load the pickled mapping (key -> XPath string, as the printed output shows).
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# mapping files that were produced by this project.
mapping_pickle = '../data/mapping/CEI2CSV.pkl'
with open(mapping_pickle, 'rb') as handle:
    mapping = pickle.load(handle)

pprint.pprint(mapping)

{'@from': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:issued/cei:dateRange/@from',
 '@id': '/atom:entry/atom:content/cei:text/@id',
 '@key': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:abstract/cei:persName/@key',
 '@n': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:witnessOrig/cei:archIdentifier/cei:idno/@n',
 '@old': '/atom:entry/atom:content/cei:text/cei:body/cei:idno/@old',
 '@target': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:diplomaticAnalysis/cei:p/cei:ref/@target',
 '@to': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:issued/cei:dateRange/@to',
 '@type': '/atom:entry/atom:content/@type',
 '@url': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:witnessOrig/cei:figure/cei:graphic/@url',
 '@value': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:issued/cei:date/@value',
 'app:control': '/atom:entry/app:control',
 'app:draft': '/atom:entry/app:control/app:draft',
 'atom:author': '/atom:en

# Scan directory and create paths

In [6]:
def get_file_paths(directory, extension=None):
    """Recursively yield the paths of all files below ``directory`` with a given suffix.

    Args:
        directory: Directory to scan (string or path-like).
        extension: File-name suffix a file must end with. When omitted, the
            module-level ``fileExtension`` is used, which is the original
            behaviour.

    Yields:
        pathlib.Path: One path per matching file. Files and sub-directories
        are visited in the order ``os.scandir`` reports them.
    """
    if extension is None:
        extension = fileExtension
    # The context manager releases the directory handle as soon as the
    # generator is exhausted or closed, instead of waiting for garbage collection.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(extension):
                yield Path(entry.path)
            elif entry.is_dir():
                yield from get_file_paths(entry.path, extension)

In [7]:
def get_file_paths_txt(directory):
    """Recursively yield the paths of all files below ``directory`` whose name ends with "txt".

    Args:
        directory: Directory to scan (string or path-like).

    Yields:
        pathlib.Path: One path per matching file. Files and sub-directories
        are visited in the order ``os.scandir`` reports them.
    """
    # The context manager releases the directory handle as soon as the
    # generator is exhausted or closed.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith("txt"):
                yield Path(entry.path)
            elif entry.is_dir():
                # Recurse with this function. The previous version called
                # get_file_paths() here, so sub-directories were searched
                # for a different file suffix and their txt files were missed.
                yield from get_file_paths_txt(entry.path)

In [8]:
# Directory with the .txt copies of the charters; the "save xml as txt" cell
# writes into this location and get_file_paths_txt() scans it.
directoryPathStrings = "../data/output/xmltostring"

In [9]:
# Every charter file below directoryPath, as a POSIX-style path string.
paths = [str(PurePosixPath(found)) for found in get_file_paths(directoryPath)]


In [10]:
# Draw a reproducible sample of up to 10000 charter paths.
# - os.scandir yields entries in arbitrary order, so the population is sorted first.
# - A dedicated seeded generator gives the same sample on every run without
#   touching the global random state.
# - min() avoids the ValueError random.sample raises when fewer than 10000
#   paths are available.
paths_sample = random.Random(42).sample(sorted(paths), min(10000, len(paths)))

In [13]:
# This cell needs `paths_string`, which the notebook otherwise only builds in
# a later cell. Build it here so the cell also works on a fresh kernel
# (Restart & Run All) instead of raising NameError.
paths_string = [f"{PurePosixPath(path)}" for path in get_file_paths_txt(directoryPathStrings)]

# Reproducible sample of up to 9990 .txt paths: sorted population, dedicated
# seeded generator, and min() so that a smaller directory does not raise ValueError.
paths_string_sample = random.Random(42).sample(sorted(paths_string), min(9990, len(paths_string)))

In [12]:
# Every .txt file below directoryPathStrings, as a POSIX-style path string.
paths_string = [str(PurePosixPath(found)) for found in get_file_paths_txt(directoryPathStrings)]

# --- save xml as txt

In [99]:
# Serialise every sampled charter to "<grandparent dir>_<parent dir>_<file name>.txt"
# inside directoryPathStrings.
for file in paths_sample:
    source = Path(file)
    name = f"{source.parent.parent.name}_{source.parent.name}_{source.name}"
    with open(file, "r", encoding="utf-8") as f:
        string = etree.tostring(etree.parse(f), encoding="unicode")
    # Write with an explicit encoding: the files are read back as UTF-8, and
    # the platform default encoding may be unable to represent the charter text.
    # The target directory comes from directoryPathStrings, the same variable
    # the reader side scans.
    with open(f"{directoryPathStrings}/{name}.txt", "w", encoding="utf-8") as g:
        g.write(string)

# --- read from string

In [14]:
# Names of the extracted fields, in the order used for the result lists below.
deques = deque(
    [
        "atom_id",
        "cei_placeName",
        "cei_lang_MOM",
        "cei_tenor",
        "cei_date",
        "cei_date_ATTRIBUTE_value",
        "cei_dateRange",
        "cei_dateRange_ATTRIBUTE_from",
        "cei_dateRange_ATTRIBUTE_to",
        "cei_abstract",
        "cei_abstract_foreign",
        "cei_graphic_ATTRIBUTE_url_orig",
        "cei_graphic_ATTRIBUTE_url_copy",
    ]
)

# One independent, initially empty result list per field name.
(
    atom_id,
    cei_placeName,
    cei_lang_MOM,
    cei_tenor,
    cei_date,
    cei_date_ATTRIBUTE_value,
    cei_dateRange,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_abstract,
    cei_abstract_foreign,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
) = [[] for _ in deques]

def get_xpath_result(xpath_mapping):
    """Evaluate an XPath expression against the module-level ``tree``.

    Args:
        xpath_mapping: XPath expression string.

    Returns:
        Whatever ``tree.xpath`` returns for the expression, evaluated with the
        module-level ``namespaces`` table and ``smart_strings=False``.
    """
    # `tree` is not a parameter: callers must bind the global before calling.
    xpath_options = {"namespaces": namespaces, "smart_strings": False}
    return tree.xpath(xpath_mapping, **xpath_options)

# XPath step selecting every text node at or below an element, except text
# nested inside a <cei:sup>. The previous predicate `not(self::cei:sup)` was
# evaluated on text nodes, which are never `cei:sup` elements, so it was
# always true and filtered nothing.
text_outside_sup = "descendant-or-self::text()[not(ancestor::cei:sup)]"

# Parse every .txt copy and append one entry per file to each result list.
for file in paths_string:
    with open(file, "r", encoding="utf-8") as f:
        # Keep the name `tree`: get_xpath_result() reads it as a global.
        tree = etree.parse(f)
    # The document is fully parsed, so the file can be closed before querying.
    atom_id.append(get_xpath_result(f"{mapping['atom:id']}/text()"))
    # Abstract and tenor are flattened to one string; all other fields keep
    # the list returned by the XPath query.
    cei_abstract.append("".join(get_xpath_result(f"{mapping['cei:abstract']}/{text_outside_sup}")))
    cei_abstract_foreign.append(get_xpath_result(f"{mapping['cei:foreign']}/text()"))
    cei_tenor.append("".join(get_xpath_result(f"{mapping['cei:tenor']}/{text_outside_sup}")))
    cei_placeName.append(get_xpath_result(f"{mapping['cei:issued/cei:placeName']}/text()"))
    cei_lang_MOM.append(get_xpath_result(f"{mapping['cei:lang_MOM']}/text()"))
    cei_date.append(get_xpath_result(f"{mapping['cei:date']}/text()"))
    cei_dateRange.append(get_xpath_result(f"{mapping['cei:dateRange']}/text()"))
    cei_date_ATTRIBUTE_value.append(get_xpath_result(f"{mapping['cei:date']}/@value"))
    cei_dateRange_ATTRIBUTE_from.append(get_xpath_result(f"{mapping['cei:dateRange']}/@from"))
    cei_dateRange_ATTRIBUTE_to.append(get_xpath_result(f"{mapping['cei:dateRange']}/@to"))
    cei_graphic_ATTRIBUTE_url_orig.append(get_xpath_result(f"{mapping['@url']}"))
    cei_graphic_ATTRIBUTE_url_copy.append(get_xpath_result(f"{mapping['cei:graphic/@url']}"))

# Create deques, query paths, and fill deques

In [15]:
# Names of the extracted fields, in the order used for the result lists below.
deques = deque(
    [
        "atom_id",
        "cei_placeName",
        "cei_lang_MOM",
        "cei_tenor",
        "cei_date",
        "cei_date_ATTRIBUTE_value",
        "cei_dateRange",
        "cei_dateRange_ATTRIBUTE_from",
        "cei_dateRange_ATTRIBUTE_to",
        "cei_abstract",
        "cei_abstract_foreign",
        "cei_graphic_ATTRIBUTE_url_orig",
        "cei_graphic_ATTRIBUTE_url_copy",
    ]
)

# One independent, initially empty result list per field name.
(
    atom_id,
    cei_placeName,
    cei_lang_MOM,
    cei_tenor,
    cei_date,
    cei_date_ATTRIBUTE_value,
    cei_dateRange,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_abstract,
    cei_abstract_foreign,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
) = [[] for _ in deques]

def get_xpath_result(xpath_mapping):
    """Evaluate an XPath expression against the module-level ``tree``.

    Args:
        xpath_mapping: XPath expression string.

    Returns:
        Whatever ``tree.xpath`` returns for the expression, evaluated with the
        module-level ``namespaces`` table and ``smart_strings=False``.
    """
    # `tree` is not a parameter: callers must bind the global before calling.
    xpath_options = {"namespaces": namespaces, "smart_strings": False}
    return tree.xpath(xpath_mapping, **xpath_options)

# XPath step selecting every text node at or below an element, except text
# nested inside a <cei:sup>. The previous predicate `not(self::cei:sup)` was
# evaluated on text nodes, which are never `cei:sup` elements, so it was
# always true and filtered nothing.
text_outside_sup = "descendant-or-self::text()[not(ancestor::cei:sup)]"

# Parse every sampled charter and append one entry per file to each result list.
for file in paths_sample:
    with open(file, "r", encoding="utf-8") as f:
        # Keep the name `tree`: get_xpath_result() reads it as a global.
        tree = etree.parse(f)
    # The document is fully parsed, so the file can be closed before querying.
    atom_id.append(get_xpath_result(f"{mapping['atom:id']}/text()"))
    # Abstract and tenor are flattened to one string; all other fields keep
    # the list returned by the XPath query.
    cei_abstract.append("".join(get_xpath_result(f"{mapping['cei:abstract']}/{text_outside_sup}")))
    cei_abstract_foreign.append(get_xpath_result(f"{mapping['cei:foreign']}/text()"))
    cei_tenor.append("".join(get_xpath_result(f"{mapping['cei:tenor']}/{text_outside_sup}")))
    cei_placeName.append(get_xpath_result(f"{mapping['cei:issued/cei:placeName']}/text()"))
    cei_lang_MOM.append(get_xpath_result(f"{mapping['cei:lang_MOM']}/text()"))
    cei_date.append(get_xpath_result(f"{mapping['cei:date']}/text()"))
    cei_dateRange.append(get_xpath_result(f"{mapping['cei:dateRange']}/text()"))
    cei_date_ATTRIBUTE_value.append(get_xpath_result(f"{mapping['cei:date']}/@value"))
    cei_dateRange_ATTRIBUTE_from.append(get_xpath_result(f"{mapping['cei:dateRange']}/@from"))
    cei_dateRange_ATTRIBUTE_to.append(get_xpath_result(f"{mapping['cei:dateRange']}/@to"))
    cei_graphic_ATTRIBUTE_url_orig.append(get_xpath_result(f"{mapping['@url']}"))
    cei_graphic_ATTRIBUTE_url_copy.append(get_xpath_result(f"{mapping['cei:graphic/@url']}"))

# Merge deques to dataframe

In [None]:
# generate dynamic dictionary content for renaming frames
for i, item in enumerate(deques):
    print(f"{i}:'{item}',")
    i =+ 1

0:'atom_id',
1:'cei_placeName',
2:'cei_lang_MOM',
3:'cei_tenor',
4:'cei_date',
5:'cei_date_ATTRIBUTE_value',
6:'cei_dateRange',
7:'cei_dateRange_ATTRIBUTE_from',
8:'cei_dateRange_ATTRIBUTE_to',
9:'cei_abstract',
10:'cei_abstract_foreign',
11:'cei_graphic_ATTRIBUTE_url_orig',
12:'cei_graphic_ATTRIBUTE_url_copy',


In [16]:
# Column names, in exactly the order the result lists are zipped below.
column_names = [
    'atom_id',
    'cei_placeName',
    'cei_lang_MOM',
    'cei_tenor',
    'cei_date',
    'cei_date_ATTRIBUTE_value',
    'cei_dateRange',
    'cei_dateRange_ATTRIBUTE_from',
    'cei_dateRange_ATTRIBUTE_to',
    'cei_abstract',
    'cei_abstract_foreign',
    'cei_graphic_ATTRIBUTE_url_orig',
    'cei_graphic_ATTRIBUTE_url_copy',
]

# One tuple per charter, holding that charter's entry from every result list.
contents = list(zip(
    atom_id,
    cei_placeName,
    cei_lang_MOM,
    cei_tenor,
    cei_date,
    cei_date_ATTRIBUTE_value,
    cei_dateRange,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_abstract,
    cei_abstract_foreign,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
))

# Name the columns at construction instead of renaming positional columns
# afterwards: no hand-maintained index -> name dictionary is needed, and the
# frame carries the expected columns even when no charter was processed.
contents_full = pd.DataFrame(contents, columns=column_names)

In [17]:
# Display the assembled frame; the rendering below shows only the first and
# last rows, with the rest elided.
contents_full

Unnamed: 0,atom_id,cei_placeName,cei_lang_MOM,cei_tenor,cei_date,cei_date_ATTRIBUTE_value,cei_dateRange,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_abstract,cei_abstract_foreign,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy
0,"[tag:www.monasterium.net,2011:/charter/RIXIii/1436-12-05_2_0_11_2_0_5859_11563]",[Prag],[],,[1436 Dez. 5],[14361205],[],[],[],\n \n Sigmund\n bestätigt auf An...,[],[],[]
1,"[tag:www.monasterium.net,2011:/charter/HU-MNL-DLCSLP/IBRCSQ89/124730]",[],[Latin],,[1487-11-23],[14871123],[],[],[],"Bátori István tudatja a leleszi konventtel, hogy Beken-i Miklós nevében pert indítottak a név sz...",[],[http://archives.hungaricana.hu/tile/thumb/charters/olpic/062400/DL_062364/DL_062364_orig_r.ecw/...,[]
2,"[tag:www.monasterium.net,2011:/charter/Laureshamensis/91450cbf-fa50-4437-8464-64fee7b22ae0]",[],[],MDCCLXXXIV. Donatio Ruthardi in eodem vico. p-go in Dei nomine Ruthard dono ad sanctum Nazari...,[99999999],[99999999],[],[],[],,[],[00000322.png],[]
3,"[tag:www.monasterium.net,2011:/charter/IT-ASC/Boll/437]",[Roma],[],,[1670 IX 01],[16700901],[],[],[],"(v. Istr., 884).",[],[],[]
4,"[tag:www.monasterium.net,2011:/charter/AT-HHStA/TullnOP/1294_III_25]",[],[Deutsch],,[25. März 1294],[12940325],[],[],[],"Hugo von Houzental verzichtet zu Gunsten des Frauen Klosters zu Tulln auf alle Ansprüche, auf et...",[],"[K.._MOM-Bilddateien._~TullnOPjpgweb._~HHStA_tu_12940325.jpg, K.._MOM-Bilddateien._~TullnOPjpgwe...",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,"[tag:www.monasterium.net,2011:/charter/CZ-SOAT/CizyStatky/770]",[V Ložnici?],[],,"[1423,květen 25.]",[14230525],[],[],[],"Král Zikmund se vzdává ve prospech Oldricha z Rožmberka odúmrtního práva na Vildštejn, veno Eliš...",[],"[SOAT-CiziStatkyTrebon_14230525_0770_r.jpg, SOAT-CiziStatkyTrebon_14230525_0770_v.jpg]",[]
9996,"[tag:www.monasterium.net,2011:/charter/AT-StiASF/StFlorianCanReg/1591_I_07]",[],[],,[],[],[7. Jänner 1591],[15910107],[15910107],"Verrechnung, was durch die 1588 bewilligte geistliche Kontribution an Schulden getilgt worden, i...",[],[],[]
9997,"[tag:www.monasterium.net,2011:/charter/AT-AES/Urkunden/3060]",[Oetting],[],,[],[],[23. November 1488],[14881123],[14881123],"Anna, Frau des Otto Aushofers zu Midling, verzichten zugunsten ihres ...",[],"[AES_14881123_ReiheD-522_v.jpg, AES_14881123_ReiheD-522_r.jpg]",[]
9998,"[tag:www.monasterium.net,2011:/charter/RhenoMosellanusII/12c9510b-3105-4a42-85d3-a2f4bd6b4649]",[],[],Ego Jubannes de Langiunawe miles aduocatus de Wal- risheim nolum fucio vniuersis quod\n ...,[99999999],[99999999],[],[],[],"Ofiglet* trot'fttn bem Stifter 3 » b a n n fîangenau, SSoaf }u 5Baf* feribfim,\n ...",[],[],"[00000236.png, 00000237.png]"


# Export dataframe

In [None]:
# Timestamp with minute resolution; it becomes part of both export file names.
timemarker = format(datetime.today(), '%Y-%m-%d-%H%M')

# Write the same frame once as JSON and once as Parquet.
json_target = f"../data/output/charters_{timemarker}.json"
parquet_target = f"../data/output/charters_{timemarker}.parquet"
contents_full.to_json(json_target)
contents_full.to_parquet(parquet_target)

# Import generated data as dataframe

In [None]:
# NOTE(review): "x.json" / "x.parquet" look like placeholders for the
# timestamped files written by the export cell - confirm before running.
json_source = "../data/output/x.json"
parquet_source = '../data/output/x.parquet'

contents_full_json = pd.read_json(json_source)
contents_full_parquet = pd.read_parquet(parquet_source, engine='pyarrow')

# check structure of pickle

In [4]:
# Load and print the pickled index (URL -> stored path, per the printed output).
# NOTE(review): the absolute, user-specific path only works on one machine.
# NOTE(review): binding the result to `map` shadows the builtin map() for the
# rest of the session; the name is kept because later cells may rely on it.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# trusted files.
url2path_pickle = '/home/atzenhofer/data/didip/tmp/data/leech/url2path_idx.pickle'
with open(url2path_pickle, 'rb') as f:
    map = pickle.load(f)

pprint.pprint(map)

{'https://www.monasterium.net/mom/AT-ADG/AGDK/2-5-3/charter': './tmp/data/leech/WhereToStore/AT-ADG/AGDK/cded030ccff923d505f3d6b305c7000f',
 'https://www.monasterium.net/mom/AT-ADG/AGDK/97-23-6/charter': './tmp/data/leech/WhereToStore/AT-ADG/AGDK/793854dfc42db8230321b0ba74b61359',
 'https://www.monasterium.net/mom/AT-ADG/AGDK/X-3-7/charter': './tmp/data/leech/WhereToStore/AT-ADG/AGDK/440903d445b953b6ac1d057b5d3a5fb3',
 'https://www.monasterium.net/mom/AT-ADG/AGDK/Zweinitz_18/charter': './tmp/data/leech/WhereToStore/AT-ADG/AGDK/516cade94d3b1b7c06da5b90bcc8aafc',
 'https://www.monasterium.net/mom/AT-AES/Urkunden/1080/charter': './tmp/data/leech/WhereToStore/AT-AES/Urkunden/df9864829a249bf0edb2f8fd68023a7f',
 'https://www.monasterium.net/mom/AT-AES/Urkunden/1704/charter': './tmp/data/leech/WhereToStore/AT-AES/Urkunden/e638ee40525ee5cdd31dd78b187ad904',
 'https://www.monasterium.net/mom/AT-AES/Urkunden/2217/charter': './tmp/data/leech/WhereToStore/AT-AES/Urkunden/f62b10e50b50c5a0e87aafc8d9