# General

## Imports

In [1]:
from collections import deque
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import pandas as pd
import pickle
import pprint
import random

## Parameters

In [2]:
# Let pandas show up to 100 characters per cell before truncating column content in rendered frames.
pd.set_option('display.max_colwidth', 100)

In [4]:
# Prefix -> namespace-URI table handed to the XPath queries in this notebook.
# NOTE(review): the loaded mapping also lists paths using an `app:` prefix, which is not
# declared here -- querying those paths would need an additional entry. Confirm the URI first.
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
# Root directory that is scanned (recursively) for charter files.
directoryPath = "../data/db/mom-data/metadata.charter.public"
# Only files whose name ends with this suffix are collected.
fileExtension = ".cei.xml"

# Create .xml mapping

In [None]:
# TODO: add the mapping-creation step here (see mapping.ipynb)

# Load .xml mapping

In [5]:
# Load the pickled dictionary that maps element/attribute names to XPath expressions.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load mapping
# files that come from a trusted source.
with open('../data/mapping/CEI2CSV.pkl', 'rb') as f:
    mapping = pickle.load(f)

# Pretty-print the mapping so the available keys can be looked up while writing queries.
pprint.pprint(mapping)

{'@from': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:issued/cei:dateRange/@from',
 '@id': '/atom:entry/atom:content/cei:text/@id',
 '@key': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:abstract/cei:persName/@key',
 '@n': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:witnessOrig/cei:archIdentifier/cei:idno/@n',
 '@old': '/atom:entry/atom:content/cei:text/cei:body/cei:idno/@old',
 '@target': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:diplomaticAnalysis/cei:p/cei:ref/@target',
 '@to': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:issued/cei:dateRange/@to',
 '@type': '/atom:entry/atom:content/@type',
 '@url': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:witnessOrig/cei:figure/cei:graphic/@url',
 '@value': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:issued/cei:date/@value',
 'app:control': '/atom:entry/app:control',
 'app:draft': '/atom:entry/app:control/app:draft',
 'atom:author': '/atom:en

# Scan directory and create paths

In [6]:
def get_file_paths(directory, extension=None):
    """Recursively yield the files below ``directory`` whose name ends with ``extension``.

    Parameters
    ----------
    directory : str or os.PathLike
        Directory to scan; sub-directories are descended into.
    extension : str, optional
        File-name suffix to match. When omitted, the notebook-level
        ``fileExtension`` is used.

    Yields
    ------
    pathlib.Path
        One path per matching file, in the order ``os.scandir`` reports them.
    """
    if extension is None:
        extension = fileExtension
    # The context manager closes the scandir handle even when the consumer
    # stops iterating before the generator is exhausted.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(extension):
                yield Path(entry.path)
            elif entry.is_dir():
                yield from get_file_paths(entry.path, extension)

In [7]:
# Collect every matching file below the data directory as a PurePosixPath-formatted string.
paths = [str(PurePosixPath(charter_file)) for charter_file in get_file_paths(directoryPath)]

In [25]:
# Draw up to 1000 randomly chosen files for the extraction step.
# The size is capped at len(paths) because random.sample raises ValueError
# when asked for more items than the population contains.
paths_sample = random.sample(paths, min(1000, len(paths)))

# Create deques, query paths, and fill deques

In [28]:
# Names of the output columns, in the order in which the per-column lists are combined later on.
deques = deque([
    "atom_id",
    "cei_placeName",
    "cei_lang_MOM",
    "cei_tenor",
    "cei_date",
    "cei_date_ATTRIBUTE_value",
    "cei_dateRange",
    "cei_dateRange_ATTRIBUTE_from",
    "cei_dateRange_ATTRIBUTE_to",
    "cei_abstract",
    "cei_abstract_foreign",
    "cei_graphic_ATTRIBUTE_url_orig",
    "cei_graphic_ATTRIBUTE_url_copy",
])
# One fresh list per column name; every processed file appends exactly one entry to each list.
(
    atom_id,
    cei_placeName,
    cei_lang_MOM,
    cei_tenor,
    cei_date,
    cei_date_ATTRIBUTE_value,
    cei_dateRange,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_abstract,
    cei_abstract_foreign,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
) = ([] for _ in range(len(deques)))


def get_xpath_result(xpath_mapping):
    """Evaluate an XPath expression against the document held in the global ``tree``.

    Parameters
    ----------
    xpath_mapping : str
        XPath expression; its prefixes are resolved through the notebook-level
        ``namespaces`` dictionary.

    Returns
    -------
    The value produced by ``tree.xpath`` for the expression (a list of matches
    for the text and attribute queries issued in the loop below).
    """
    return tree.xpath(xpath_mapping, namespaces=namespaces, smart_strings=False)


for file in paths_sample:
    # etree.parse reads the document straight from the path, so no separate
    # open() call (and no unused file handle) is needed.
    tree = etree.parse(file)
    atom_id.append(get_xpath_result(f"{mapping['atom:id']}/text()"))
    # NOTE(review): the predicate [not(self::cei:sup)] is applied to text nodes, which are
    # never cei:sup elements, so it filters nothing. If text inside cei:sup was meant to be
    # dropped, something like [not(ancestor::cei:sup)] would be required -- confirm before
    # changing, because it alters the extracted abstract/tenor text.
    cei_abstract.append("".join(get_xpath_result(f"{mapping['cei:abstract']}/descendant-or-self::text()[not(self::cei:sup)]")))
    cei_abstract_foreign.append(get_xpath_result(f"{mapping['cei:foreign']}/text()"))
    cei_tenor.append("".join(get_xpath_result(f"{mapping['cei:tenor']}/descendant-or-self::text()[not(self::cei:sup)]")))
    cei_placeName.append(get_xpath_result(f"{mapping['cei:issued/cei:placeName']}/text()"))
    cei_lang_MOM.append(get_xpath_result(f"{mapping['cei:lang_MOM']}/text()"))
    cei_date.append(get_xpath_result(f"{mapping['cei:date']}/text()"))
    cei_dateRange.append(get_xpath_result(f"{mapping['cei:dateRange']}/text()"))
    cei_date_ATTRIBUTE_value.append(get_xpath_result(f"{mapping['cei:date']}/@value"))
    cei_dateRange_ATTRIBUTE_from.append(get_xpath_result(f"{mapping['cei:dateRange']}/@from"))
    cei_dateRange_ATTRIBUTE_to.append(get_xpath_result(f"{mapping['cei:dateRange']}/@to"))
    cei_graphic_ATTRIBUTE_url_orig.append(get_xpath_result(f"{mapping['@url']}"))
    cei_graphic_ATTRIBUTE_url_copy.append(get_xpath_result(f"{mapping['cei:graphic/@url']}"))

# Merge deques to dataframe

In [11]:
# generate dynamic dictionary content for renaming frames
# (enumerate already supplies the running index, so no manual counter update is needed)
for i, item in enumerate(deques):
    print(f"{i}:'{item}',")

0:'atom_id',
1:'cei_placeName',
2:'cei_lang_MOM',
3:'cei_tenor',
4:'cei_date',
5:'cei_date_ATTRIBUTE_value',
6:'cei_dateRange',
7:'cei_dateRange_ATTRIBUTE_from',
8:'cei_dateRange_ATTRIBUTE_to',
9:'cei_abstract',
10:'cei_abstract_foreign',
11:'cei_graphic_ATTRIBUTE_url_orig',
12:'cei_graphic_ATTRIBUTE_url_copy',


In [12]:
# Combine the per-column lists row-wise: each tuple holds the values extracted from one file.
contents = list(zip(
    atom_id,
    cei_placeName,
    cei_lang_MOM,
    cei_tenor,
    cei_date,
    cei_date_ATTRIBUTE_value,
    cei_dateRange,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_abstract,
    cei_abstract_foreign,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
))
# Take the column names straight from `deques` (same order as the zip above) instead of a
# hand-copied index -> name dictionary, so the two cannot drift apart; this also yields
# correctly named columns when `contents` is empty.
contents_full = pd.DataFrame(contents, columns=list(deques))

In [13]:
# Bare last expression: the notebook renders the assembled frame as this cell's output.
contents_full

Unnamed: 0,atom_id,cei_placeName,cei_lang_MOM,cei_tenor,cei_date,cei_date_ATTRIBUTE_value,cei_dateRange,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_abstract,cei_abstract_foreign,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy
0,"[tag:www.monasterium.net,2011:/charter/AT-DAL/Urkunden/Urkunden_PA%28Sarleinsbach%29]",[Rom],[],,[1706 Oktober 15.],[17061015],[],[],[],"Johann Ignaz Jazlauer, Pfarrer von Sarleinsbach, wird zum öfentlichen Notar ernannt.",[],[],[]
1,"[tag:www.monasterium.net,2011:/charter/HU-MNL-DLCSLA/VAYCSBERQ203/128040]",[],[],,[1494-09-23],[14940923],[],[],[],KINIZSI PÁL ORSZÁGBÍRÓ,[],[http://archives.hungaricana.hu/tile/thumb/charters/olpic/097600/DL_097523/DL_097523_01.ecw/?h=500],[]
2,"[tag:www.monasterium.net,2011:/charter/AT-NOeLA/HA_Seefeld-HardeggerUrk/Hardegger_Urk_1533]",[Wien],[],,[],[],[1642 April 23.],[16420423],[16420423],"Die n.ö. Regierung teilt dem Grafen Julius zu Hardegg mit, dass Seine Majestät sich gnädigst res...",[],"[NÍLA-HA-Seefeld_Hardegger-Urk_1533r.jpg, NÍLA-HA-Seefeld_Hardegger-Urk_1533v.jpg]",[]
3,"[tag:www.monasterium.net,2011:/charter/AggOCart/1370_X_08]",[],[Deutsch],,[8. Oktober 1370],[13701008],[],[],[],"Hainreich der Huglingêr verschreibt mit des Lehensherrn Handen, des Herrn Haydenreich von Meissa...","[Hainreich der Huglingêr, Haydenreich von Meissaw, Elspet, (gelts), u, bertewrung, u, bertewrung...",[],[]
4,"[tag:www.monasterium.net,2011:/charter/HU-MNL-DLCSLP/ESTCSQ67/48927]",[],[],,[1388-06-20],[13880620],[],[],[],ZSIGMOND KIRÁLY,[],[http://archives.hungaricana.hu/tile/thumb/charters/olpic/087600/DL_087584/DL_087584_ceteri.ecw/...,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"[tag:www.monasterium.net,2011:/charter/AT-HHStA/SbgE/AUR_1423-1448.28]",[],[],,[],[],[14. Februar 1447],[14470214],[14470214],"Schreiben des Stadtrichters Jörg Schöttel betreffend die Untersuchung eines Todschlags, der von ...",[],"[HHSTA_Salzburg_1423-1448_00_00-6.jpg, HHSTA_Salzburg_1423-1448_00_00-6v.jpg]",[]
96,"[tag:www.monasterium.net,2011:/charter/AFM/1.5.608]",[],[],[23. Dezember 1637] Die 23. Mag. Nicolaus Hoff man pro prima laurea\n theses de sympt...,[23. Dezember 1637],[16371223],[],[],[],Sitzung vom 23. Dezember 1637,[],[],[]
97,"[tag:www.monasterium.net,2011:/charter/SK-SNA/LErd/79]",[],[latinský],,[11.12.1498],[14981211],[],[],[],Jágerská kapitula podáva reláciu na mandát palatína Štefan Zápoľského týkajúcu sa vyšetrovania r...,[],"[1111_00484_00024778_0001_1.jpg, 1111_00484_00024779_0002_1.jpg]",[]
98,"[tag:www.monasterium.net,2011:/charter/UrkundenBehrI/e066d078-12e8-4dbf-88ad-07b7d6d05bd9]",[],[],"In nomine domini, Amen. Otto dei gratia dux etc., neenon Wartislaus eiusdem\n ...",[ D. d. 1320. Aug. 23. ],[00010101],[],[],[],"Die Herzoge Otto, Wartislav und Barnim verleihen den Städten Stettin,\n Piri...",[],[],"[00000505.png, 00000506.png, 00000507.png]"


# Export dataframe

In [14]:
# Local timestamp with minute resolution, embedded in the export file names.
timemarker = datetime.today().strftime('%Y-%m-%d-%H%M')
# Write the same frame twice: once as JSON and once as Parquet.
contents_full.to_json(f"../data/output/charters_{timemarker}.json")
contents_full.to_parquet(f"../data/output/charters_{timemarker}.parquet")

# Import generated data as dataframe

In [None]:
# Read a previously written export back into DataFrames (JSON and Parquet variants).
# NOTE(review): "x" looks like a placeholder -- the export cell writes files named
# charters_<timemarker>.json / .parquet; substitute the actual file name before running.
contents_full_json = pd.read_json("../data/output/x.json")
contents_full_parquet = pd.read_parquet('../data/output/x.parquet', engine='pyarrow')