# General

## Imports

In [2]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import pandas as pd
import numpy as np
import pickle
import re
from pprint import pprint
from random import sample

## Parameters

In [2]:
pd.set_option('display.max_colwidth', 100)

In [None]:
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
directoryPath = "../data/db/mom-data/metadata.charter.public"
fileExtension = ".cei.xml"

# Load .xml mapping

In [None]:
with open('../data/mapping/CEI2CSV.pkl', 'rb') as f:
    mapping = pickle.load(f)

pprint(mapping)

# Scan directory and create paths

In [None]:
def get_file_paths(directory):
    for entry in os.scandir(directory):
        if entry.is_file() and entry.name.endswith(fileExtension):
            yield Path(entry.path)
        elif entry.is_dir():
            yield from get_file_paths(entry.path)
        else:
            continue

In [None]:
paths = [f"{PurePosixPath(path)}" for path in get_file_paths(directoryPath)]

In [None]:
#paths_sample = sample(paths, 30000)

# Create deques, query paths, and fill deques

In [None]:
def get_xpath_result(xpath_mapping):
    return tree.xpath(xpath_mapping, namespaces = namespaces, smart_strings = False)

In [None]:
lists = ["atom_id", "cei_abstract_joined", "cei_abstract_foreign", "cei_tenor_joined", "cei_pTenor", "cei_tenor_appLem", "cei_placeName", "cei_lang_MOM", "cei_date", "cei_dateRange", "cei_date_ATTRIBUTE_value", "cei_dateRange_ATTRIBUTE_from", "cei_dateRange_ATTRIBUTE_to", "cei_graphic_ATTRIBUTE_url_orig", "cei_graphic_ATTRIBUTE_url_copy"]
atom_id, cei_abstract_joined, cei_abstract_foreign, cei_tenor_joined, cei_pTenor, cei_tenor_appLem, cei_placeName, cei_lang_MOM, cei_date, cei_dateRange, cei_date_ATTRIBUTE_value, cei_dateRange_ATTRIBUTE_from, cei_dateRange_ATTRIBUTE_to, cei_graphic_ATTRIBUTE_url_orig, cei_graphic_ATTRIBUTE_url_copy = ([] for i in range(len(lists)))

In [None]:
for file in paths:
    with open(file, "r", encoding="utf-8") as f:
        tree = etree.parse(f)
        atom_id.append(get_xpath_result(f"{mapping['atom:id']}/text()"))
        cei_abstract_joined.append("".join(get_xpath_result(f"{mapping['cei:abstract']}/descendant-or-self::node()[not(name()='cei:sup')]/text()")))
        cei_abstract_foreign.append(get_xpath_result(f"{mapping['cei:foreign']}/text()"))
        cei_tenor_joined.append("".join(get_xpath_result(f"{mapping['cei:tenor']}/descendant-or-self::node()[not(name()='cei:sup')]/text()")))
        cei_pTenor.append(get_xpath_result(f"{mapping['cei:pTenor']}/text()[not(self::cei:sup)]"))
        cei_tenor_appLem.append(get_xpath_result(f"{mapping['cei:tenor']}/descendant-or-self::node()/node()"))
        cei_placeName.append(get_xpath_result(f"{mapping['cei:issued/cei:placeName']}/text()"))
        cei_lang_MOM.append(get_xpath_result(f"{mapping['cei:lang_MOM']}/text()"))
        cei_date.append(get_xpath_result(f"{mapping['cei:date']}/text()"))
        cei_dateRange.append(get_xpath_result(f"{mapping['cei:dateRange']}/text()"))
        cei_date_ATTRIBUTE_value.append(get_xpath_result(f"{mapping['cei:date']}/@value"))
        cei_dateRange_ATTRIBUTE_from.append(get_xpath_result(f"{mapping['cei:dateRange']}/@from"))
        cei_dateRange_ATTRIBUTE_to.append(get_xpath_result(f"{mapping['cei:dateRange']}/@to"))
        cei_graphic_ATTRIBUTE_url_orig.append(get_xpath_result(f"{mapping['@url']}"))
        cei_graphic_ATTRIBUTE_url_copy.append(get_xpath_result(f"{mapping['cei:graphic/@url']}"))

# Merge deques to dataframe

In [None]:
# generate dynamic dictionary content for renaming frames
for i, item in enumerate(lists):
    print(f"{i}:'{item}',")
    i =+ 1

In [None]:
contents = list(zip(atom_id, cei_abstract_joined, cei_abstract_foreign, cei_tenor_joined, cei_pTenor, cei_placeName, cei_lang_MOM, cei_date, cei_dateRange, cei_date_ATTRIBUTE_value, cei_dateRange_ATTRIBUTE_from, cei_dateRange_ATTRIBUTE_to, cei_graphic_ATTRIBUTE_url_orig, cei_graphic_ATTRIBUTE_url_copy))
contents_full = pd.DataFrame(contents).rename(
    columns={ 0:'atom_id',
1:'cei_abstract_joined',
2:'cei_abstract_foreign',
3:'cei_tenor_joined',
4:'cei_pTenor',
5:'cei_placeName',
6:'cei_lang_MOM',
7:'cei_date',
8:'cei_dateRange',
9:'cei_date_ATTRIBUTE_value',
10:'cei_dateRange_ATTRIBUTE_from',
11:'cei_dateRange_ATTRIBUTE_to',
12:'cei_graphic_ATTRIBUTE_url_orig',
13:'cei_graphic_ATTRIBUTE_url_copy'})

In [None]:
contents_full

In [None]:
test = contents_full.cei_tenor_appLem

In [None]:
timemarker = datetime.today().strftime('%Y-%m-%d-%H%M')
test.to_csv(f"../data/output/tests/test_charters_{timemarker}.csv")

# Export whole dataframe

In [None]:
timemarker = datetime.today().strftime('%Y-%m-%d-%H%M')
contents_full.to_json(f"../data/output/charters_{timemarker}.json")
contents_full.to_parquet(f"../data/output/charters_{timemarker}.parquet")

---

# Import generated data as dataframe

In [4]:
#contents_full_json = pd.read_json("../../data/output/charters_2022-08-31-2322.json")
contents_full_parquet = pd.read_parquet('../../data/output/charters_2022-09-18-2211.parquet', engine='pyarrow')

In [6]:
contents_full_json.sample(n=10)

Unnamed: 0,atom_id,cei_placeName,cei_lang_MOM,cei_tenor,cei_date,cei_date_ATTRIBUTE_value,cei_dateRange,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_abstract,cei_abstract_foreign,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy
611258,"[tag:www.monasterium.net,2011:/charter/OOEUB/1...",[Gemvnde (Gmunden?)],[Latein],(Chrismon.) In nomine sanctae et indiuiduae tr...,[],[],[16. Juni 1049],[10490616],[10490616],K. Heinrich III. schenkt dem Bischof Engelber ...,[],[],[]
14778,"[tag:www.monasterium.net,2011:/charter/AT-DAL/...",[Passau],[],,[1648 Juni 4.],[16480604],[],[],[],"Erzherzog Leopold Wilhelm, Bischof von Passau,...",[],"[DAL_16480604_UrkE-11_086_r.jpg, DAL_16480604_...",[]
344626,"[tag:www.monasterium.net,2011:/charter/HU-MNL-...",[],[Latin],,[1448-03-16],[14480316],[],[],[],"Herepe-i Márk, Somkerek-i Erdely Miklós és Bal...",[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
57468,"[tag:www.monasterium.net,2011:/charter/AT-KLA/...",[Prag],[],,[19.02.1547],[15470219],[],[],[],König Ferdinand bestätigt der Stadt St. Veit a...,[],"[KLA_418-B_A 2180 F_15470219_r-1.JPG, KLA_418-...",[]
353362,"[tag:www.monasterium.net,2011:/charter/HU-MNL-...",[],[],,[1477-11-25],[14771125],[],[],[],"Mátyás király Győr megye alispánjához, vagy al...",[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
611183,"[tag:www.monasterium.net,2011:/charter/Nikolaa...",[],[],7 October 1517. Kaerle willigt het verzoek i...,[ 7 October 1517. ],[00010101],[],[],[],Kaerle bij der gracie Gods coninck van Cas- t...,[],[],[00000163.png]
625943,"[tag:www.monasterium.net,2011:/charter/RIXIii/...",[Konstanz],[],,[1431 Jan. 8],[14310108],[],[],[],\n \n ...,[],[],[]
555212,"[tag:www.monasterium.net,2011:/charter/IT-ASFi...","[nella casa di Fuccio di Guzzio, nel luogo det...",[],,[],[],[1329 Maggio 23],[13290523],[13290523],,[],[],[]
535205,"[tag:www.monasterium.net,2011:/charter/IT-ASFi...",[nel chiostro di detto monastero [S. Gaudenzio]],[],,[],[],[1268 Marzo 20],[12680320],[12680320],,[],[],[]
575991,"[tag:www.monasterium.net,2011:/charter/IT-ASFi...",[Pistoia],[],,[],[],[1259 Marzo 1],[12590301],[12590301],,[],[],[]


In [6]:
contents_full_parquet

Unnamed: 0,atom_id,cei_abstract_joined,cei_abstract_foreign,cei_tenor_joined,cei_pTenor,cei_placeName,cei_lang_MOM,cei_date,cei_dateRange,cei_date_ATTRIBUTE_value,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy
0,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0632d68f-f907-4113-8...",Fede del miracolo di S. Giovanni Battista nella Chiesa di S. Liguoro.,[],,[],[Napoli],[],[],[1578 agosto 29],[],[15780829],[15780829],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
1,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0e3d1203-0e72-4b17-8...","Palermo de Caserta, figlio dei defunti Giovanni e Sarracina, abitante di Calvizzano, ottiene in ...",[],+In nomine domini dei salvatoris nostri Iuesu Christi. Anno ab incarnationis eius millesimo duce...,[],[Napoli],[],[],[1268 ottobre 23],[],[12681023],[12681023],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
2,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0eb5100f-8132-48fd-a...",Privigelio di Sergio console e duca in favore del monastero di S. Liguoro.,[],,[],[Napoli],[],[],[1127 febbraio 16],[],[11270216],[11270216],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
3,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/16bfa2f5-9171-4d18-b...","Mattea Luca Caracciolo, vesc. di Lesina concede alcuni suoi beni al monastero di S. Liguoro.",[],,[],[Napoli],[],[],[1518 marzo 3],[],[15180303],[15180303],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
4,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/16c37112-ba89-4c4c-8...",Il monastero di S. Liguoro concede a Giovan Battista Grisono che venda una masseria in tenimento...,[],,[],[Napoli],[],[],[1559 settembre 11],[],[15590911],[15590911],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659165,"[tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/faf66a3f-d54b-4439-a443-65cd...",König Konrad III. bestätigt der Kirche in Speier alle von seinen Vorfahren und\n ...,[],"CCCXIV. In nomine sánete et individué trinitatis. Cunradus, divina favente clemencia Romanorum...","[ CCCXIV. , In nomine sánete et individué trinitatis. Cunradus, divina favente clemencia Romano...",[],[],[99999999],[],[99999999],[],[],[],"[00000042.png, 00000043.png]"
659166,"[tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/fb51a6c4-bde7-4df7-86c7-ee6d...",Kaiser Heinrich VI. nimmt das Kloster Herrenalb mit den durch Kauf von Eberhard\n ...,[],"CDXCV. Chr. I In nomine sánete et individué trinitatis. Heinricus sextus, divina faveutc\n ...","[ CDXCV. , Chr. I In nomine sánete et individué trinitatis. Heinricus sextus, divina faveutc\n ...",[],[],[99999999],[],[99999999],[],[],[],"[00000338.png, 00000339.png, 00000340.png]"
659167,"[tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/fc9ffa72-79b1-4029-8eb5-a535...","Bischof Günther von Speier erwirbt und übergiebt die Villa Eilfingen, unter\n ...",[],"CCCLXVII. 1159. ! In nominç sanctç et individuc trinitatis. j Gimtherus, dei gratia Spirensis...","[ CCCLXVII. , 1159. , ! In nominç sanctç et individuc trinitatis. j Gimtherus, dei gratia Spir...",[],[],[99999999],[],[99999999],[],[],[],"[00000149.png, 00000150.png, 00000151.png]"
659168,"[tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/fe45e235-cb7f-4ee3-ae89-6e10...","Bischof Günther von Speier verordnet, dass das Kloster Maulbronn aus seinem in\n ...",[],"CCCXXXV. 1152. j In nomine sanctç et individué, trinitatis. l Ego Guntherus, dei gratia sanct...","[ CCCXXXV. , 1152. , j In nomine sanctç et individué, trinitatis. l Ego Guntherus, dei gratia ...",[],[],[99999999],[],[99999999],[],[],[],"[00000085.png, 00000086.png]"


In [5]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(contents_full_parquet.iloc[1].to_string)

<bound method Series.to_string of atom_id                           [tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0e3d1203-0e72-4b17-8...
cei_abstract_joined               Palermo de Caserta, figlio dei defunti Giovanni e Sarracina, abitante di Calvizzano, ottiene in ...
cei_abstract_foreign                                                                                                               []
cei_tenor_joined                  +In nomine domini dei salvatoris nostri Iuesu Christi. Anno ab incarnationis eius millesimo duce...
cei_pTenor                                                                                                                         []
cei_placeName                                                                                                                [Napoli]
cei_lang_MOM                                                                                                                       []
cei_date                    