# Transformation of cei.xml into tabular data (monasterium-files)

# 👑 get atomID and graphicURL into .csv from (all) collections ✔
- includes subdirectories
- fast, still takes several minutes for whole mom-dump

In [None]:
from pathlib import Path
from pathlib import PurePath
import xml.etree.ElementTree as ET
import pandas as pd

In [None]:
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
directoryPath = '\\\?/'+'C://Users/atzenhof/playground/GitHub/didip/data/db/mom-data/metadata.charter.public/' # escape needed for subdirectory paths longer than system allows for
fileExtension = ('*.cei.xml')

In [None]:
atomIDs = []
attributes = []

for file in Path(directoryPath).rglob(fileExtension):
    tree = ET.parse(file) # parsing does not consider utf-8 specific symbols (such as long dash); needs postcorrection
    root = tree.getroot()
    for tag in root.findall('.//cei:graphic', namespaces):
        atomID = root[0].text
        atomIDs.append(atomID)
        if 'url' in tag.attrib:
            attribute = tag.attrib['url'] 
            attributes.append(attribute)
        else:
            continue

In [None]:
img_list = list(zip(atomIDs, attributes))
df = pd.DataFrame(img_list).rename(columns={0: 'atomID', 1: 'url'})
df

In [None]:
pathname = PurePath(directoryPath).name
df.to_csv(f'../data/output/{pathname}.csv', index=False)
#df_links = df[df['url'].str.contains("http://")]
#df_links.to_csv(f'links_{pathname}.csv', index=False)

# 👑 transform referential img-.xml file(s) into csv using lxml ✔

In [None]:
from pathlib import PurePath
from pathlib import Path
from lxml import etree #lxml since xml.etree.ElementTree does not have full xpath support (no getparent() after using find/findall())
import pandas as pd

In [None]:
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
directoryPath = '\\\?/'+'C://Users/atzenhof/playground/GitHub/didip/data/images_xml' # path escape due to long path
fileExtension = ('*.xml')

In [None]:
atomIDs = []
image_links = []

for file in Path(directoryPath).rglob(fileExtension):
    tree = etree.parse(str(file)) # requres conversion to str since lxml does not vibe with windowspath
    root = tree.getroot()
    for img in root.findall('.//img', namespaces):
        atomID = img.getparent().attrib['id']
        atomIDs.append(atomID)
        image_link = img.attrib['src']
        image_links.append(image_link)

In [None]:
img_list = list(zip(atomIDs, image_links))
df = pd.DataFrame(img_list).rename(columns={0: 'atomID', 1: 'url'})
df

In [None]:
pathname = PurePath(directoryPath).name
df.to_csv(f'../data/output/{pathname}.csv', index=False)

# lists atomIDs of all charters in folder [mom-dump] including images
## to do
- add exception for image link (if it is empty)

In [None]:
from pathlib import Path
from pathlib import PurePath
import xml.etree.ElementTree as ET
import pandas as pd

In [None]:
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
directoryPath = '\\\?/'+'C://Users/atzenhof/playground/GitHub/didip/data/db_subset_for_test' # escape needed for subdirectory paths longer than system allows for
fileExtension = ('*.cei.xml')

In [None]:
print(directoryPath)

In [None]:
atomIDs = []
attributes = []

for file in Path(directoryPath).rglob(fileExtension):
    tree = ET.parse(file) # parsing does not consider utf-8 specific symbols (such as long dash); needs postcorrection
    root = tree.getroot()
    for tag in root.findall('.//cei:graphic', namespaces):
        atomID = root[0].text
        atomIDs.append(atomID)
        if 'url' in tag.attrib:
            attribute = tag.attrib['url'] 
            attributes.append(attribute)
        else:
            attribute = ""
            attributes.append(attribute)

In [None]:
img_list = list(zip(atomIDs, attributes))
df = pd.DataFrame(img_list).rename(columns={0: 'atomID', 1: 'url'})
df

bei else: continue .. 12406 rows

bei else: attribute = "" .. 12405 rows

In [None]:
pathname = PurePath(directoryPath).name
df.to_csv(f'../data/output/{pathname}.csv', index=False)

df_noURLs = df[df['url'].astype(bool)]

df_noURLs.to_csv(f'../data/output/{pathname}_nourls.csv'
    f'{pathname}_noURLs.csv', index=False)

# filter by image at monasterium

In [None]:
import pandas as pd

#df_links = df[df['url'].str.contains("http://")]
#df_links.to_csv(f'links_{pathname}.csv', index=False)

In [None]:
df = pd.read_csv('../data/output/metadata.charter.public.csv')

In [None]:
df

In [None]:
df_withMomLinks = df[df['url'].str.contains("images.monasterium.net", na=False)]
df_withMomLinks.to_csv(f'../data/output/df_withMomLinks.csv', index=False)

In [None]:
df_withMomLinks

# transform transcriptions-ref using lxml

## to do
- maybe when using xpath make them more concrete (with subdirectories) (fix redundancies in iteration; use iterator and maybe memory clearing)
- discuss distinction between pTenor and Tenor (so either there are multiple ptenors to be concatenated or there is one single tenor)
- normalize (with regex) years based on multiple date tags given in the data dump
- write dynamic code that distinguishes between tags and attributes (using dynamic list creation, making it easier to choose elements that are desired; see https://stackoverflow.com/questions/23999801/creating-multiple-lists)
- create somewhat normalized and performant mapping between element names in cei and csv/dict (e.g. tenor_content for cei:tenor/ptenor etc.)

## collection (to optimize)

In [41]:
from pathlib import PurePath
from pathlib import Path
from lxml import etree
import pandas as pd

In [42]:
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
#directoryPath = '\\\?/'+'C://Users/atzenhof/playground/GitHub/didip/data/db_subset_for_test/transcriptions-ref'# escape needed for subdirectory paths longer than system allows for
directoryPath = '\\\?/'+'C://Users/atzenhof/playground/GitHub/didip/data/db_subset_for_test/transcriptions-ref/AFM/'# escape needed for subdirectory paths longer than system allows for
#directoryPath = '\\\?/'+'C://Users/atzenhof/playground/GitHub/didip/data/db/mom-data/metadata.charter.public'# escape needed for subdirectory paths longer than system allows forfileExtension = ('*.cei.xml')
fileExtension = ('*.cei.xml')

In [43]:
atom_IDs = []
tenor_contents = []
pTenor_contents = []
places = []
languages = []

# tags_of_interest = ["hello", "test", "wow"]
# attributes_of_interest = []

In [44]:
#xpath lxml
for file in Path(directoryPath).rglob(fileExtension):
    tree = etree.parse(str(file))
    atom_IDs.append(tree.xpath("//atom:id/text()", namespaces = namespaces))
    places.append(tree.xpath(".//cei:placeName/text()", namespaces = namespaces))
    languages.append(tree.xpath(".//cei:lang_MOM/text()", namespaces = namespaces))
    tenor_contents.append(tree.xpath("//cei:tenor/text()", namespaces = namespaces))
    pTenor_contents.append(tree.xpath("//cei:pTenor/text()", namespaces = namespaces))

In [45]:
tenor_list = list(zip(atom_IDs, places, languages, tenor_contents, pTenor_contents))
df = pd.DataFrame(tenor_list).rename(columns={0: "atom_IDs", 1: "places", 2: "languages", 3:"tenor_contents", 4: "pTenor_contents"})

In [46]:
df

Unnamed: 0,atom_IDs,places,languages,tenor_contents,pTenor_contents
0,"[tag:www.monasterium.net,2011:/charter/AFM/1.1.1]",[],[],[],[Anno domini millesimo trecentesimo nonagesimo...
1,"[tag:www.monasterium.net,2011:/charter/AFM/1.1...",[],[],[],[Placuit doctoribus nostris omnibus et singuli...
2,"[tag:www.monasterium.net,2011:/charter/AFM/1.1...",[],[],[],"[In vigilia Corporis Christi hora 6, post cen..."
3,"[tag:www.monasterium.net,2011:/charter/AFM/1.1...",[],[],[],[In die autem Corporis Christi convenerunt sim...
4,"[tag:www.monasterium.net,2011:/charter/AFM/1.1...",[],[],[],"[Die 13, Augusti fuit\n congre..."
...,...,...,...,...,...
6032,"[tag:www.monasterium.net,2011:/charter/AFM/1.9...",[],[],[],"[Die 31. Januarij: Caecilia Sch#bachin, obstet..."
6033,"[tag:www.monasterium.net,2011:/charter/AFM/1.9...",[],[],[],[Die 1. Februarij: Examinata est Theresia Cath...
6034,"[tag:www.monasterium.net,2011:/charter/AFM/1.9...",[],[],[],[Die 13. Februarij: Maria Elisabetha Pernwarti...
6035,"[tag:www.monasterium.net,2011:/charter/AFM/1.9...",[],[],[],[Die 1. Martij: Francisca Englhartin praesenta...


In [47]:
df_explode = df.explode('atom_IDs').explode('places').explode('languages').explode('tenor_contents').explode('pTenor_contents')

In [48]:
df_explode_new = df_explode.loc[df_explode['atom_IDs'] == 'tag:www.monasterium.net,2011:/charter/AFM/1.1.1']
df_explode_new.reset_index(drop=True, inplace=True)
df_explode_new['pTenor_contents']

0    Anno domini millesimo trecentesimo nonagesimo ...
1       quidam pro actis facultatis medicine comparari
2    deberet, in quem omnes doctores, licentiati, b...
3     predicto libro inscriptos prefate\n          ...
4                                                 Item
5     in eadem congregatione\n                concl...
6     antequam ad eandem per decanum facultatis pre...
Name: pTenor_contents, dtype: object

In [28]:
pathname = PurePath(directoryPath).name
df_explode.to_csv(f'../data/output/tenor_test_explode_{pathname}.csv', index=False)

In [None]:
atomIDs = []
tenor_contents = []
places = []
languages = []
dates_custom = []
dates_normalized = []
dates_range = []
dates_from = []
dates_to = []

for file in Path(directoryPath).rglob(fileExtension):
    tree = etree.parse(str(file)) # requires conversion to str since lxml does not vibe with windowspath
    root = tree.getroot()
    for tenor in root.findall('.//cei:tenor', namespaces):
        atomIDs.append(root[0].text)
        tenor_contents.append(etree.tostring(tenor).decode('utf-8'))
        if root.find('.//cei:placeName', namespaces) is not None:
            places.append(root.find('.//cei:placeName', namespaces).text)
        elif root.find('.//cei:placeName', namespaces) is None:
            places.append(None)
        if root.find('.//cei:lang_MOM', namespaces) is not None:
            languages.append(root.find('.//cei:lang_MOM', namespaces).text)
        elif root.find('.//cei:lang_MOM', namespaces) is None:
            languages.append(None)
        if root.find('.//cei:date', namespaces) is not None:
            dates_custom.append(root.find('.//cei:date', namespaces).text)# OMG CUSTOM CEI:date IS HORRIBLE, have fun resolving VI. Kal. April. Anno dominice incarnationis M... or anno Domini Mo. CCCo. XLIIII. Ipsa die beati J...
        elif root.find('.//cei:date', namespaces) is None:
            dates_custom.append(None)
        # if root.find('.//cei:date[@value]', namespaces) is not None:
        #     dates_normalized.append(root.find('.//cei:date', namespaces).attrib['value'])
        # elif root.find('.//cei:date[@value]', namespaces) is None:
        #     dates_normalized.append(None)
        if root.find('.//cei:dateRange', namespaces) is not None:
            dates_range.append(root.find('.//cei:dateRange', namespaces).text)
        elif root.find('.//cei:dateRange', namespaces) is None:
            dates_range.append(None)
        # if root.find('.//cei:dateRange[@from]', namespaces) is not None:
        #     dates_from.append(root.find('.//cei:dateRange', namespaces).attrib['from'])
        # elif root.find('.//cei:dateRange[@from]', namespaces) is None:
        #     dates_from.append(None)
        # if root.find('.//cei:dateRange[@to]', namespaces) is not None:
        #     dates_to.append(root.find('.//cei:dateRange', namespaces).attrib['to'])
        # elif root.find('.//cei:dateRange[@to]', namespaces) is None:
        #     dates_to.append(None)

In [None]:
tenor_list = list(zip(atomIDs, tenor_contents, places, languages, dates_custom, dates_range))
df = pd.DataFrame(tenor_list).rename(columns={0: 'atomID', 1: 'tenor_content', 2: 'place', 3: 'language', 4: 'dates_custom', 5: 'dates_normalized', 6: 'dates_range', 7: 'dates_from', 8: 'dates_to'})
df

## collection ✔

In [2]:
from pathlib import PurePath
from pathlib import Path
from lxml import etree #lxml since xml.etree.ElementTree does not have full xpath support (no getparent() after using find/findall())
import pandas as pd

In [None]:
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
#directoryPath = '\\\?/'+'C://Users/atzenhof/playground/GitHub/didip/data/db_subset_for_test/transcriptions-ref'# escape needed for subdirectory paths longer than system allows for
directoryPath = '\\\?/'+'C://Users/atzenhof/playground/GitHub/didip/data/db/mom-data/metadata.charter.public'# escape needed for subdirectory paths longer than system allows forfileExtension = ('*.cei.xml')
fileExtension = ('*.cei.xml')

In [None]:
# takes 10min+
atomIDs = []
tenor_contents = []
places = []
languages = []
dates_custom = []
dates_normalized = []
dates_range = []
dates_from = []
dates_to = []

for file in Path(directoryPath).rglob(fileExtension):
    tree = etree.parse(str(file)) # requires conversion to str since lxml does not vibe with windowspath
    root = tree.getroot()
    for tenor in root.findall('.//cei:tenor', namespaces):
        atomIDs.append(root[0].text)
        tenor_contents.append(etree.tostring(tenor).decode('utf-8'))
        if root.find('.//cei:placeName', namespaces) is not None:
            places.append(root.find('.//cei:placeName', namespaces).text)
        elif root.find('.//cei:placeName', namespaces) is None:
            places.append(None)
        if root.find('.//cei:lang_MOM', namespaces) is not None:
            languages.append(root.find('.//cei:lang_MOM', namespaces).text)
        elif root.find('.//cei:lang_MOM', namespaces) is None:
            languages.append(None)
        if root.find('.//cei:date', namespaces) is not None:
            dates_custom.append(root.find('.//cei:date', namespaces).text)# OMG CUSTOM CEI:date IS HORRIBLE, have fun resolving VI. Kal. April. Anno dominice incarnationis M... or anno Domini Mo. CCCo. XLIIII. Ipsa die beati J...
        elif root.find('.//cei:date', namespaces) is None:
            dates_custom.append(None)
        # if root.find('.//cei:date[@value]', namespaces) is not None:
        #     dates_normalized.append(root.find('.//cei:date', namespaces).attrib['value'])
        # elif root.find('.//cei:date[@value]', namespaces) is None:
        #     dates_normalized.append(None)
        if root.find('.//cei:dateRange', namespaces) is not None:
            dates_range.append(root.find('.//cei:dateRange', namespaces).text)
        elif root.find('.//cei:dateRange', namespaces) is None:
            dates_range.append(None)
        # if root.find('.//cei:dateRange[@from]', namespaces) is not None:
        #     dates_from.append(root.find('.//cei:dateRange', namespaces).attrib['from'])
        # elif root.find('.//cei:dateRange[@from]', namespaces) is None:
        #     dates_from.append(None)
        # if root.find('.//cei:dateRange[@to]', namespaces) is not None:
        #     dates_to.append(root.find('.//cei:dateRange', namespaces).attrib['to'])
        # elif root.find('.//cei:dateRange[@to]', namespaces) is None:
        #     dates_to.append(None)

In [None]:
#error für date value attribut ab AT-KLA
#error für dateRange from attribut ab AT-StiAR
#error für dateRange to attribut ab AT-StiAR

In [None]:
tenor_list = list(zip(atomIDs, tenor_contents, places, languages, dates_custom, dates_range))
df = pd.DataFrame(tenor_list).rename(columns={0: 'atomID', 1: 'tenor_content', 2: 'place', 3: 'language', 4: 'dates_custom', 5: 'dates_normalized', 6: 'dates_range', 7: 'dates_from', 8: 'dates_to'})
df

In [None]:
pathname = PurePath(directoryPath).name
df.to_csv(f'../data/output/transcriptions_{pathname}.csv', index=False)

In [3]:
df = pd.read_csv('../data/output/transcriptions_metadata.charter.public.csv')

In [121]:
df_language = df_new.loc[df['language'] == 'Latein']
df_language.reset_index(drop=True, inplace=True)
df_language

NameError: name 'df_new' is not defined

In [8]:
df

Unnamed: 0,atomID,tenor_content,place,language,dates_custom,dates_normalized
0,"tag:www.monasterium.net,2011:/charter/069622fc...","<cei:tenor xmlns:cei=""http://www.monasterium.n...",Napoli,,,1578 agosto 29
1,"tag:www.monasterium.net,2011:/charter/069622fc...","<cei:tenor xmlns:cei=""http://www.monasterium.n...",Napoli,,,1268 ottobre 23
2,"tag:www.monasterium.net,2011:/charter/069622fc...","<cei:tenor xmlns:cei=""http://www.monasterium.n...",Napoli,,,1127 febbraio 16
3,"tag:www.monasterium.net,2011:/charter/069622fc...","<cei:tenor xmlns:cei=""http://www.monasterium.n...",Napoli,,,1518 marzo 3
4,"tag:www.monasterium.net,2011:/charter/069622fc...","<cei:tenor xmlns:cei=""http://www.monasterium.n...",Napoli,,,1559 settembre 11
...,...,...,...,...,...,...
89528,"tag:www.monasterium.net,2011:/charter/Wirtembe...","<cei:tenor xmlns:cei=""http://www.monasterium.n...",,,99999999,
89529,"tag:www.monasterium.net,2011:/charter/Wirtembe...","<cei:tenor xmlns:cei=""http://www.monasterium.n...",,,99999999,
89530,"tag:www.monasterium.net,2011:/charter/Wirtembe...","<cei:tenor xmlns:cei=""http://www.monasterium.n...",,,99999999,
89531,"tag:www.monasterium.net,2011:/charter/Wirtembe...","<cei:tenor xmlns:cei=""http://www.monasterium.n...",,,99999999,


In [5]:
charter_by_language = df.groupby("language")["atomID"].count()
charter_by_place = df.groupby("place")["atomID"].count()

In [7]:
charter_by_place.sort_values(ascending=False)
#print(...to_string())

place
Wien                     2188
o. O.                    1517
St. Gallen               1026
Konstanz                  582
Rom                       469
                         ... 
Kolín (Colonia)             1
Kolín                       1
Kojetín                     1
Koetweig                    1
„apud Briderichingen“       1
Name: atomID, Length: 5847, dtype: int64

In [6]:
charter_by_language.sort_values(ascending=False)

language
Deutsch                12425
Latein                  7856
lat.                    2932
deutsch                  190
Niederdeutsch            163
                       ...  
Latein, deutsch            1
Latein, Deutsch            1
Latein+Deutsch             1
Latein und Deutsch.        1
Nieder-/Oberdeutsch        1
Name: atomID, Length: 91, dtype: int64