# Middle Low German Corpus transformation

In [1]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import pandas as pd
import re

In [2]:
# Column-width setting used when DataFrames are displayed in this notebook.
# NOTE(review): 0 is presumably meant to switch off truncation of long cells;
# current pandas documents None as the "unlimited" value — confirm.
pd.options.display.max_colwidth = 0

In [3]:
# Input side: the directory that is scanned for corpus files and the file name
# suffix a file must have to be picked up.
fileExtension = ".xml"
directoryPath = "../data/in/REN/CorA-ReN-XML_1.1/"

# Output side: label that is only used to build the export file names.
corpus_name = "CorA-ReN-XML_1.1"

In [4]:
def get_cei_files(base_dir):
    """Recursively yield every file below *base_dir* ending in ``fileExtension``.

    Args:
        base_dir: Directory to scan (``str`` or path-like).

    Yields:
        pathlib.Path: One path per matching file. Within each directory the
        entries are visited in name order, so repeated runs over the same
        tree produce the same sequence.
    """
    # ``os.scandir`` returns entries in arbitrary order and holds a directory
    # handle until it is exhausted or closed. Listing inside ``with`` releases
    # the handle right away (also when the consumer abandons this generator
    # early); sorting makes the traversal order reproducible.
    with os.scandir(base_dir) as scanner:
        entries = sorted(scanner, key=lambda entry: entry.name)

    for entry in entries:
        if entry.is_file() and entry.name.endswith(fileExtension):
            yield Path(entry.path)
        elif entry.is_dir():
            yield from get_cei_files(entry.path)

In [5]:
# Every matching file below directoryPath, stored as a POSIX-style path string.
paths = [str(PurePosixPath(found)) for found in get_cei_files(directoryPath)]

In [8]:
# Names of the per-document accumulators, in the order they are created below.
lists = [
    "ids",
    "names",
    "header",
    "tokens_anno_ascii",
    "tokens_anno_utf",
    "tokens_dipl_utf",
    "topics",
    "textTypes",
    "genres",
    "languages",
    "languageTypes",
    "languageRegions",
    "languageAreas",
    "centuries",
    "dates",
]

# One independent empty list per name above; the unpacking target has to stay
# in step with ``lists``.
(
    ids,
    names,
    header,
    tokens_anno_ascii,
    tokens_anno_utf,
    tokens_dipl_utf,
    topics,
    textTypes,
    genres,
    languages,
    languageTypes,
    languageRegions,
    languageAreas,
    centuries,
    dates,
) = [[] for _ in lists]

In [9]:
# Parse every corpus file and store, per document, the raw XPath result (always
# a list) for the text id, the cora-header name, the header text and the three
# token layers. Each accumulator receives exactly one entry per file.
xpath_targets = (
    (ids, "/text/@id"),
    (names, "/text/cora-header/@name"),
    (header, "/text/header/text()"),
    (tokens_anno_ascii, "/text/token/anno/@ascii"),
    (tokens_anno_utf, "/text/token/anno/@utf"),
    (tokens_dipl_utf, "/text/token/dipl/@utf"),
)

for file in paths:
    # Binary mode lets lxml decode the document itself (XML declaration, BOM)
    # instead of being handed text that Python already decoded as UTF-8.
    with open(file, "rb") as f:
        root = etree.parse(f).getroot()

    # The tree is fully built once parse() returns, so the queries can run
    # after the file has been closed.
    for target, expression in xpath_targets:
        target.append(root.xpath(expression, smart_strings=False))

In [13]:
# One row per parsed document: the per-file result lists are lined up side by
# side and the positional columns 0..5 are given their names.
column_names = [
    "ids",
    "names",
    "header",
    "tokens_anno_ascii",
    "tokens_anno_utf",
    "tokens_dipl_utf",
]
contents = list(
    zip(ids, names, header, tokens_anno_ascii, tokens_anno_utf, tokens_dipl_utf)
)
contents_full = pd.DataFrame(contents).rename(columns=dict(enumerate(column_names)))

In [14]:
# Build one running-text column per token layer: the tokens of a document are
# joined with single spaces, then a full stop that was a token of its own is
# re-attached to the word in front of it.
#
# The dot is escaped so that only a literal "." matches; unescaped it matches
# any character, which replaced every one-character token between two spaces
# by ". ". The lookahead additionally covers a full stop that is the last
# token of a text, where no space follows.
detached_full_stop = re.compile(r" \.(?= |$)")

for layer in ("anno_ascii", "anno_utf", "dipl_utf"):
    joined = contents_full[f"tokens_{layer}"].str.join(sep=" ")
    # str() mirrors the original conversion of whatever the join produced.
    contents_full[f"string_{layer}"] = [
        detached_full_stop.sub(".", str(text)) for text in joined
    ]

In [16]:
# Export the table as JSON and as Parquet. The output name is derived from the
# (sub)corpus label plus a minute-resolution timestamp.
timemarker = datetime.today().strftime('%Y-%m-%d-%H%M')
export_stem = f"../data/out/{corpus_name}_{timemarker}"

contents_full.to_json(export_stem + ".json")
contents_full.to_parquet(export_stem + ".parquet")