# Middle High German Reference Corpus transformation

In [1]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
import datetime
import os
import pandas as pd
import re

In [2]:
# Column-width limit for displayed DataFrames. The value 0 is presumably
# meant to stop long cells from being truncated -- TODO confirm that the
# installed pandas version treats 0 that way.
pd.options.display.max_colwidth = 0

In [6]:
# Root directory that get_cei_files() walks recursively for corpus files
# (relative to the notebook's working directory).
directoryPath = "../../../../../data/corpora/REM/rem-corralled-20161222/"
# Only files whose name ends with this suffix are collected.
fileExtension = ".xml"

In [7]:
def get_cei_files(base_dir, extension=None):
    """Recursively yield the files below *base_dir* that carry a given suffix.

    Args:
        base_dir: Directory to walk (anything ``os.scandir`` accepts).
        extension: File-name suffix to match, e.g. ``".xml"``. When omitted,
            the module-level ``fileExtension`` is used.

    Yields:
        pathlib.Path: One path per matching file, in the order in which
        ``os.scandir`` reports the entries; a subdirectory is descended into
        as soon as it is encountered.
    """
    if extension is None:
        extension = fileExtension
    # The context manager closes the directory handle even when the caller
    # stops iterating early or an error is raised mid-walk.
    with os.scandir(base_dir) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(extension):
                yield Path(entry.path)
            elif entry.is_dir():
                yield from get_cei_files(entry.path, extension)

In [8]:
# Every matching corpus file below directoryPath, rendered as a
# POSIX-flavoured path string.
paths = [str(PurePosixPath(xml_path)) for xml_path in get_cei_files(directoryPath)]

In [9]:
# Names of the per-document fields gathered from the corpus files. The
# third entry is "tokens_ascii" so that it agrees with the variable below
# and with the DataFrame column of the same name.
lists = [
    "ids", "names", "tokens_ascii", "topics", "textTypes", "genres",
    "languages", "languageTypes", "languageRegions", "languageAreas",
    "centuries", "dates", "tokens_utf",
]
# One independent empty list per field name; the generator builds a fresh
# list for every target, so no two names share the same object.
(ids, names, tokens_ascii, topics, textTypes, genres, languages,
 languageTypes, languageRegions, languageAreas, centuries, dates,
 tokens_utf) = ([] for _ in lists)

In [10]:
# Each result list paired with the XPath query that feeds it. The order
# mirrors the order in which the values are appended for every document.
_field_queries = (
    (ids, "/text/@id"),
    (names, "/text/header/text/text()"),
    (tokens_ascii, "/text/token/tok_anno/@ascii"),
    (tokens_utf, "/text/token/tok_anno/@utf"),
    (topics, "/text/header/topic/text()"),
    (textTypes, "/text/header/text-type/text()"),
    (genres, "/text/header/genre/text()"),
    (languages, "/text/header/language/text()"),
    (languageTypes, "/text/header/language-type/text()"),
    (languageRegions, "/text/header/language-region/text()"),
    (languageAreas, "/text/header/language-area/text()"),
    (centuries, "/text/header/time/text()"),
    (dates, "/text/header/date/text()"),
)

# Parse every corpus file once and append one XPath result per field, so
# all result lists stay aligned by document position.
for file in paths:
    with open(file, "r", encoding="utf-8") as f:
        root = etree.parse(f).getroot()
        for _target, _query in _field_queries:
            # smart_strings=False asks lxml for plain strings rather than
            # results that keep a reference back to their parent element.
            _target.append(root.xpath(_query, smart_strings=False))

In [11]:
# One tuple per document, combining the values collected for every field.
contents = list(zip(
    ids, names, tokens_ascii, topics, textTypes, genres, languages,
    languageTypes, languageRegions, languageAreas, centuries, dates,
    tokens_utf,
))

# Column labels, in the same positional order as the tuples above.
_column_names = [
    "ids", "names", "tokens_ascii", "topics", "textTypes", "genres",
    "languages", "languageTypes", "languageRegions", "languageAreas",
    "centuries", "dates", "tokens_utf",
]

# Build the frame with positional integer columns first, then map each
# position to its label.
contents_full = pd.DataFrame(contents)
contents_full = contents_full.rename(columns=dict(enumerate(_column_names)))

In [12]:
# Derive one plain-text string per document from each token list column:
# tokens are joined with single spaces, then a free-standing period token
# is attached to the word before it ("wort . wort" -> "wort. wort").
for _variant in ("ascii", "utf"):
    _joined = contents_full[f"tokens_{_variant}"].str.join(sep=" ")
    # Literal replacement on purpose: the earlier regex ' . ' used an
    # unescaped dot, which matched ANY single character between two spaces
    # and therefore overwrote every one-character token with a period.
    # A period that is the very last token has no trailing space and is
    # left unchanged, as before.
    contents_full[f"tokens_{_variant}_as_string"] = [
        str(text).replace(" . ", ". ") for text in _joined
    ]

In [14]:
# name output according to subcorpus

# Timestamp (to the minute) that makes each export's file name unique.
# `datetime` is imported as a module in this file, so the class has to be
# reached as datetime.datetime; a bare datetime.today() raises
# AttributeError.
timemarker = datetime.datetime.today().strftime('%Y-%m-%d-%H%M')
# Write the same frame twice: once as JSON and once as Parquet.
contents_full.to_json(f"../data/output/REM2CSV_{timemarker}.json")
contents_full.to_parquet(f"../data/output/REM2CSV_{timemarker}.parquet")