 # Early New High German Reference Corpus transformation

In [None]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
import datetime
import os
import pandas as pd
import re

In [None]:
# Limit the text shown per column to 50 characters when frames are displayed.
pd.options.display.max_colwidth = 50

## mlu

In [None]:
# Suffix a file name must end with to be collected by get_cei_files.
fileExtension = ".xml"
# Location of the ref-mlu subcorpus, given relative to the working directory.
directoryPath = "../../../../../data/corpora/REF/ReF-v1.0.2/ref-mlu/"

In [None]:
def get_cei_files(base_dir, extension=None):
    """Recursively yield the files below ``base_dir`` whose name ends with a suffix.

    Args:
        base_dir: Directory to walk (string or path-like).
        extension: File-name suffix to match. When omitted, the notebook-level
            ``fileExtension`` is used, so one-argument calls behave as before.

    Yields:
        pathlib.Path: One path per matching file. Within each directory the
        entries are visited in name order; sub-directories are descended
        into at the position where their name sorts.
    """
    if extension is None:
        extension = fileExtension
    # Read and sort the whole listing inside the ``with`` block: the scandir
    # handle is closed before recursing, and the resulting order no longer
    # depends on the filesystem.
    with os.scandir(base_dir) as listing:
        entries = sorted(listing, key=lambda entry: entry.name)
    for entry in entries:
        if entry.is_file() and entry.name.endswith(extension):
            yield Path(entry.path)
        elif entry.is_dir():
            yield from get_cei_files(entry.path, extension)

In [None]:
# Every matching file below directoryPath, as a plain path string.
paths = [str(PurePosixPath(xml_file)) for xml_file in get_cei_files(directoryPath)]

In [None]:
# Labels of the values gathered per file; one empty accumulator list is
# created for each label.
lists = ["ids", "names", "tokens_ascii", "tokens_utf", "headers"]
ids, names, tokens_ascii, tokens_utf, headers = ([] for _ in lists)

In [None]:
# For every corpus file, collect the text id, the cora-header name, the ASCII
# and UTF token forms and the header text. Each XPath query returns a list,
# so every appended entry is itself a list (possibly empty).
for file in paths:
    # Hand lxml the path instead of a text-mode handle: lxml asks for a
    # filename or a binary-mode file so that the parser itself applies the
    # encoding declared in the XML prolog.
    root = etree.parse(file).getroot()
    ids.append(root.xpath("/text/@id", smart_strings=False))
    names.append(root.xpath("/text/cora-header/@name", smart_strings=False))
    tokens_ascii.append(root.xpath("/text/token/tok_anno/@ascii", smart_strings=False))
    tokens_utf.append(root.xpath("/text/token/tok_anno/@utf", smart_strings=False))
    headers.append(root.xpath("/text/header/text()", smart_strings=False))

In [None]:
# One row per parsed file. The labels are passed via ``columns=`` instead of
# renaming positional columns afterwards: with no input files the frame then
# still has its named columns, so the following cell does not hit a KeyError.
contents = list(zip(ids, names, tokens_ascii, tokens_utf, headers))
contents_full = pd.DataFrame(
    contents,
    columns=["ids", "names", "tokens_ascii", "tokens_utf", "headers"],
)

In [None]:
# Flatten each file's token list into one space-separated string, then attach
# every free-standing period to the token before it (" . " -> ". ").
# The period is replaced literally: used as a regex, an unescaped "." matches
# ANY character, which turned every one-character token into a period.
contents_full["tokens_ascii_as_string"] = (
    contents_full["tokens_ascii"].str.join(sep=" ").str.replace(" . ", ". ", regex=False)
)
contents_full["tokens_utf_as_string"] = (
    contents_full["tokens_utf"].str.join(sep=" ").str.replace(" . ", ". ", regex=False)
)

## rub

In [None]:
# Suffix a file name must end with to be collected by get_cei_files.
fileExtension = ".xml"
# Location of the ref-rub subcorpus, given relative to the working directory.
directoryPath = "../../../../../data/corpora/REF/ReF-v1.0.2/ref-rub/"

In [None]:
def get_cei_files(base_dir, extension=None):
    """Recursively yield the files below ``base_dir`` whose name ends with a suffix.

    Args:
        base_dir: Directory to walk (string or path-like).
        extension: File-name suffix to match. When omitted, the notebook-level
            ``fileExtension`` is used, so one-argument calls behave as before.

    Yields:
        pathlib.Path: One path per matching file. Within each directory the
        entries are visited in name order; sub-directories are descended
        into at the position where their name sorts.
    """
    if extension is None:
        extension = fileExtension
    # Read and sort the whole listing inside the ``with`` block: the scandir
    # handle is closed before recursing, and the resulting order no longer
    # depends on the filesystem.
    with os.scandir(base_dir) as listing:
        entries = sorted(listing, key=lambda entry: entry.name)
    for entry in entries:
        if entry.is_file() and entry.name.endswith(extension):
            yield Path(entry.path)
        elif entry.is_dir():
            yield from get_cei_files(entry.path, extension)

In [None]:
# Every matching file below directoryPath, as a plain path string.
paths = [str(PurePosixPath(xml_file)) for xml_file in get_cei_files(directoryPath)]

In [None]:
# Labels of the values gathered per file; one empty accumulator list is
# created for each label.
lists = ["ids", "names", "tokens_ascii", "tokens_utf", "headers"]
ids, names, tokens_ascii, tokens_utf, headers = ([] for _ in lists)

In [None]:
# For every corpus file, collect the text id, the cora-header name, the ASCII
# and UTF token forms and the header text. Each XPath query returns a list,
# so every appended entry is itself a list (possibly empty).
for file in paths:
    # Hand lxml the path instead of a text-mode handle: lxml asks for a
    # filename or a binary-mode file so that the parser itself applies the
    # encoding declared in the XML prolog.
    root = etree.parse(file).getroot()
    ids.append(root.xpath("/text/@id", smart_strings=False))
    names.append(root.xpath("/text/cora-header/@name", smart_strings=False))
    tokens_ascii.append(root.xpath("/text/token/tok_anno/@ascii", smart_strings=False))
    tokens_utf.append(root.xpath("/text/token/tok_anno/@utf", smart_strings=False))
    headers.append(root.xpath("/text/header/text()", smart_strings=False))

In [None]:
# One row per parsed file. The labels are passed via ``columns=`` instead of
# renaming positional columns afterwards: with no input files the frame then
# still has its named columns, so the following cell does not hit a KeyError.
contents = list(zip(ids, names, tokens_ascii, tokens_utf, headers))
contents_full = pd.DataFrame(
    contents,
    columns=["ids", "names", "tokens_ascii", "tokens_utf", "headers"],
)

In [None]:
# Flatten each file's token list into one space-separated string, then attach
# every free-standing period to the token before it (" . " -> ". ").
# The period is replaced literally: used as a regex, an unescaped "." matches
# ANY character, which turned every one-character token into a period.
contents_full["tokens_ascii_as_string"] = (
    contents_full["tokens_ascii"].str.join(sep=" ").str.replace(" . ", ". ", regex=False)
)
contents_full["tokens_utf_as_string"] = (
    contents_full["tokens_utf"].str.join(sep=" ").str.replace(" . ", ". ", regex=False)
)

## up


In [None]:
# Suffix a file name must end with to be collected by get_cei_files.
fileExtension = ".xml"
# Location of the ref-up subcorpus (tigerXML), given relative to the working directory.
directoryPath = "../../../../../data/corpora/REF/ReF-v1.0.2/ref-up/"

In [None]:
def get_cei_files(base_dir, extension=None):
    """Recursively yield the files below ``base_dir`` whose name ends with a suffix.

    Args:
        base_dir: Directory to walk (string or path-like).
        extension: File-name suffix to match. When omitted, the notebook-level
            ``fileExtension`` is used, so one-argument calls behave as before.

    Yields:
        pathlib.Path: One path per matching file. Within each directory the
        entries are visited in name order; sub-directories are descended
        into at the position where their name sorts.
    """
    if extension is None:
        extension = fileExtension
    # Read and sort the whole listing inside the ``with`` block: the scandir
    # handle is closed before recursing, and the resulting order no longer
    # depends on the filesystem.
    with os.scandir(base_dir) as listing:
        entries = sorted(listing, key=lambda entry: entry.name)
    for entry in entries:
        if entry.is_file() and entry.name.endswith(extension):
            yield Path(entry.path)
        elif entry.is_dir():
            yield from get_cei_files(entry.path, extension)

In [None]:
# Every matching file below directoryPath, as a plain path string.
paths = [str(PurePosixPath(xml_file)) for xml_file in get_cei_files(directoryPath)]

In [None]:
# Labels of the values gathered per file; one empty accumulator list is
# created for each label.
lists = [
    "ids",
    "names",
    "tokens",
    "textTypes",
    "genres",
    "languages",
    "languageTypes",
    "languageRegions",
    "languageAreas",
    "centuries",
    "dates",
]
(
    ids,
    names,
    tokens,
    textTypes,
    genres,
    languages,
    languageTypes,
    languageRegions,
    languageAreas,
    centuries,
    dates,
) = ([] for _ in lists)

In [None]:
# For every corpus file, collect the corpus id, the token forms and the
# metadata fields found under /corpus/head/meta. Each XPath query returns a
# list, so every appended entry is itself a list (possibly empty).
for file in paths:
    # Hand lxml the path instead of a text-mode handle: lxml asks for a
    # filename or a binary-mode file so that the parser itself applies the
    # encoding declared in the XML prolog.
    root = etree.parse(file).getroot()
    ids.append(root.xpath("/corpus/@id", smart_strings=False))
    names.append(root.xpath("/corpus/head/meta/text/text()", smart_strings=False))
    # NOTE(review): standard TigerXML keeps ``word`` on the <t> children of
    # <terminals>, which the former ``terminals/@word`` query cannot reach.
    # ``terminals//@word`` matches the attribute on <terminals> itself and on
    # anything below it, so it covers both layouts — confirm against the files.
    tokens.append(root.xpath("/corpus/body/s//graph/terminals//@word", smart_strings=False))
    textTypes.append(root.xpath("/corpus/head/meta/text-type/text()", smart_strings=False))
    genres.append(root.xpath("/corpus/head/meta/genre/text()", smart_strings=False))
    languages.append(root.xpath("/corpus/head/meta/language/text()", smart_strings=False))
    languageTypes.append(root.xpath("/corpus/head/meta/language-type/text()", smart_strings=False))
    languageRegions.append(root.xpath("/corpus/head/meta/language-region/text()", smart_strings=False))
    languageAreas.append(root.xpath("/corpus/head/meta/language-area/text()", smart_strings=False))
    centuries.append(root.xpath("/corpus/head/meta/time/text()", smart_strings=False))
    # Root element is <corpus>; the former "/ccorpus/..." typo never matched,
    # leaving every date entry empty.
    dates.append(root.xpath("/corpus/head/meta/date/text()", smart_strings=False))

In [None]:
contents = list(zip(ids, names, tokens_ascii, topics, textTypes, genres, languages, languageTypes, languageRegions, languageAreas, centuries, dates, tokens_utf))
contents_full = pd.DataFrame(contents).rename(columns={0: "ids", 1: "names", 2: "tokens_ascii", 3: "topics", 4: "textTypes", 5: "genres", 6: "languages", 7:"languageTypes", 8: "languageRegions", 9: "languageAreas", 10: "centuries", 11: "dates", 12: "tokens_utf"})

In [None]:
contents_full["tokens_ascii_as_string"] = contents_full["tokens_ascii"].str.join(sep=" ")
contents_full["tokens_ascii_as_string"] = [re.sub(r' . ', '. ', str(x)) for x in contents_full["tokens_ascii_as_string"]]
contents_full["tokens_utf_as_string"] = contents_full["tokens_utf"].str.join(sep=" ")
contents_full["tokens_utf_as_string"] = [re.sub(r' . ', '. ', str(x)) for x in contents_full["tokens_utf_as_string"]]

# Export

In [None]:
# Bare expression: refers to the frame assigned by whichever section above ran
# last (each section rebinds ``contents_full``); presumably shown as cell output.
contents_full

In [None]:
# TODO: name output according to subcorpus (the file name currently carries
# only a timestamp, not the subcorpus that produced the frame).

# ``datetime`` is imported as a module, so the class must be reached as
# ``datetime.datetime``; calling ``datetime.today()`` raises AttributeError.
timemarker = datetime.datetime.today().strftime('%Y-%m-%d-%H%M')
# Create the target folder if needed so both writers do not fail on a fresh checkout.
os.makedirs("../data/output", exist_ok=True)
contents_full.to_json(f"../data/output/REF2PD_{timemarker}.json")
contents_full.to_parquet(f"../data/output/REF2PD_{timemarker}.parquet")