# PALAFRAFRO-V2-2
Source: http://txm.ish-lyon.cnrs.fr/bfm/

In [1]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import pandas as pd
import re
from random import sample
from pprint import pprint

In [2]:
# Column-width setting used when DataFrames are displayed.
# NOTE(review): 0 is presumably meant to switch off truncation of long cells;
# pandas documents None as the "no limit" value - confirm.
pd.options.display.max_colwidth = 0

In [3]:
# Corpus label; it only ends up in the names of the exported files.
corpus_name = "FraFro2PD2-2"
# Folder the input TEI files are read from.
directoryPath = "../data/in/PaLaFraFro2-2_TEI/"
# Folder the copies with the rewritten namespace declaration are written to.
targetPath = "../data/in/PaLaFraFro2-2_TEI_enriched/"
# Only files whose name ends with this suffix are picked up.
fileExtension = ".xml"
# Prefix -> namespace URI table.
namespaces = {
    'tei': 'http://www.tei-c.org/ns/1.0',
    'me': 'http://www.menota.org/ns/1.0',
    'bfm': 'http://bfm.ens-lsh.fr/ns/1.0',
}

In [4]:
def get_cei_files(base_dir, extension=None):
    """Recursively yield the files below *base_dir* whose name ends with a suffix.

    Args:
        base_dir: Directory to walk; sub-directories are descended into.
        extension: File-name suffix to keep. When omitted, the module-level
            ``fileExtension`` is used, which is what existing callers rely on.

    Yields:
        pathlib.Path: One path per matching file. Within each directory the
        entries are visited in name order, so repeated runs over the same
        tree produce the files in the same order.
    """
    if extension is None:
        extension = fileExtension
    # os.scandir() returns entries in arbitrary order and holds an OS handle;
    # sort for reproducible output and close the iterator as soon as possible.
    with os.scandir(base_dir) as entries:
        ordered = sorted(entries, key=lambda entry: entry.name)
    for entry in ordered:
        if entry.is_file() and entry.name.endswith(extension):
            yield Path(entry.path)
        elif entry.is_dir():
            yield from get_cei_files(entry.path, extension)

In [44]:
# Every matching file below the input folder, as a POSIX-style path string.
paths = [str(PurePosixPath(found)) for found in get_cei_files(directoryPath)]

In [45]:
# Enrich/correct the namespaces in the TEI-XML: the annotators forgot the
# namespace markup, so the default namespace declaration on <TEI> is rewritten
# into a "tei"-prefixed one and the result is stored under targetPath.
Path(targetPath).mkdir(parents=True, exist_ok=True)  # open() fails if the folder is missing
for file in paths:
    # Read completely and release the input handle before writing.
    with open(file, "r", encoding="utf-8") as file_in:
        rf = file_in.read()
    filename = Path(file).name
    # Only a bare "xmlns=" is rewritten; an attribute that already carries a
    # prefix (e.g. "xmlns:tei=") is left alone so it cannot be mangled.
    new_file = re.sub(r"<TEI xmlns(?=\s*=)", "<TEI xmlns:tei", rf)
    # Mode "x" refuses to overwrite an existing file (FileExistsError).
    # The encoding is given explicitly so the copy is UTF-8 like the input
    # instead of depending on the platform default.
    with open(Path(targetPath) / filename, "x", encoding="utf-8") as file_out:
        file_out.write(new_file)

In [5]:
# Re-list the files, this time from the folder holding the corrected copies.
paths = [str(PurePosixPath(found)) for found in get_cei_files(targetPath)]

In [67]:
# Names of the fields collected for every document.
lists = [
    "name_main",
    "name_formal",
    "date_comp_when",
    "date_comp_notBefore",
    "date_comp_notAfter",
    "date_ms_when",
    "date_ms_notBefore",
    "date_ms_notAfter",
    "region",
    "domain",
    "genre",
    "words",
]
# One independent, initially empty accumulator list per field name.
(
    name_main,
    name_formal,
    date_comp_when,
    date_comp_notBefore,
    date_comp_notAfter,
    date_ms_when,
    date_ms_notBefore,
    date_ms_notAfter,
    region,
    domain,
    genre,
    words,
) = [[] for _ in lists]

In [68]:
# Pair every accumulator list with the XPath query that feeds it, then run all
# queries against each document in turn. Each list gains exactly one entry per
# file: whatever tree.xpath() returns for its query.
field_queries = (
    (name_main, "//titleStmt/title[@type='main']/text()"),
    (name_formal, "//titleStmt/title[@type='formal']/text()"),
    (date_comp_when, "//creation/date[@type='compo']/@when"),
    (date_comp_notBefore, "//creation/date[@type='compo']/@notBefore"),
    (date_comp_notAfter, "//creation/date[@type='compo']/@notAfter"),
    (date_ms_when, "//creation/date[@type='ms']/@when"),
    (date_ms_notBefore, "//creation/date[@type='ms']/@notBefore"),
    (date_ms_notAfter, "//creation/date[@type='ms']/@notAfter"),
    (region, "//creation/region/text()"),
    (domain, "//keywords/term[@type='domaine']/text()"),
    (genre, "//keywords/term[@type='genre']/text()"),
    (words, "//ab/*/w/text()"),
)
for file in paths:
    with open(file, "r", encoding="utf-8") as f:
        tree = etree.parse(f)
        for collector, query in field_queries:
            collector.append(tree.xpath(query))


In [69]:
# Assemble one row per document from the accumulator lists.
contents = list(
    zip(
        name_main,
        name_formal,
        date_comp_when,
        date_comp_notBefore,
        date_comp_notAfter,
        date_ms_when,
        date_ms_notBefore,
        date_ms_notAfter,
        region,
        domain,
        genre,
        words,
    )
)
# The columns are named in the constructor instead of renaming 0..11 afterwards:
# with no documents at all, a rename has nothing to rename and the frame ends up
# without columns, so any later access by column name raises KeyError.
contents_full = pd.DataFrame(
    contents,
    columns=[
        "name_main",
        "name_formal",
        "date_comp_when",
        "date_comp_notBefore",
        "date_comp_notAfter",
        "date_ms_when",
        "date_ms_notBefore",
        "date_ms_notAfter",
        "region",
        "domain",
        "genre",
        "words",
    ],
)

In [71]:
# Join each document's word tokens into one space-separated text.
contents_full["string"] = contents_full["words"].str.join(sep=" ")
# Re-attach full stops that were tokenised separately ("mot . " -> "mot. ").
# The stop is replaced literally: as a regex, ' . ' matches ANY single character
# between two spaces and would turn every one-letter word into a full stop.
contents_full["string"] = [str(x).replace(' . ', '. ') for x in contents_full["string"]]

In [73]:
# Export the table as JSON and as Parquet. Both files share one stem made of
# the corpus name and a minute-resolution timestamp of the current local time.

timemarker = datetime.now().strftime('%Y-%m-%d-%H%M')
export_stem = f"../data/out/{corpus_name}_{timemarker}"
contents_full.to_json(export_stem + ".json")
contents_full.to_parquet(export_stem + ".parquet")