In [14]:
 import os
import re
from bs4 import BeautifulSoup, NavigableString
import json
import pandas as pd

In [2]:
def parse_tei_header_from_soup(soup, work_id: str) -> dict:
    """
    Extract basic TEI header metadata as a dict.

    Assumes tags are lowercase (BeautifulSoup with 'html.parser').

    Args:
        soup: Parsed document (or any tag) in which a <teiheader> is searched.
        work_id: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A dict holding a subset of the keys ``title``, ``author_dates``,
        ``author``, ``author_viaf``, ``author_viaf_id``, ``edition``,
        ``editor``, ``publisher``, ``pub_place``, ``pub_date``,
        ``series_title``, ``series_id``, ``notes`` (list of str) and
        ``source``. A key is only set when its element was found. The dict is
        empty when there is no <teiheader> or no <filedesc>.

    Side effects:
        A non-empty <date> inside <author> is removed from ``soup`` so that
        the remaining author text is the bare name.
    """
    meta = {}

    teiheader = soup.find("teiheader")
    if not teiheader:
        return meta

    filedesc = teiheader.find("filedesc")
    if not filedesc:
        return meta

    # ----- titleStmt -----
    titlestmt = filedesc.find("titlestmt")
    if titlestmt:
        title_tag = titlestmt.find("title")
        if title_tag:
            # include nested content (e.g. <hi>, etc.)
            meta["title"] = " ".join(title_tag.stripped_strings)

        author_tag = titlestmt.find("author")
        if author_tag:
            # dates inside <author><date>...</date></author>
            date_tag = author_tag.find("date")
            if date_tag and date_tag.get_text(strip=True):
                meta["author_dates"] = date_tag.get_text(strip=True)
                date_tag.extract()  # remove from author_tag so we get clean name

            # Join the remaining fragments with a space: get_text(strip=True)
            # would glue "Name <date/> surname" into "Namesurname".
            author_name = " ".join(author_tag.stripped_strings)
            if author_name:
                meta["author"] = author_name

            # VIAF link, if present
            ref = author_tag.get("ref")
            if ref:
                meta["author_viaf"] = ref
                # try to extract numeric VIAF id; surrounding slashes are
                # stripped first so ".../viaf/123/" still matches
                m = re.search(r"/(\d+)$", ref.strip("/"))
                if m:
                    meta["author_viaf_id"] = m.group(1)

    # ----- editionStmt -----
    editionstmt = filedesc.find("editionstmt")
    if editionstmt:
        edition_tag = editionstmt.find("edition")
        if edition_tag:
            meta["edition"] = " ".join(edition_tag.stripped_strings)

        editor_tag = editionstmt.find("editor")
        if editor_tag:
            meta["editor"] = " ".join(editor_tag.stripped_strings)

    # ----- publicationStmt -----
    publicationstmt = filedesc.find("publicationstmt")
    if publicationstmt:
        publisher_tag = publicationstmt.find("publisher")
        if publisher_tag:
            meta["publisher"] = publisher_tag.get_text(strip=True)

        pubplace_tag = publicationstmt.find("pubplace")
        if pubplace_tag:
            meta["pub_place"] = pubplace_tag.get_text(strip=True)

        date_tag = publicationstmt.find("date")
        if date_tag and date_tag.get_text(strip=True):
            meta["pub_date"] = date_tag.get_text(strip=True)

    # ----- seriesStmt -----
    seriesstmt = filedesc.find("seriesstmt")
    if seriesstmt:
        series_title_tag = seriesstmt.find("title")
        if series_title_tag:
            meta["series_title"] = " ".join(series_title_tag.stripped_strings)

        idno_tag = seriesstmt.find("idno")
        if idno_tag:
            meta["series_id"] = idno_tag.get_text(strip=True)

    # ----- notesStmt -----
    notesstmt = filedesc.find("notesstmt")
    if notesstmt:
        # keep only notes that contain some text
        notes = [
            " ".join(note.stripped_strings)
            for note in notesstmt.find_all("note")
            if note.get_text(strip=True)
        ]
        if notes:
            meta["notes"] = notes

    # ----- sourceDesc -----
    sourcedesc = filedesc.find("sourcedesc")
    if sourcedesc:
        # only the first <p> is used
        p_tag = sourcedesc.find("p")
        if p_tag:
            meta["source"] = " ".join(p_tag.stripped_strings)

    return meta

In [3]:
def normalize_latin(s: str) -> str:
    """Lower-case ``s`` and fold the letters ``v`` -> ``u`` and ``j`` -> ``i``."""
    folded = s.lower()
    for old, new in (("v", "u"), ("j", "i")):
        folded = folded.replace(old, new)
    return folded

def map_cc_pos(pos_tag, token_text):
    """
    Map a tag from a <cc:w> ``pos`` attribute to a coarse part-of-speech label.

    Args:
        pos_tag: Raw tag string, or None when the attribute is missing.
        token_text: Surface form of the token; only its first character is
            inspected (for the proper-name heuristic).

    Returns:
        One of "VERB", "NOUN", "ADJ", "ADV", "ADP", "CCONJ", "SCONJ", "PRON",
        "NUM", "PROPN" or "X". "X" is returned for a missing or unrecognised
        tag.
    """
    if pos_tag is None:
        return "X"

    # Prefix rules are checked first and in this order, so e.g. a tag that
    # starts with "ADJ" is "ADJ" even if it also contains "NUM".
    prefix_rules = (
        ("V:", "VERB"),
        ("N:", "NOUN"),
        ("ADJ", "ADJ"),
        ("ADV", "ADV"),
        ("PREP", "ADP"),
    )
    for prefix, label in prefix_rules:
        if pos_tag.startswith(prefix):
            return label

    if pos_tag == "CC":
        return "CCONJ"
    if pos_tag in {"CON", "CS"}:
        return "SCONJ"
    if pos_tag in {"PRON", "REL", "DIMOS", "POSS", "DET"}:
        return "PRON"
    if "NUM" in pos_tag:
        return "NUM"

    # heuristic for unknown proper names: tag "????" on a capitalised token.
    # ("N:" tags never get here; they were already returned as NOUN above.)
    if pos_tag == "????" and token_text and token_text[0].isupper():
        return "PROPN"
    return "X"


def build_ref(div_pid, parent_pid, sent_n):
    """
    Assemble the reference dict for one sentence from its TEI/CC identifiers.

    Each identifier is copied under its own name ("div_pid", "parent_pid",
    "sent_n") and left out when it is falsy (None or empty). ``div_pid`` is
    stored unsplit, e.g. "45:2.12"; ``parent_pid`` looks like "45:2.12;2" and
    ``sent_n`` like "1".
    """
    candidates = (
        ("div_pid", div_pid),
        ("parent_pid", parent_pid),
        ("sent_n", sent_n),
    )
    return {key: value for key, value in candidates if value}

def _clean_cc_lemma(raw_lemma, token_text):
    """Normalise the ``lemma`` attribute value of one <cc:w> element."""
    # Keep the text before the first "|" and drop any trailing digits.
    lemma = re.sub(r"\d+$", "", (raw_lemma or "").partition("|")[0])
    if lemma.startswith("??"):
        # "??" lemmas (presumably unknown to the lemmatizer) fall back to the
        # lower-cased surface form with v->u / j->i folding.
        return token_text.lower().replace("v", "u").replace("j", "i")
    # Regular lemmas keep their case; only lowercase v/j are folded.
    return lemma.replace("v", "u").replace("j", "i")


def parse_lemmatized_cc(filename):
    """
    Parse one lemmatized XML file into header metadata and sentence records.

    Args:
        filename: Path to the XML file. The base name without extension is
            used as ``work_id``.

    Returns:
        A tuple ``(metadata, sentences)``:

        * ``metadata`` is the dict from ``parse_tei_header_from_soup`` plus
          the keys ``work_id`` and ``prelemmatized`` (always True).
        * ``sentences`` is a list of dicts with the keys ``work_id``,
          ``sent_id`` (running index from 0), ``sent_text`` and
          ``token_data``. Each ``token_data`` entry has ``token_text``,
          ``lemma``, ``pos``, ``ref``, ``char_start`` and ``char_end``.

    A <cc:s> element is used only if it has non-empty ``n`` and ``parent_pid``
    attributes and lies inside a <div> carrying a ``cc:pid`` attribute. Each
    sentence is emitted exactly once, attributed to its nearest such <div>,
    even when those <div> elements are nested.
    """
    with open(filename, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    work_id = os.path.splitext(os.path.basename(filename))[0]

    # parse TEI header metadata
    metadata = parse_tei_header_from_soup(soup, work_id)

    sentences = []

    # Walk the sentences once, in document order. Looping over every
    # div[cc:pid] and then over all of its descendants would yield the same
    # sentence once per enclosing div when such divs are nested.
    for s_elem in soup.find_all("cc:s"):
        sent_n = s_elem.get("n")
        parent_pid = s_elem.get("parent_pid")
        if not sent_n or not parent_pid:
            continue

        div = s_elem.find_parent("div", attrs={"cc:pid": True})
        if div is None:
            continue
        div_pid = div.get("cc:pid")  # e.g. "45:2.12"

        # One dict per sentence; every token of the sentence shares this
        # same object.
        ref_dict = build_ref(div_pid, parent_pid, sent_n)

        token_data = []
        char_index = 0

        for w in s_elem.find_all("cc:w"):
            token_text = w.text.strip()
            if not token_text:
                continue

            # Offsets index into the space-joined token sequence.
            start = char_index
            end = start + len(token_text)
            char_index = end + 1  # skip the joining space

            token_data.append({
                "token_text": token_text,
                "lemma": _clean_cc_lemma(w.get("lemma"), token_text),
                "pos": map_cc_pos(w.get("pos"), token_text),
                "ref": ref_dict,
                "char_start": start,
                "char_end": end
            })

        if not token_data:
            continue

        # Sentence text: tokens longer than two characters keep their first
        # character and get the rest lower-cased; shorter tokens are unchanged.
        sentence_tokens = [
            t[0] + t[1:].lower() if len(t) > 2 else t
            for t in (tok["token_text"] for tok in token_data)
        ]

        sentences.append({
            "work_id": work_id,
            "sent_id": len(sentences),
            "sent_text": " ".join(sentence_tokens),
            "token_data": token_data
        })

    # Return both: metadata for the work, and the sentence list
    metadata["work_id"] = work_id
    metadata["prelemmatized"] = True
    return metadata, sentences

In [4]:
# Directory holding the lemmatized XML files (the file name stem is the work id).
source_path = "../data/large_data/lemmatized-xmls/"
# Peek at ten directory entries; os.listdir returns them in arbitrary order.
os.listdir(source_path)[:10]

['8209.xml',
 '11823.xml',
 '9537.xml',
 '12666.xml',
 '11811.xml',
 '14817.xml',
 '11332.xml',
 '52.xml',
 '7160.xml',
 '14403.xml']

In [5]:
# Parse one sample work as a smoke test. The file is pinned by name because
# os.listdir() returns entries in arbitrary order, so picking by position
# would not select the same work on every run or machine.
filename = "10339.xml"
metadata, sentences = parse_lemmatized_cc(os.path.join(source_path, filename))

In [6]:
# Inspect the first ten sentence records of the sample work.
sentences[:10]

[{'work_id': '10339',
  'sent_id': 0,
  'sent_text': 'Praefatio in canonum collectionem',
  'token_data': [{'token_text': 'Praefatio',
    'lemma': 'praefatio',
    'pos': 'NOUN',
    'ref': {'div_pid': '10339:1', 'parent_pid': '10339:1;1', 'sent_n': '1'},
    'char_start': 0,
    'char_end': 9},
   {'token_text': 'in',
    'lemma': 'in',
    'pos': 'ADP',
    'ref': {'div_pid': '10339:1', 'parent_pid': '10339:1;1', 'sent_n': '1'},
    'char_start': 10,
    'char_end': 12},
   {'token_text': 'canonum',
    'lemma': 'canon',
    'pos': 'NOUN',
    'ref': {'div_pid': '10339:1', 'parent_pid': '10339:1;1', 'sent_n': '1'},
    'char_start': 13,
    'char_end': 20},
   {'token_text': 'collectionem',
    'lemma': 'collectio',
    'pos': 'NOUN',
    'ref': {'div_pid': '10339:1', 'parent_pid': '10339:1;1', 'sent_n': '1'},
    'char_start': 21,
    'char_end': 33}]},
 {'work_id': '10339',
  'sent_id': 1,
  'sent_text': 'Beatissimo atque apostolico viro pontifici domno papae Victori tertio et omn

In [7]:
# Header metadata extracted for the sample work.
metadata

{'title': 'Praefatio in canonum collectionem',
 'author': 'Deusdedit cardinalis',
 'author_viaf': 'http://viaf.org/viaf/7776051',
 'author_viaf_id': '7776051',
 'edition': 'early modern edition, no apparatus',
 'editor': 'Jacques-Paul Migne',
 'publisher': 'J. P. Migne',
 'pub_place': 'Parisiis',
 'pub_date': '1854',
 'series_title': 'Patrologia Latina, vol. 150',
 'series_id': 'DeuCar.PrInCaC',
 'source': 'Patrologia Latina',
 'work_id': '10339',
 'prelemmatized': True}

In [20]:
# Convert every XML file in source_path to a JSON file of sentence records
# and collect the per-work header metadata.
target_jsons = "/srv/data/corpus-corporum/sents_jsons_dicts/"
#target_jsons = "../data/test_sents_jsons/"
os.makedirs(target_jsons, exist_ok=True)
lemmatized_metadata = []
failed = []            # names of files that could not be converted
failure_reasons = {}   # file name -> repr() of the exception that was raised
# sorted(): os.listdir() order is arbitrary; sorting keeps the metadata rows
# in the same order on every run.
for filename in sorted(os.listdir(source_path)):
    stem, ext = os.path.splitext(filename)
    if ext.lower() != ".xml":
        # skip stray entries (sub-directories, checkpoints, ...)
        continue
    try:
        metadata, sentences = parse_lemmatized_cc(os.path.join(source_path, filename))
        with open(os.path.join(target_jsons, stem + ".json"), "w", encoding="utf-8") as f:
            json.dump(sentences, f, ensure_ascii=False, indent=2)
        lemmatized_metadata.append(metadata)
    except Exception as exc:
        # Best effort: remember the file and why it failed, then carry on.
        # Catching Exception (not a bare except) lets Ctrl-C stop the run.
        failed.append(filename)
        failure_reasons[filename] = repr(exc)

In [21]:
# One row per successfully converted work, built from the collected metadata dicts.
lemmatized_metadata_df = pd.DataFrame(lemmatized_metadata)

In [22]:
# Number of works that were converted successfully.
len(lemmatized_metadata_df)

7690

In [23]:
# Write the per-work metadata table to CSV without the DataFrame index column.
# NOTE(review): the "notes" values are Python lists — confirm their CSV form suits downstream readers.
lemmatized_metadata_df.to_csv("../data/lemmatized_metadata.csv", index=False)