In [86]:
import os
import re
from bs4 import BeautifulSoup
import duckdb

In [89]:
# Module-level DuckDB connection used by every database cell below.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
conn = duckdb.connect("/srv/data/grela_v0-2.duckdb")

In [69]:
import os
import re
from bs4 import BeautifulSoup, NavigableString

def map_cc_pos(pos_tag, token_text):
    """Map a source ("cc") POS tag to a coarse part-of-speech label.

    Rules are applied in this order; the first match wins:

    1. ``None`` -> ``"X"``.
    2. Prefix rules: ``V:`` -> VERB, ``N:`` -> NOUN, ``ADJ`` -> ADJ,
       ``ADV`` -> ADV, ``PREP`` -> ADP.
    3. Exact tags: ``CC`` -> CCONJ; ``CON``/``CS`` -> SCONJ;
       ``PRON``/``REL``/``DIMOS``/``POSS``/``DET`` -> PRON.
    4. Any tag containing ``NUM`` -> NUM.
    5. The unknown tag ``????`` on a token whose first character is
       upper-case -> PROPN.
    6. Everything else -> ``"X"``.

    Args:
        pos_tag: Raw tag string from the XML, or ``None`` when absent.
        token_text: Surface form of the token; only its first character is
            inspected, and it may be empty or ``None``.

    Returns:
        One of ``VERB``, ``NOUN``, ``ADJ``, ``ADV``, ``ADP``, ``CCONJ``,
        ``SCONJ``, ``PRON``, ``NUM``, ``PROPN`` or ``X``.
    """
    if pos_tag is None:
        return "X"

    # Order matters: e.g. a tag starting with "ADJ" that also contains "NUM"
    # is classified as ADJ because prefixes are tested before the NUM rule.
    prefix_rules = (
        ("V:", "VERB"),
        ("N:", "NOUN"),
        ("ADJ", "ADJ"),
        ("ADV", "ADV"),
        ("PREP", "ADP"),
    )
    for prefix, label in prefix_rules:
        if pos_tag.startswith(prefix):
            return label

    if pos_tag == "CC":
        return "CCONJ"
    if pos_tag in {"CON", "CS"}:
        return "SCONJ"
    if pos_tag in {"PRON", "REL", "DIMOS", "POSS", "DET"}:
        return "PRON"
    if "NUM" in pos_tag:
        return "NUM"

    # Only the unknown tag can reach PROPN: "N:" tags were already returned
    # as NOUN above, so the former `startswith("N:")` test here was dead code.
    if pos_tag == "????" and token_text and token_text[0].isupper():
        return "PROPN"
    return "X"

In [81]:
def parse_lemmatized_cc(filename):
    """Parse one lemmatized XML file into sentence and token records.

    The file is read as UTF-8 and parsed with BeautifulSoup's ``"xml"``
    parser. Every ``<div>`` carrying a ``cc:pid`` attribute is scanned for
    ``cc:s`` elements; a ``cc:s`` element is used only when it has both an
    ``n`` and a ``parent_pid`` attribute. Within it, each non-empty ``cc:w``
    element becomes one token.

    Args:
        filename: Path to the XML file. Its base name without extension is
            used as ``grela_id``.

    Returns:
        A ``(sentences, tokens)`` tuple of lists of dicts.

        Sentence dicts have keys ``sentence_id`` (``"<grela_id>_<position>"``),
        ``grela_id``, ``position`` (0-based counter over kept sentences),
        ``text`` and ``subwork_id`` (the part of ``cc:pid`` after the first
        ``":"``, or ``None`` when there is no colon).

        Token dicts have keys ``sentence_id``, ``grela_id``, ``token_text``
        (original casing), ``lemma`` (trailing digits stripped), ``pos``
        (mapped through ``map_cc_pos``), ``char_start`` and ``char_end``.
    """
    with open(filename, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "xml")

    grela_id = os.path.splitext(os.path.basename(filename))[0]
    sentences = []
    tokens = []
    position = 0

    for div in soup.find_all("div", attrs={"cc:pid": True}):
        pid = div.get("cc:pid")
        subwork_id = pid.split(":", 1)[1] if ":" in pid else None

        for s_elem in div.find_all("cc:s", recursive=True):
            if not s_elem.get("n") or not s_elem.get("parent_pid"):
                continue

            # `position` only advances once a sentence is kept, so the id
            # computed here is shared by the sentence and all of its tokens.
            sentence_id = f"{grela_id}_{position}"
            surface_forms = []
            token_data = []
            char_index = 0

            for w in s_elem.find_all("cc:w"):
                token_text = w.text.strip()
                if not token_text:
                    continue

                # Offsets index into the space-joined sentence text built
                # below: each token is followed by exactly one separator.
                start = char_index
                end = start + len(token_text)
                char_index = end + 1

                token_data.append({
                    "sentence_id": sentence_id,
                    "grela_id": grela_id,
                    "token_text": token_text,
                    "lemma": re.sub(r"\d+$", "", w.get("lemma") or ""),
                    "pos": map_cc_pos(w.get("pos"), token_text),
                    "char_start": start,
                    "char_end": end
                })
                surface_forms.append(token_text)

            if not surface_forms:
                continue

            # Soften all-caps headings for the sentence text only: tokens
            # longer than two characters keep their first character and get
            # the rest lower-cased; shorter tokens are left untouched.
            display_tokens = [
                t[0] + t[1:].lower() if len(t) > 2 else t
                for t in surface_forms
            ]
            sentences.append({
                "sentence_id": sentence_id,
                "grela_id": grela_id,
                "position": position,
                "text": " ".join(display_tokens),
                "subwork_id": subwork_id
            })
            tokens.extend(token_data)
            position += 1

    return sentences, tokens

In [82]:
# Try the parser on a single file and keep both result lists for inspection.
sentences, tokens = parse_lemmatized_cc("../data/cc_lemmatized_xmls/cc_10265.xml")

In [83]:
# Inspect the parsed sentence records.
# NOTE(review): this echoes the whole list; a slice (e.g. sentences[:5]) keeps output short.
sentences

[{'sentence_id': 'cc_10265_0',
  'grela_id': 'cc_10265',
  'position': 0,
  'text': 'Registri Liber Primus',
  'subwork_id': '1'},
 {'sentence_id': 'cc_10265_1',
  'grela_id': 'cc_10265',
  'position': 1,
  'text': 'Epistola Prima',
  'subwork_id': '1'},
 {'sentence_id': 'cc_10265_2',
  'grela_id': 'cc_10265',
  'position': 2,
  'text': 'AD Desiderium Abbatem',
  'subwork_id': '1'},
 {'sentence_id': 'cc_10265_3',
  'grela_id': 'cc_10265',
  'position': 3,
  'text': 'Nuntiat se invitum in demortui Alexandri pontificis locum suffectum',
  'subwork_id': '1'},
 {'sentence_id': 'cc_10265_4',
  'grela_id': 'cc_10265',
  'position': 4,
  'text': 'Rogat ut Deum pro se deprecetur et ad se quantocius veniat',
  'subwork_id': '1'},
 {'sentence_id': 'cc_10265_5',
  'grela_id': 'cc_10265',
  'position': 5,
  'text': 'Gregorius in Romanum pontificem electus Desiderio abbati monasterii Sancti Benedicti Montis Cassini salutem in Christo Jesu',
  'subwork_id': '1'},
 {'sentence_id': 'cc_10265_6',
  'gr

In [98]:
import os
import pandas as pd

def replace_grela_sentences_and_tokens_duckdb(xml_path):
    """Re-parse one XML file and replace its rows in the DuckDB tables.

    The work id (``grela_id``) is the file's base name without extension.
    All rows with that ``grela_id`` are deleted from ``sentences`` and
    ``tokens`` and the freshly parsed rows are inserted in their place.
    Delete and insert run inside a single transaction so that a failed
    insert does not leave the work without any rows.

    Uses the module-level ``conn`` connection and ``parse_lemmatized_cc``.

    Args:
        xml_path: Path to the lemmatized XML file.

    Raises:
        RuntimeError: If parsing yields no sentences or no tokens, or if any
            database statement fails (the original error is chained as
            ``__cause__``).
    """
    grela_id = os.path.splitext(os.path.basename(xml_path))[0]

    # Parse XML
    sentences, tokens = parse_lemmatized_cc(xml_path)

    if not sentences:
        raise RuntimeError(f"⚠️ No sentence data parsed from {xml_path}")
    if not tokens:
        raise RuntimeError(f"⚠️ No token data parsed from {xml_path}")

    # Convert to DataFrames
    sentences_df = pd.DataFrame(sentences)
    tokens_df = pd.DataFrame(tokens)

    # Register DataFrames so the SQL below can select from them by name
    conn.register("sentences_df", sentences_df)
    conn.register("tokens_df", tokens_df)

    transaction_open = False
    try:
        conn.begin()
        transaction_open = True

        # Delete existing entries
        conn.execute("DELETE FROM sentences WHERE grela_id = ?", (grela_id,))
        conn.execute("DELETE FROM tokens WHERE grela_id = ?", (grela_id,))

        # Insert new data from registered DataFrames
        conn.execute("""
            INSERT INTO sentences (sentence_id, grela_id, position, text, subwork_id)
            SELECT sentence_id, grela_id, position, text, subwork_id FROM sentences_df
        """)

        conn.execute("""
            INSERT INTO tokens (sentence_id, grela_id, token_text, lemma, pos, char_start, char_end)
            SELECT sentence_id, grela_id, token_text, lemma, pos, char_start, char_end FROM tokens_df
        """)

        conn.commit()
        transaction_open = False

        print(f"✅ Fast update: {grela_id} → {len(sentences)} sentences, {len(tokens)} tokens.")

    except Exception as e:
        if transaction_open:
            try:
                # Restore the rows removed by the DELETEs above.
                conn.rollback()
            except Exception:
                # The original failure is the one worth reporting; a failed
                # rollback must not replace it.
                pass
        raise RuntimeError(f"❌ DuckDB update failed for {grela_id}: {e}") from e

    finally:
        # Drop the temporary views so the connection does not keep the
        # (potentially large) DataFrames alive after the call.
        conn.unregister("sentences_df")
        conn.unregister("tokens_df")

In [94]:
# Schema migration: the sentence records carry a `subwork_id`, so the table
# needs a matching column. IF NOT EXISTS makes the cell safe to re-run.
query = "ALTER TABLE sentences ADD COLUMN IF NOT EXISTS subwork_id TEXT;"
conn.execute(query)

<duckdb.duckdb.DuckDBPyConnection at 0x76393d971cf0>

In [99]:
# Replace the stored sentences and tokens of one work with the re-parsed data.
replace_grela_sentences_and_tokens_duckdb(
    xml_path="../data/cc_lemmatized_xmls/cc_10265.xml"
)

✅ Fast update: cc_10265 → 5399 sentences, 130090 tokens.


In [100]:
# The INSERT into `tokens` does not fill `token_id`, so number this work's
# tokens here. Ordering the window by rowid makes the numbering follow
# physical row order instead of whatever order the scan happens to produce.
# NOTE(review): ids restart at 1 for each grela_id — confirm token_id is not
# expected to be unique across works.
grela_id = "cc_10265"
conn.execute("""
    UPDATE tokens
    SET token_id = subquery.new_id
    FROM (
        SELECT row_number() OVER (ORDER BY t.rowid) AS new_id, t.rowid AS real_rowid
        FROM tokens t
        WHERE t.grela_id = ?
    ) AS subquery
    WHERE tokens.rowid = subquery.real_rowid
""", (grela_id,))

<duckdb.duckdb.DuckDBPyConnection at 0x76393d971cf0>

In [101]:
# Fetch this work's sentences together with their stored embeddings.
# The work id is bound as a parameter (same `grela_id` as the cell above)
# rather than repeated as a literal inside the SQL.
register_df = conn.execute("""
    SELECT s.sentence_id, s.text, e.embedding
    FROM sentence_embeddings e
    JOIN sentences s ON e.sentence_id = s.sentence_id
    WHERE e.grela_id = ?
""", (grela_id,)).fetchdf()

In [102]:
# Preview the first rows of the sentence/embedding join.
register_df.head(5)

Unnamed: 0,sentence_id,text,embedding
0,cc_10265_0,Registri Liber Primus,"[-0.0411662794649601, -0.017861537635326385, 0..."
1,cc_10265_1,Epistola Prima,"[-0.039150845259428024, 0.04118065908551216, -..."
2,cc_10265_2,AD Desiderium Abbatem,"[-0.009886219166219234, -0.014916053973138332,..."
3,cc_10265_3,Nuntiat se invitum in demortui Alexandri ponti...,"[-0.008567786775529385, 0.05232088640332222, -..."
4,cc_10265_4,Rogat ut Deum pro se deprecetur et ad se quant...,"[-0.0074958037585020065, -0.022528640925884247..."


In [103]:
# Load every token row of this work; the work id is bound as a parameter
# (the `grela_id` set earlier) instead of being hard-coded in the SQL.
register_tokens_df = conn.execute("""
    SELECT t.*
    FROM tokens t
    WHERE t.grela_id = ?
""", (grela_id,)).fetchdf()

In [104]:
# Row count of the fetched tokens, for comparison with the count printed by the update.
len(register_tokens_df)

130090

In [105]:
# Spot-check the first token rows (text, lemma, pos, offsets, token_id).
register_tokens_df[:20]

Unnamed: 0,sentence_id,grela_id,token_text,lemma,pos,char_start,char_end,token_id
0,cc_10265_0,cc_10265,REGISTRI,????,PROPN,0,8,1
1,cc_10265_0,cc_10265,LIBER,liber,NOUN,9,14,2
2,cc_10265_0,cc_10265,PRIMUS,unus,ADJ,15,21,3
3,cc_10265_1,cc_10265,EPISTOLA,epistola,NOUN,0,8,4
4,cc_10265_1,cc_10265,PRIMA,unus,ADJ,9,14,5
5,cc_10265_2,cc_10265,AD,ad,ADP,0,2,6
6,cc_10265_2,cc_10265,DESIDERIUM,desiderium,NOUN,3,13,7
7,cc_10265_2,cc_10265,ABBATEM,abbas,NOUN,14,21,8
8,cc_10265_3,cc_10265,Nuntiat,nuntio,VERB,0,7,9
9,cc_10265_3,cc_10265,se,se,PRON,8,10,10


In [106]:
# Close the DuckDB connection now that all updates and checks are done.
conn.close()