In [92]:
import requests
import re
import pandas as pd
import xml.etree.ElementTree as ET
import json
import os
import sys
import matplotlib.pyplot as plt
from collections import Counter
import xml.etree.ElementTree as ET
from typing import List, Dict, Any, Optional
import re
import unicodedata
import pandas as pd

# Raise pandas' display limits (cell width and number of columns, both to 500)
# so the long dictionary texts and wide frames shown below are not elided.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library


In [93]:
# (1) read the service-account key file; the context manager guarantees the
#     file handle is closed even if the JSON is malformed
with open(os.path.expanduser("../../../ServiceAccountsKey.json"), encoding="utf-8") as key_file:
    file_data = json.load(key_file)
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(
    ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)

# Open the target spreadsheet by its sharing URL.
ruland_gs = gc.open_by_url(
    "https://docs.google.com/spreadsheets/d/1od248fJpNbwMO8IVZQVO3xnzrmmTe68yS9XWajyuk5o/edit?usp=sharing")

In [94]:
# Define the URL for the XML file
url = "https://raw.githubusercontent.com/sarahalang/alchemical-dictionaries/refs/heads/main/Ruland1612/Ruland.xml"

# Fetch the XML file from the URL.
# - timeout: without one, requests waits indefinitely on a stalled connection
# - raise_for_status: fail here on 4xx/5xx instead of handing an error page
#   to the XML parser in a later cell
response = requests.get(url, timeout=30)
response.raise_for_status()

In [95]:
# Parse the downloaded bytes into an element tree; `root` is the document
# element and `tree` wraps that same element.
xml_content = response.content
root = ET.fromstring(xml_content)
tree = ET.ElementTree(root)
# Prefix -> namespace URI mapping passed to every find/findall call below.
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}


In [96]:
# Every TEI <entry> element anywhere below the root, in document order.
entries = list(root.iterfind('.//tei:entry', TEI_NS))

In [43]:


# ---------------------------
# Compact cleaning & getters
# ---------------------------

P_LEADPUNCT = re.compile(r'^[\s,;:.]+')
P_JOIN_WRAP = re.compile(r'(?<=[A-Za-z])-\s+(?=[a-z])')   # ar- gentum -> argentum
P_WS        = re.compile(r'\s+')
P_CLAUSE    = re.compile(r'\s*(?:[.;:](?:\s|$)|\bid est\b)\s*', flags=re.I)

def clean(s: str | None) -> str | None:
    """Dehyphenate, normalize whitespace, and tidy punctuation spacing."""
    if not isinstance(s, str):
        return None
    # remove soft hyphen & wrap markers
    s = s.replace("\xad", "").replace("¬", "")
    # join likely line-wrap hyphens (ar- gentum -> argentum)
    s = P_JOIN_WRAP.sub('', s)
    # normalize whitespace early
    s = P_WS.sub(' ', s).strip()
    # punctuation tidy:
    #  - remove space before . , ; : ! ?
    s = re.sub(r'\s+([.,;:!?])', r'\1', s)
    #  - ensure a space after comma when missing (letters only, conservative, unicode-safe; don't touch decimals)
    s = re.sub(r'(?<!\d),([^\W\d_])', r', \1', s)
    #  - fix brackets spacing
    s = re.sub(r'([(\[])\s+', r'\1', s)   # no space after opening
    s = re.sub(r'\s+([)\]])', r'\1', s)   # no space before closing
    # re-normalize spaces
    s = P_WS.sub(' ', s).strip()
    return s or None

def text(el) -> str | None:
    """Return the cleaned concatenation of all text inside `el`, or None when `el` is None."""
    return None if el is None else clean(''.join(el.itertext()))

def ascii_fold(s: str | None) -> str | None:
    if s is None:
        return None
    s_nfkd = unicodedata.normalize('NFKD', s)
    return ''.join(c for c in s_nfkd if not unicodedata.combining(c)).lower()

# --- new helper: prefer <orth> children inside lemma form --------------------

def parse_lemma_form(entry):
    """
    Read the lemma from the entry's first <form type='lemma'>.

    When that form contains non-empty <orth> descendants, the first one is the
    lemma and the rest are returned as variants. Without usable <orth>
    elements the cleaned text of the whole form is the lemma.

    Returns a 3-tuple (lemma, source, extra_variants):
      - (orth[0], "lemma-orth", orth[1:]) when <orth> text exists
      - (form text, "lemma", [])          when only the form text exists
      - (None, None, [])                  when there is no lemma form or it is empty
    """
    form_el = entry.find('.//tei:form[@type="lemma"]', TEI_NS)
    if form_el is None:
        return None, None, []

    # cleaned, non-empty <orth> texts in document order
    orth_texts = []
    for orth_el in form_el.iterfind('.//tei:orth', TEI_NS):
        cleaned = text(orth_el)
        if cleaned:
            orth_texts.append(cleaned)

    if orth_texts:
        primary, *others = orth_texts
        return primary, "lemma-orth", others

    # no <orth>: use the whole form text (may still contain commas etc.)
    whole_form = text(form_el)
    if not whole_form:
        return None, None, []
    return whole_form, "lemma", []

def translations_under(parent):
    """
    Collect one record per <cit> descendant of `parent` that has a non-empty
    first <quote>.

    Each record is {"type": cit/@type, "xml_lang": cit/@xml:lang, "quote": cleaned quote text}.
    """
    records = []
    for cit in parent.iterfind('.//tei:cit', TEI_NS):
        quote_text = text(cit.find('.//tei:quote', TEI_NS))
        if not quote_text:
            continue  # <cit> without a usable <quote>
        records.append({
            "type": cit.attrib.get('type'),
            "xml_lang": cit.attrib.get('{http://www.w3.org/XML/1998/namespace}lang'),
            "quote": quote_text,
        })
    return records

# ---------------------------
# Lemma guessers (compact)
# ---------------------------

# Captures everything after the first dash of an entry's @n value.
ID_HEAD_RE = re.compile(r'^[^-]+-(.+)$')
# Connective words that must not be accepted as a lemma on their own.
STOP_LEMMA_TOKENS = {"vel", "id est", "idest", "i.e.", "sive"}

def head_from_n(nval: str | None) -> str | None:
    """
    Derive a headword from an entry's @n value.

    Takes the part after the first dash, keeps what precedes the first comma,
    turns remaining dashes into spaces and cleans the result. Returns None
    when `nval` is empty or contains no dash-separated tail.
    """
    if not nval:
        return None
    matched = ID_HEAD_RE.match(nval)
    if matched is None:
        return None
    before_comma, _, _ = matched.group(1).partition(',')
    return clean(before_comma.replace('-', ' '))

def head_from_sense(s_el) -> str | None:
    """
    Return the cleaned leading chunk of a sense's text: everything before the
    first comma, semicolon, period or 'id est' (case-insensitive). None when
    the sense has no text.
    """
    flat = text(s_el)
    if flat:
        leading, *_ = re.split(r'\s*(?:,|;|\.|\bid est\b)\s*', flat, maxsplit=1, flags=re.I)
        return clean(leading)
    return None

def pick_variant_head(variant_els):
    """
    Return the first usable headword among the given variant elements.

    For each element, the text before the first comma/semicolon/period is the
    candidate; it is accepted when non-empty and not (case-insensitively) one
    of STOP_LEMMA_TOKENS. Returns None when no element qualifies.
    """
    for variant_el in variant_els:
        flat = text(variant_el)
        if flat:
            candidate = re.split(r'\s*(?:,|;|\.)\s*', flat, maxsplit=1)[0].strip()
            if candidate and candidate.lower() not in STOP_LEMMA_TOKENS:
                return candidate
    return None

def guess_lemma(entry):
    """
    Guess a lemma for `entry` by trying progressively weaker sources.

    Order: lemma form text, phrase form text, first usable variant head,
    head of the first <sense>, head derived from @n. Returns
    (lemma, source_label) for the first source that yields a non-empty value,
    or (None, None) when none does.
    """
    # (label, producer) pairs; producers are called lazily, in priority order
    strategies = (
        ("lemma",      lambda: text(entry.find('.//tei:form[@type="lemma"]', TEI_NS))),
        ("phrase",     lambda: text(entry.find('.//tei:form[@type="phrase"]', TEI_NS))),
        ("variant",    lambda: pick_variant_head(entry.findall('.//tei:form[@type="variant"]', TEI_NS))),
        ("sense-head", lambda: head_from_sense(entry.find('.//tei:sense', TEI_NS))),
        ("n-head",     lambda: head_from_n(entry.attrib.get('n'))),
    )
    for label, produce in strategies:
        candidate = produce()
        if candidate:
            return candidate, label
    return None, None

# ---------------------------
# Sense definition extraction
# ---------------------------

def merge_defs_under(s_el) -> str | None:
    """
    Join the cleaned texts of all <def> descendants of `s_el` with ' ; ' and
    clean the result. Returns None when `s_el` is None or no <def> has text.
    """
    if s_el is None:
        return None
    def_texts = []
    for def_el in s_el.iterfind('.//tei:def', TEI_NS):
        piece = text(def_el)
        if piece:
            def_texts.append(piece)
    return clean(' ; '.join(def_texts)) if def_texts else None

def prose_before_first_cit(s_el) -> str | None:
    """
    Return the first clause of the text that precedes the first direct <cit>
    child of `s_el`.

    The element's own leading text plus the text and tail of every child up
    to (not including) the first <cit> is cleaned. If it contains 'id est',
    it is cut only at the first . ; : that is followed by whitespace or end of
    string; otherwise it is cut at the first P_CLAUSE boundary. Leading
    punctuation is stripped. Returns None when nothing remains.
    """
    if s_el is None:
        return None

    cit_tag = "{%s}cit" % TEI_NS['tei']
    chunks = [s_el.text] if s_el.text else []
    for node in s_el:
        if node.tag == cit_tag:
            break
        chunks.append(''.join(node.itertext()))
        if node.tail:
            chunks.append(node.tail)

    prose = clean(''.join(chunks))
    if not prose:
        return None

    # with 'id est' present the comma appositions are kept: cut only at . ; :
    contains_id_est = re.search(r'\bid\s+est\b', prose, flags=re.I) is not None
    if contains_id_est:
        first_clause = re.split(r'\s*[.;:](?:\s|$)', prose, maxsplit=1)[0]
    else:
        first_clause = P_CLAUSE.split(prose, maxsplit=1)[0]

    first_clause = clean(first_clause)
    if not first_clause:
        return None
    trimmed = P_LEADPUNCT.sub('', first_clause).strip()
    return trimmed if trimmed else None

def extract_sense_def(s_el) -> str | None:
    """Definition of a sense: merged <def> texts if any, else the prose before the first <cit>, else None."""
    merged = merge_defs_under(s_el)
    if merged:
        return merged
    fallback = prose_before_first_cit(s_el)
    return fallback if fallback else None

# ---------------------------
# Build entries_df (1 row per TEI <entry>)
# ---------------------------

def build_entries_df(entries):
    """
    Build the entry-level table: one row per TEI <entry>.

    Parameters
    ----------
    entries : iterable of xml.etree.ElementTree.Element
        The <entry> elements to tabulate. May be empty.

    Returns
    -------
    pandas.DataFrame
        Columns: ID (@n), Type (@type), XML_ID (@xml:id), Lemma, LemmaSource,
        target_canonical (ascii_fold of Lemma), Phrase, Variants, Notes,
        Senses (list of {index, def, raw, translations}), SenseDef, SenseRaw,
        Translations, EntryLevelTranslations, SenseCount, and
        EntryIndexPerLemma (1-based running number of the entry within its
        Lemma group, missing lemmas forming one group).
        For empty input an empty frame with these columns is returned.
    """
    # Fixed column order; also gives an empty result its schema.
    columns = [
        "ID", "Type", "XML_ID", "Lemma", "LemmaSource", "target_canonical",
        "Phrase", "Variants", "Notes", "Senses", "SenseDef", "SenseRaw",
        "Translations", "EntryLevelTranslations", "SenseCount",
    ]

    rows = []
    for entry in entries:
        entry_id   = entry.attrib.get('n')
        entry_type = entry.attrib.get('type')
        xml_id     = entry.attrib.get('{http://www.w3.org/XML/1998/namespace}id')

        # --- lemma: prefer <orth> tokens if available
        lemma_primary, lemma_src, lemma_orth_variants = parse_lemma_form(entry)
        if lemma_primary:
            lemma = lemma_primary
        else:
            # fallback to the existing guessers
            lemma, lemma_src = guess_lemma(entry)

        # trim trivial trailing punct on the lemma only
        if lemma:
            lemma = re.sub(r'[\s,;:.]+$', '', lemma).strip() or None

        # variants: standard <form type="variant"> plus extra <orth> (except the lemma itself)
        variants = [t for t in (text(v) for v in entry.findall('.//tei:form[@type="variant"]', TEI_NS)) if t]
        for v in (lemma_orth_variants or []):
            if v and v != lemma and v not in variants:
                variants.append(v)

        # Normalize casing: keep each word's first character, lower-case the rest.
        # Guarded because the lemma may legitimately be missing (None) here.
        if lemma:
            lemma = " ".join(w[0] + w[1:].lower() for w in lemma.split())

        notes = [t for t in (text(n) for n in entry.findall('.//tei:note', TEI_NS)) if t]

        senses_list = []
        for i, s in enumerate(entry.findall('.//tei:sense', TEI_NS), start=1):
            senses_list.append({
                "index": i,
                "def": extract_sense_def(s),
                "raw": text(s),
                "translations": translations_under(s)
            })

        # <cit> elements that are direct children of the entry or of its <dictScrap>
        entry_level_trans = [
            {
                "type": c.attrib.get('type'),
                "xml_lang": c.attrib.get('{http://www.w3.org/XML/1998/namespace}lang'),
                "quote": q
            }
            for xp in ['./tei:cit', './tei:dictScrap/tei:cit']
            for c in entry.findall(xp, TEI_NS)
            for qel in [c.find('.//tei:quote', TEI_NS)]
            for q in [text(qel)]
            if q
        ]

        rows.append({
            "ID": entry_id,
            "Type": entry_type,
            "XML_ID": xml_id,
            "Lemma": lemma,
            "LemmaSource": lemma_src,
            "target_canonical": ascii_fold(lemma),
            "Phrase": text(entry.find('.//tei:form[@type="phrase"]', TEI_NS)),
            "Variants": variants,
            "Notes": notes,
            "Senses": senses_list,
            "SenseDef": [s["def"] for s in senses_list],
            "SenseRaw": [s["raw"] for s in senses_list],
            "Translations": [s["translations"] for s in senses_list],
            "EntryLevelTranslations": entry_level_trans,
            "SenseCount": len(senses_list),
        })

    # Passing `columns` keeps the schema even when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    if rows:
        df["EntryIndexPerLemma"] = df.groupby("Lemma", dropna=False).cumcount() + 1
    else:
        df["EntryIndexPerLemma"] = pd.Series(dtype="int64")
    return df

In [44]:
# Build the entry-level table from the parsed TEI entries (one row per <entry>).
entries_df = build_entries_df(entries)

In [49]:
def build_lexeme_df(entries_df: pd.DataFrame, max_text_len: int | None = 1000) -> pd.DataFrame:
    """
    Collapse entry-level rows into lexeme-level rows (one row per (Lemma, LemmaCanonical)).

    Columns produced (per lemma):
      - Lemma, LemmaCanonical
      - entry_ids, entry_count
      - types, variants, notes
      - senses_flat            : list of dicts {entry_id, sense_idx, def, raw, translations}
      - sense_count
      - sense_def_all          : list[str], len == sense_count, defs with fallback to raw
      - sense_def_strict_all   : list[Optional[str]], original defs (may contain None)
      - sense_raw_all          : list[str], len == sense_count ("" if missing)
      - translations_all       : flattened list of dicts (entry-level + per-sense)
    Truncation:
      - If max_text_len is not None, truncate strings in definition-like fields to that length.
    """
    def _uniq(seq):
        seen = set(); out=[]
        for x in seq:
            if x is not None and x not in seen:
                seen.add(x); out.append(x)
        return out

    def _truncate_text(s: str | None, n: int) -> str | None:
        if not isinstance(s, str): return s
        return s if len(s) <= n else s[: n - 1] + "…"

    def _truncate_list_str(lst, n: int):
        if not isinstance(lst, list): return lst
        return [(_truncate_text(x, n) if isinstance(x, str) else x) for x in lst]

    g = entries_df.groupby(["Lemma","target_canonical"], dropna=False)
    rows = []

    for (lemma, target_canonical), grp in g:
        entry_ids = grp["ID"].tolist()

        # Flatten senses and collect strict/filled defs in one pass (preserve order)
        senses_flat = []
        sense_def_strict_all = []
        sense_raw_all = []

        for _, r in grp.iterrows():
            eid = r["ID"]
            for s in (r.get("Senses") or []):
                d  = s.get("def")
                rw = s.get("raw")
                senses_flat.append({
                    "entry_id": eid,
                    "sense_idx": s.get("index"),
                    "def": d,
                    "raw": rw,
                    "translations": s.get("translations"),
                })
                sense_def_strict_all.append(d)               # may be None
                sense_raw_all.append(rw if isinstance(rw, str) else "")  # keep length aligned

        sense_count = len(senses_flat)

        # Aligned defs: fill None with the corresponding raw (guaranteed same length)
        sense_def_all = [
            (d if isinstance(d, str) and d != "" else sense_raw_all[i])
            for i, d in enumerate(sense_def_strict_all)
        ]

        # Translations: entry-level + per-sense
        translations_all = []
        for _, r in grp.iterrows():
            eid = r["ID"]
            for t in (r.get("EntryLevelTranslations") or []):
                if isinstance(t, dict):
                    translations_all.append({"entry_id": eid, **t})
            for s in (r.get("Senses") or []):
                for t in (s.get("translations") or []):
                    if isinstance(t, dict):
                        translations_all.append({"entry_id": eid, **t})

        # Optional truncation (safe for Sheets)
        if isinstance(max_text_len, int) and max_text_len > 0:
            sense_def_all         = _truncate_list_str(sense_def_all, max_text_len)
            sense_def_strict_all  = _truncate_list_str(sense_def_strict_all, max_text_len)
            sense_raw_all         = _truncate_list_str(sense_raw_all, max_text_len)
            # also trim inside senses_flat["def"/"raw"] to avoid huge JSON dumps if you serialize
            for it in senses_flat:
                if isinstance(it.get("def"), str):
                    it["def"] = _truncate_text(it["def"], max_text_len)
                if isinstance(it.get("raw"), str):
                    it["raw"] = _truncate_text(it["raw"], max_text_len)

        rows.append({
            "Lemma":  lemma, # " ".join([w[0] + w[1:].lower() for w in lemma.split()]),
            "target_canonical": target_canonical,
            "entry_ids": entry_ids,
            "entry_count": len(entry_ids),
            "types": _uniq(grp["Type"].tolist()),
            "variants": _uniq([v for lst in grp["Variants"].tolist() if isinstance(lst, list) for v in lst]),
            "notes": _uniq([n for lst in grp["Notes"].tolist()    if isinstance(lst, list) for n in lst]),
            "senses_flat": senses_flat,
            "sense_count": sense_count,
            "sense_def_all": sense_def_all,                 # filled -> aligned with raw
            "sense_def_strict_all": sense_def_strict_all,   # original (may have None)
            "sense_raw_all": sense_raw_all,                 # aligned ("" if missing)
            "translations_all": translations_all,
        })

    return pd.DataFrame(rows).sort_values(
        ["sense_count","entry_count","target_canonical"],
        ascending=[False, False, True]
    ).reset_index(drop=True)

In [70]:
# Collapse the entry-level rows into lexeme-level rows and preview the first five.
lexeme_df  = build_lexeme_df(entries_df)
lexeme_df.head(5)

Unnamed: 0,Lemma,target_canonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_def_strict_all,sense_raw_all,translations_all
0,Naphtha,naphtha,[Ruland1612-Naphtha],1,[N],[],[],"[{'entry_id': 'Ruland1612-Naphtha', 'sense_idx': 1, 'def': 'id est, pir, ignis.; id est, flatus minerae, aurichalcum; id est, nitrum, vel natron.; est sal est ex humiditate nebulae saepius in pratis supra Ein saltz auß lapides decidentis, calore solis induratum. Feuchtigkeit deß Nebelo sich setzent vnd von der Sonnen erhartend.; sunt medicamenta mortem pellentia, & vitam conseruantia, Necrolica.; est ars illicita, quę cum mortuis operabatur olim, vt cum astra manifestabantur apud mortuos.; e...",15,"[id est, pir, ignis.; id est, flatus minerae, aurichalcum; id est, nitrum, vel natron.; est sal est ex humiditate nebulae saepius in pratis supra Ein saltz auß lapides decidentis, calore solis induratum. Feuchtigkeit deß Nebelo sich setzent vnd von der Sonnen erhartend.; sunt medicamenta mortem pellentia, & vitam conseruantia, Necrolica.; est ars illicita, quę cum mortuis operabatur olim, vt cum astra manifestabantur apud mortuos.; est pellicula, vel oculis, vel auriculis, infantum adnascens...","[id est, pir, ignis.; id est, flatus minerae, aurichalcum; id est, nitrum, vel natron.; est sal est ex humiditate nebulae saepius in pratis supra Ein saltz auß lapides decidentis, calore solis induratum. Feuchtigkeit deß Nebelo sich setzent vnd von der Sonnen erhartend.; sunt medicamenta mortem pellentia, & vitam conseruantia, Necrolica.; est ars illicita, quę cum mortuis operabatur olim, vt cum astra manifestabantur apud mortuos.; est pellicula, vel oculis, vel auriculis, infantum adnascens...","[Iudaicum bitumen diximus esse speciem Naphthae, & non naphtham ipsam. Quoniam naphtha nihil aliud est, quam petroleum, oleum illud liquidum, quod ad nos, licet corruptum, ferunt agyrtae, Dioscor. lib. I. cap. 84. naphtham a Babyloniis vocari dicit bituminis colamen, & esse seu inueniri candidam & nigram. Naphtha candida est petroleum nostrum. 
Nigra, quae est forsitan illa Amiani pisca & glutinosa, bitumme persimilis, quae flagrans nulla alia re exstingui potest, quam puluere iniecto. Sicuti...","[{'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'die Steinkolen'}, {'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Mosch.'}, {'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'was Schmertzen stillet vnd onempfindtlich macht'}, {'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'also genandt.'}, {'entry_id': 'Ruland1612-Naphtha', 'type': 'translation..."
1,Nitrum,nitrum,"[None, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum]",9,[N],[],[],"[{'entry_id': None, 'sense_idx': 1, 'def': 'Baurach, sal petrosum, nitrum à German', 'raw': 'Baurach, sal petrosum, nitrum à German. SalpeterBergsaltz /quasi sal petrae, cal. & siccum in 2. gradu. Estque nitrum res cognata sali, & quae est species salis. Hinc sal lucidum,', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'SalpeterBergsaltz'}]}, {'entry_id': 'Ruland1612-Nitrum', 'sense_idx': 1, 'def': 'Fossile natiuum', 'raw': 'Fossile natiuum. Huius species similes sunt f...",9,"[Baurach, sal petrosum, nitrum à German, Fossile natiuum, ist ein Geschlecht Saltzes auch ist es borax, vnd etli herley Geschlecht armeni, etliches schwartz reiliches roth etliches saphirinum., est salsugo quaedam ex vrinarum congregationibus in sal praeparata., Baurach, Niter, Sago, Tincar, id est, sal albus, scil. baurac.]","[Baurach, sal petrosum, nitrum à German, Fossile natiuum, None, est salsugo quaedam ex vrinarum congregationibus in sal praeparata., None, None, Sago, Tincar, id est, sal albus, scil. baurac.]","[Baurach, sal petrosum, nitrum à German. SalpeterBergsaltz /quasi sal petrae, cal. & siccum in 2. gradu. Estque nitrum res cognata sali, & quae est species salis. Hinc sal lucidum,, Fossile natiuum. Huius species similes sunt fali fossili. Sunt aut burach. Artificiale fastitiu, ficuti ipse flos parietis, seu petrae, vel flos falis. Vide denillo etiam Gal. Coquitur hoc ex sale vel aqua falis. 1. Armenum, ab armenta patria, & optimum. 2. Ab burach. 
album simile omnino fali fossili, saporem hab...","[{'entry_id': None, 'type': 'translation', 'xml_lang': 'de', 'quote': 'SalpeterBergsaltz'}, {'entry_id': 'Ruland1612-Nitrum', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist ein Geschlecht Saltzes auch ist es borax, vnd etli herley Geschlecht armeni, etliches schwartz reiliches roth etliches saphirinum.'}, {'entry_id': 'Ruland1612-Nitrum', 'type': 'translation', 'xml_lang': 'de', 'quote': 'vnd etli herley Geschlecht armeni, etliches schwartz reiliches roth etliches'}, {'entry_id': 'Ru..."
2,Baurac,baurac,"[Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac]",8,[B],[],[],"[{'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': None, 'raw': 'wird vom Schaum deß Glases oder Nitere.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'wird vom Schaum deß Glases oder Nitere.'}]}, {'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': 'id est, sal gemmae.', 'raw': 'id est, sal gemmae.', 'translations': []}, {'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': 'i. e. id est sapphirium lithargyrum albificatum', 'raw': 'i. e. id est sapphiri...",8,"[wird vom Schaum deß Glases oder Nitere., id est, sal gemmae., i. e. id est sapphirium lithargyrum albificatum, id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est, attinckar., id est, quodlibet genus salsuginis., ist Pflaster damit die Fugen vinbstrichen werden / als deß Golds mit Eyerclar und Mehl, ist auch gesaltzen nitrum Armenisch und schwefelisch]","[None, id est, sal gemmae., i. e. id est sapphirium lithargyrum albificatum, id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est, attinckar., id est, quodlibet genus salsuginis., None, None]","[wird vom Schaum deß Glases oder Nitere., id est, sal gemmae., i. e. 
id est sapphirium lithargyrum albificatum, weiß gemacht: Glett., id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est, attinckar., id est, quodlibet genus salsuginis., ist Pflaster damit die Fugen vinbstrichen werden / als deß Golds mit Eyerclar und Mehl, ist auch gesaltzen nitrum Armenisch und schwefelisch]","[{'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'wird vom Schaum deß Glases oder Nitere.'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'weiß gemacht: Glett'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist Pflaster damit die Fugen vinbstrichen werden / als deß Golds mit Eyerclar und Mehl'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': ..."
3,Elixir,elixir,"[Ruland1612-Elixir,-Elei,, Ruland1612-Elixir, Ruland1612-Elixir, Ruland1612-Elixir, Ruland1612-Elixir, Ruland1612-Elixir-vel-Xir]",6,[E],"[Elei, Xir]",[],"[{'entry_id': 'Ruland1612-Elixir,-Elei,', 'sense_idx': 1, 'def': 'id est, Medicina.', 'raw': 'id est, Medicina.', 'translations': []}, {'entry_id': 'Ruland1612-Elixir', 'sense_idx': 1, 'def': 'est fermentum, cuius minima quantitas sui generis ingentem molem in pastam sibi similem penetrando conglutinat', 'raw': 'est fermentum, cuius minima quantitas sui generis ingentem molem in pastam sibi similem penetrando conglutinat. Pasta h. I. est spiritus vitę hominum, internusque vitae balsamus: est...",5,"[id est, Medicina., est fermentum, cuius minima quantitas sui generis ingentem molem in pastam sibi similem penetrando conglutinat, est species ex pluribus diuersi generis simplicium specieb. composita., ist das ferment, Vrheb Deyssendi Sawrteig / ist dassso auß Wasser wirdt. Dann Yxir ist Wasser / es ist ein gefaͤrbet Wasser gemenget mit den Coͤrpern / ist auch der weisse SteinOli vnd Puluer / dann das ist alles ein Ding / heist auch der Sehatz heist auch prima materia, st imperfect Elixit,...","[id est, Medicina., est fermentum, cuius minima quantitas sui generis ingentem molem in pastam sibi similem penetrando conglutinat, est species ex pluribus diuersi generis simplicium specieb. composita., None, Ist eine durchdringende Artzney / vund wirdt gemacht von den vegetabilibus, id est, quatuor spiritibus, mit Zuthuung eines corporis, welches corpus ist ein ferment einer Artzney.]","[id est, Medicina., est fermentum, cuius minima quantitas sui generis ingentem molem in pastam sibi similem penetrando conglutinat. Pasta h. I. est spiritus vitę hominum, internusque vitae balsamus: est interna corporis conseruattix in eo statu, in quo illud reperit. Elixir autem externus arte balsamus ab externis conquisitus ac praeparatus in spagiricum fermentum. 
Vel:, est species ex pluribus diuersi generis simplicium specieb. composita. Itaque cum oleum terebinthi componitur cum floribus...","[{'entry_id': 'Ruland1612-Elixir', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist das ferment, Vrheb Deyssendi Sawrteig / ist dassso auß Wasser wirdt. Dann Yxir ist Wasser / es ist ein gefaͤrbet Wasser gemenget mit den Coͤrpern / ist auch der weisse SteinOli vnd Puluer / dann das ist alles ein Ding / heist auch der Sehatz heist auch prima materia, st imperfect Elixit, wans Q perfect und bereit ist/ so ist es freylich ein Schatz. Lacinius spricht: Elixir wirdt gemacht auß dreyen / Sol..."
4,Magnesia,magnesia,"[Ruland1612-Magnesia, Ruland1612-Magnesia, Ruland1612-Magnesia, Ruland1612-Magnesia, Ruland1612-Magnesia]",5,[M],[],[],"[{'entry_id': 'Ruland1612-Magnesia', 'sense_idx': 1, 'def': 'id est, testudo vel sulphul.', 'raw': 'id est, testudo vel sulphul.', 'translations': []}, {'entry_id': 'Ruland1612-Magnesia', 'sense_idx': 1, 'def': 'Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist.', 'raw': 'Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mis...",5,"[id est, testudo vel sulphul., Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist., Ist ein Stein in der Krafft deß Marcasitae, oder es ist ein Stein dem haematiti gleich., id est, foemina., Wißmath oder taub Ertz.]","[id est, testudo vel sulphul., Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist., Ist ein Stein in der Krafft deß Marcasitae, oder es ist ein Stein dem haematiti gleich., id est, foemina., None]","[id est, testudo vel sulphul., Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist., Ist ein Stein in der Krafft deß Marcasitae, oder es ist ein Stein dem haematiti gleich., id est, foemina., Wißmath oder taub Ertz.]","[{'entry_id': 'Ruland1612-Magnesia', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist.'}, {'entry_id': 'Ruland1612-Magnesia', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Ist ein Stein in der Krafft deß Marcasitae, oder 
es ist ein Stein dem haematiti gleich.'}, {'entry_id': 'Ruland1612-Magnesia', 'type': 'translation', 'xml_l..."


In [74]:
# Show every lexeme row whose target_canonical value also occurs on another row.
lexeme_df.loc[lexeme_df["target_canonical"].duplicated(keep=False)]

Unnamed: 0,Lemma,target_canonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_def_strict_all,sense_raw_all,translations_all
2,Baurac,baurac,"[Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac]",8,[B],[],[],"[{'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': None, 'raw': 'wird vom Schaum deß Glases oder Nitere.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'wird vom Schaum deß Glases oder Nitere.'}]}, {'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': 'id est, sal gemmae.', 'raw': 'id est, sal gemmae.', 'translations': []}, {'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': 'i. e. id est sapphirium lithargyrum albificatum', 'raw': 'i. e. id est sapphiri...",8,"[wird vom Schaum deß Glases oder Nitere., id est, sal gemmae., i. e. id est sapphirium lithargyrum albificatum, id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est, attinckar., id est, quodlibet genus salsuginis., ist Pflaster damit die Fugen vinbstrichen werden / als deß Golds mit Eyerclar und Mehl, ist auch gesaltzen nitrum Armenisch und schwefelisch]","[None, id est, sal gemmae., i. e. id est sapphirium lithargyrum albificatum, id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est, attinckar., id est, quodlibet genus salsuginis., None, None]","[wird vom Schaum deß Glases oder Nitere., id est, sal gemmae., i. e. 
id est sapphirium lithargyrum albificatum, weiß gemacht: Glett., id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est, attinckar., id est, quodlibet genus salsuginis., ist Pflaster damit die Fugen vinbstrichen werden / als deß Golds mit Eyerclar und Mehl, ist auch gesaltzen nitrum Armenisch und schwefelisch]","[{'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'wird vom Schaum deß Glases oder Nitere.'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'weiß gemacht: Glett'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist Pflaster damit die Fugen vinbstrichen werden / als deß Golds mit Eyerclar und Mehl'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': ..."
49,Bäurac,baurac,"[Ruland1612-Bäurac, Ruland1612-Bäurac]",2,[B],[],[],"[{'entry_id': 'Ruland1612-Bäurac', 'sense_idx': 1, 'def': 'id est, bores.', 'raw': 'id est, bores.', 'translations': []}, {'entry_id': 'Ruland1612-Bäurac', 'sense_idx': 1, 'def': 'est genus falis Alzedi, & diabetis', 'raw': 'est genus falis Alzedi, & diabetis, id est, testudo argenti viui', 'translations': []}]",2,"[id est, bores., est genus falis Alzedi, & diabetis]","[id est, bores., est genus falis Alzedi, & diabetis]","[id est, bores., est genus falis Alzedi, & diabetis, id est, testudo argenti viui]",[]


In [73]:
import pandas as pd
from itertools import chain
from typing import List, Any, Dict

def merge_by_target_canonical(lexeme_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merge rows that share the same `target_canonical` into a single row.

    - Chooses a representative `Lemma` per canonical: highest entry_count, then shortest, then lexicographic
    - Collects every distinct `Lemma` of the group in `lemmas_all`; an existing
      `lemmas_all` column (e.g. from an earlier merge) is folded in, so calling
      the function again on its own output does not lose spelling variants
    - Flattens list-like columns; `types`, `variants` and `notes` are also
      de-duplicated (order preserved), the other lists keep repeated items
    - Recomputes `entry_count` and `sense_count` from the flattened lists
    - De-duplicates `translations_all` by (entry_id, type, xml_lang, quote)

    Parameters
    ----------
    lexeme_df : pd.DataFrame
        Must contain the columns `Lemma`, `target_canonical`, `entry_count`,
        `entry_ids`, `types`, `variants`, `notes`, `senses_flat`,
        `sense_def_all`, `sense_def_strict_all`, `sense_raw_all` and
        `translations_all`. Any other column is dropped from the result.

    Returns
    -------
    pd.DataFrame
        One row per `target_canonical`, sorted by `sense_count` (desc),
        `entry_count` (desc) and `target_canonical` (asc), with a fresh
        0..n-1 index. An empty input yields an empty frame that still has
        the output columns.
    """
    out_columns = [
        "Lemma", "lemmas_all", "target_canonical", "entry_ids", "entry_count",
        "types", "variants", "notes", "senses_flat", "sense_count",
        "sense_def_all", "sense_def_strict_all", "sense_raw_all",
        "translations_all",
    ]

    def _uniq_order(seq: List[Any]) -> List[Any]:
        """Drop None/NaN and repeated items while keeping first-seen order."""
        seen = set()
        out = []
        for x in seq:
            if x is None:
                continue
            if isinstance(x, float) and pd.isna(x):
                continue
            if x not in seen:
                seen.add(x)
                out.append(x)
        return out

    def _as_list(x: Any) -> List[Any]:
        """Lists pass through, missing values become [], anything else is wrapped."""
        if isinstance(x, list):
            return x
        if x is None or (isinstance(x, float) and pd.isna(x)):
            return []
        return [x]

    def _flatten_listlike(series: pd.Series) -> List[Any]:
        """Concatenate the per-row lists of a column into one flat list."""
        return list(chain.from_iterable(_as_list(v) for v in series))

    # Without this guard an empty input builds a column-less frame below and
    # the final sort_values() raises KeyError.
    if lexeme_df.empty:
        return pd.DataFrame(columns=out_columns)

    has_lemmas_all = "lemmas_all" in lexeme_df.columns
    groups: List[Dict[str, Any]] = []

    for canon, grp in lexeme_df.groupby("target_canonical", dropna=False):
        # Representative Lemma: highest entry_count -> shortest -> lexicographic
        grp2 = grp.assign(_len=grp["Lemma"].astype(str).str.len())
        rep = grp2.sort_values(
            by=["entry_count", "_len", "Lemma"],
            ascending=[False, True, True]
        ).iloc[0]

        # Previously recorded variants come first so that a repeated merge
        # reproduces the same list instead of collapsing it to the
        # representative lemma.
        lemma_pool = grp["Lemma"].tolist()
        if has_lemmas_all:
            lemma_pool = _flatten_listlike(grp["lemmas_all"]) + lemma_pool
        lemmas_all = _uniq_order(lemma_pool)

        # Flatten list-like columns safely
        entry_ids            = _flatten_listlike(grp["entry_ids"])
        types                = _uniq_order(_flatten_listlike(grp["types"]))
        variants             = _uniq_order(_flatten_listlike(grp["variants"]))
        notes                = _uniq_order(_flatten_listlike(grp["notes"]))
        senses_flat          = _flatten_listlike(grp["senses_flat"])
        sense_def_all        = _flatten_listlike(grp["sense_def_all"])
        sense_def_strict_all = _flatten_listlike(grp["sense_def_strict_all"])
        sense_raw_all        = _flatten_listlike(grp["sense_raw_all"])
        translations_all     = _flatten_listlike(grp["translations_all"])

        # Deduplicate translations by (entry_id, type, xml_lang, quote);
        # the dicts themselves are unhashable, hence the tuple key.
        seen_t = set()
        unique_translations = []
        for t in translations_all:
            key = (t.get("entry_id"), t.get("type"), t.get("xml_lang"), t.get("quote"))
            if key not in seen_t:
                seen_t.add(key)
                unique_translations.append(t)
        translations_all = unique_translations

        groups.append({
            "Lemma": rep["Lemma"],
            "lemmas_all": lemmas_all,
            "target_canonical": canon,
            "entry_ids": entry_ids,
            "entry_count": len(entry_ids),
            "types": types,
            "variants": variants,
            "notes": notes,
            "senses_flat": senses_flat,
            "sense_count": len(senses_flat),
            "sense_def_all": sense_def_all,
            "sense_def_strict_all": sense_def_strict_all,
            "sense_raw_all": sense_raw_all,
            "translations_all": translations_all,
        })

    out = pd.DataFrame(groups, columns=out_columns).sort_values(
        ["sense_count", "entry_count", "target_canonical"],
        ascending=[False, False, True]
    ).reset_index(drop=True)

    return out

In [75]:
# Collapse rows that share one canonical form into a single row each.
lexeme_df = lexeme_df.pipe(merge_by_target_canonical)

# Sanity check: after merging no canonical form may occur twice,
# so this selection is expected to display an empty frame.
lexeme_df.loc[lexeme_df["target_canonical"].duplicated(keep=False)]

Unnamed: 0,Lemma,lemmas_all,target_canonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_def_strict_all,sense_raw_all,translations_all


In [76]:
# Reproducible sample (fixed seed) of ten lexemes that carry more than one sense.
print(lexeme_df.loc[lexeme_df["sense_count"].gt(1)].sample(n=10, random_state=0))

               Lemma         lemmas_all target_canonical  \
120          Solutio          [Solutio]          solutio   
33   Quinta essentia  [Quinta essentia]  quinta essentia   
45           Asseres          [Asseres]          asseres   
126            Testa            [Testa]            testa   
83    Lapis Iudaicus   [Lapis Iudaicus]   lapis iudaicus   
8              Pilum            [Pilum]            pilum   
52            Cabala           [Cabala]           cabala   
104             Nuba             [Nuba]             nuba   
22            Iaspis           [Iaspis]           iaspis   
113          Pyrites          [Pyrites]          pyrites   

                                                                                entry_ids  \
120                                              [Ruland1612-Solutio, Ruland1612-Solutio]   
33   [Ruland1612-Quinta-essentia, Ruland1612-Quinta-essentia, Ruland1612-Quinta-essentia]   
45                                               [Ruland1612

In [77]:
# Number of rows currently in the lexeme table.
len(lexeme_df)

2975

In [78]:
# Number of whitespace-separated tokens in each canonical form.
lexeme_df["target_len"] = lexeme_df["target_canonical"].str.split().str.len()

In [79]:
# Upper bound on the number of tokens a headword may have to be kept.
MAX_N = 3
# Keep only rows whose canonical form has at most MAX_N tokens; copy so that
# later column assignments operate on an independent frame.
lexeme_df = lexeme_df.loc[lexeme_df["target_len"].le(MAX_N)].copy()
len(lexeme_df)

2850

In [103]:
# Keep only rows whose re-lemmatized form has at most MAX_N tokens (rows with a
# missing value compare as False and are dropped).
# NOTE(review): `target_relemmatized` is created by the lemmatizer cell further
# down in this notebook, so on a fresh top-to-bottom run this cell must be
# executed after that one.
# `.copy()` makes the result an independent frame, so the in-place changes
# that follow do not act on a slice of the previous frame.
lexeme_df = lexeme_df[lexeme_df["target_relemmatized"].str.split().str.len() <= MAX_N].copy()

In [104]:
# Renumber rows 0..n-1 after filtering. Rebinding instead of `inplace=True`
# avoids mutating a frame that may still be a slice of the unfiltered table.
lexeme_df = lexeme_df.reset_index(drop=True)

In [105]:
# Distribution of the number of senses per lexeme.
lexeme_df["sense_count"].value_counts()

sense_count
1     2692
2       95
3       25
0       22
4       10
5        2
15       1
10       1
9        1
Name: count, dtype: int64

In [82]:
# Latin preprocessing uses the `tomela` module, which lives outside this
# repository. Its directory is resolved from the current working directory
# via `relative_path`, so the notebook has to be started from its own folder
# (or `relative_path` adjusted).
current_working_directory = os.getcwd()
relative_path = '../../latin-preprocessing/'  # change according to your location...
module_path = os.path.abspath(os.path.join(current_working_directory, relative_path))
# Prepend the directory to the import search path; the membership test keeps
# re-runs of this cell from adding it twice.
if module_path not in sys.path:
    sys.path.insert(0, module_path)
# Now import the module
import tomela

In [83]:
# Smoke test: lemmatize a two-word term, leaving out punctuation tokens.
doc = tomela.nlp("Lapis philosophorum")
lemmatized_string = " ".join(t.lemma_ for t in doc if t.pos_ != "PUNCT").lower()
lemmatized_string

'lapis philosophus'

In [84]:
def lemmatizer(string):
    """Return a lower-cased, lemmatized version of a headword.

    Multi-word strings are run through `tomela.nlp`; every token whose
    `pos_` is not "PUNCT" contributes its `lemma_` (or its surface text when
    the lemma is empty), and the pieces are joined with single spaces.
    Single-word strings are only lower-cased, not lemmatized.

    Parameters
    ----------
    string : str or None
        The headword. Missing or non-string values (None, NaN) are accepted.

    Returns
    -------
    str or None
        The normalised form, or None when the input is missing or the result
        is an empty string.
    """
    # The former `string is not None` test fell through to `string.lower()`
    # in the else-branch and raised AttributeError for None / NaN.
    if not isinstance(string, str):
        return None
    if len(string.split()) > 1:
        doc = tomela.nlp(string)
        lemmatized = " ".join(
            (t.lemma_ or t.text).lower()  # fall back to the surface form
            for t in doc
            if t.pos_ != "PUNCT"
        )
    else:
        lemmatized = string.lower()
    return lemmatized or None
# Derive the re-lemmatized form of every representative lemma.
lexeme_df["target_relemmatized"] = lexeme_df["Lemma"].map(lemmatizer)

In [85]:
# Reproducible sample (fixed seed) of 20 rows for visual inspection.
lexeme_df.sample(20, random_state=0)

Unnamed: 0,Lemma,lemmas_all,target_canonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_def_strict_all,sense_raw_all,translations_all,target_len,target_relemmatized
793,Calor scobis,[Calor scobis],calor scobis,[Ruland1612-Calor-scobis],1,[C],[],[],"[{'entry_id': 'Ruland1612-Calor-scobis', 'sense_idx': 1, 'def': 'aut scoriae ferri superiore intensior & aperto vicinior est, quando ex scobe vel scoria ferri res propelluntur', 'raw': 'aut scoriae ferri superiore intensior & aperto vicinior est, quando ex scobe vel scoria ferri res propelluntur.', 'translations': []}]",1,"[aut scoriae ferri superiore intensior & aperto vicinior est, quando ex scobe vel scoria ferri res propelluntur]","[aut scoriae ferri superiore intensior & aperto vicinior est, quando ex scobe vel scoria ferri res propelluntur]","[aut scoriae ferri superiore intensior & aperto vicinior est, quando ex scobe vel scoria ferri res propelluntur.]",[],2,calor scobis
179,Adec,[Adec],adec,[Ruland1612-Adec],1,[A],[],[],"[{'entry_id': 'Ruland1612-Adec', 'sense_idx': 1, 'def': 'id est, lac acetosum', 'raw': 'id est, lac acetosum, Sawer Milch.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Sawer Milch'}]}]",1,"[id est, lac acetosum]","[id est, lac acetosum]","[id est, lac acetosum, Sawer Milch.]","[{'entry_id': 'Ruland1612-Adec', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Sawer Milch'}]",1,adec
667,Battitura aeris,[Battitura aeris],battitura aeris,[Ruland1612-Battitura-aeris],1,[B],[],[],"[{'entry_id': 'Ruland1612-Battitura-aeris', 'sense_idx': 1, 'def': 'id est, squama metallorum', 'raw': 'id est, squama metallorum; der Metallen Schupen. Graece, lepidos, Arabice Cubel, vel Tubel, vel fuligo, Germanice Hamerschlag', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'der Metallen Schupen'}, {'type': 'translation', 'xml_lang': 'de', 'quote': 'Hamerschlag'}]}]",1,"[id est, squama metallorum]","[id est, squama metallorum]","[id est, squama metallorum; der Metallen Schupen. Graece, lepidos, Arabice Cubel, vel Tubel, vel fuligo, Germanice Hamerschlag]","[{'entry_id': 'Ruland1612-Battitura-aeris', 'type': 'translation', 'xml_lang': 'de', 'quote': 'der Metallen Schupen'}, {'entry_id': 'Ruland1612-Battitura-aeris', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Hamerschlag'}]",2,battiturus aer
601,Aurum coticula experiri,[Aurum coticula experiri],aurum coticula experiri,"[Ruland1612-Aurum-coticula-experiri-,-Anstreichen-.]",1,[A],[],[],"[{'entry_id': 'Ruland1612-Aurum-coticula-experiri-,-Anstreichen-.', 'sense_idx': 1, 'def': None, 'raw': ', Anstreichen.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Anstreichen'}]}]",1,"[, Anstreichen.]",[None],"[, Anstreichen.]","[{'entry_id': 'Ruland1612-Aurum-coticula-experiri-,-Anstreichen-.', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Anstreichen'}]",3,aurum coticula experior
652,Balsamum,[Balsamum],balsamum,[Ruland1612-Balsamum],1,[B],[],[],"[{'entry_id': 'Ruland1612-Balsamum', 'sense_idx': 1, 'def': 'Balsamus est substantia corporum a putrefactione conseruans. Est internus & externus. Internus in homine est temperatissima quaedam substantia, non amara, non dulcis, non acerba, neque sal minerale, sed sal liquoris, quod potentissime a putrefactione praeseruat: dicit & naturę corporis gluten temperatissimum. Breuius sic: Est salis interioris liquor suum acorruptione corpus tutissime praeseruans naturaliter. Externus est terebinthi...",1,"[Balsamus est substantia corporum a putrefactione conseruans. Est internus & externus. Internus in homine est temperatissima quaedam substantia, non amara, non dulcis, non acerba, neque sal minerale, sed sal liquoris, quod potentissime a putrefactione praeseruat: dicit & naturę corporis gluten temperatissimum. Breuius sic: Est salis interioris liquor suum acorruptione corpus tutissime praeseruans naturaliter. Externus est terebinthina nullam vim ignis passa, sed digesta: Parac. I. de tereb. ...","[Balsamus est substantia corporum a putrefactione conseruans. Est internus & externus. Internus in homine est temperatissima quaedam substantia, non amara, non dulcis, non acerba, neque sal minerale, sed sal liquoris, quod potentissime a putrefactione praeseruat: dicit & naturę corporis gluten temperatissimum. Breuius sic: Est salis interioris liquor suum acorruptione corpus tutissime praeseruans naturaliter. Externus est terebinthina nullam vim ignis passa, sed digesta: Parac. I. de tereb. ...","[Balsamus est substantia corporum a putrefactione conseruans. Est internus & externus. Internus in homine est temperatissima quaedam substantia, non amara, non dulcis, non acerba, neque sal minerale, sed sal liquoris, quod potentissime a putrefactione praeseruat: dicit & naturę corporis gluten temperatissimum. Breuius sic: Est salis interioris liquor suum acorruptione corpus tutissime praeseruans naturaliter. 
Externus est terebinthina nullam vim ignis passa, sed digesta: Parac. I. de tereb. ...","[{'entry_id': 'Ruland1612-Balsamum', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Ein Erhalter aller Cos per fuͤr faulung ond zerbrechung: Ist zweyerley sein inwendiger ond außwendiger: Der inwend gisi ein temperirt Ding /w der sanri noch suͤßsein resoluirts Satz od Safft deß Saltze im Menschendaß für faͤu ung behuͤlet In natuͤrlichen Dingen ist daußwendig Baͤtzam der alle Co per fuͤr faͤulung vuͤz rstoͤrus gbeheits alß da Schwebel und dergleichen. Item ein jedes dist. Illi. tO.iauhein...",1,balsamum
2196,Quartura,[Quartura],quartura,"[Ruland1612-Quartura,-quartatio]",1,[Q],[quartatio],[],"[{'entry_id': 'Ruland1612-Quartura,-quartatio', 'sense_idx': 1, 'def': 'summum auri examen, hac via videlicet, vt argenti partes nouem, ad omnia auri commisceantur liquatione per ignem', 'raw': 'summum auri examen, hac via videlicet, vt argenti partes nouem, ad omnia auri commisceantur liquatione per ignem: deinde aqua stygia vel forti resoluantur ambo simul. Argentum totum in aquam confluit, auro solo subsidente pulueris instar spadicei nigricantisque coloris, Die hoͤchste Prob deß Goldes.'...",1,"[summum auri examen, hac via videlicet, vt argenti partes nouem, ad omnia auri commisceantur liquatione per ignem]","[summum auri examen, hac via videlicet, vt argenti partes nouem, ad omnia auri commisceantur liquatione per ignem]","[summum auri examen, hac via videlicet, vt argenti partes nouem, ad omnia auri commisceantur liquatione per ignem: deinde aqua stygia vel forti resoluantur ambo simul. Argentum totum in aquam confluit, auro solo subsidente pulueris instar spadicei nigricantisque coloris, Die hoͤchste Prob deß Goldes.]","[{'entry_id': 'Ruland1612-Quartura,-quartatio', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Die hoͤchste Prob deß Goldes.'}]",1,quartura
1652,Lamare,[Lamare],lamare,[Ruland1612-Lamare],1,[L],[],[],"[{'entry_id': 'Ruland1612-Lamare', 'sense_idx': 1, 'def': None, 'raw': 'ist Schwefel', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'ist Schwefel'}]}]",1,[ist Schwefel],[None],[ist Schwefel],"[{'entry_id': 'Ruland1612-Lamare', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist Schwefel'}]",1,lamare
2143,Praeseruatiua,[Praeseruatiua],praeseruatiua,[Ruland1612-Praeseruatiua],1,[P],[],[],"[{'entry_id': 'Ruland1612-Praeseruatiua', 'sense_idx': 1, 'def': 'sunt medicamenta vitam a suis inimicis & corruptionib. defendentia, die Ding so den Menschen vor Kranckheiten behuͤten.', 'raw': 'sunt medicamenta vitam a suis inimicis & corruptionib. defendentia, die Ding so den Menschen vor Kranckheiten behuͤten.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'die Ding so den Menschen vor Kranckheiten behuͤten.'}]}]",1,"[sunt medicamenta vitam a suis inimicis & corruptionib. defendentia, die Ding so den Menschen vor Kranckheiten behuͤten.]","[sunt medicamenta vitam a suis inimicis & corruptionib. defendentia, die Ding so den Menschen vor Kranckheiten behuͤten.]","[sunt medicamenta vitam a suis inimicis & corruptionib. defendentia, die Ding so den Menschen vor Kranckheiten behuͤten.]","[{'entry_id': 'Ruland1612-Praeseruatiua', 'type': 'translation', 'xml_lang': 'de', 'quote': 'die Ding so den Menschen vor Kranckheiten behuͤten.'}]",1,praeseruatiua
1212,Eleuatio per latus,[Eleuatio per latus],eleuatio per latus,"[Ruland1612-Eleuatio-per-latus-est,-quando-humor-ex-vase-in-latus-inclinato-prolicitur.]",1,[E],[],[],"[{'entry_id': 'Ruland1612-Eleuatio-per-latus-est,-quando-humor-ex-vase-in-latus-inclinato-prolicitur.', 'sense_idx': 1, 'def': 'est, quando humor ex vase in latus inclinato prolicitur.', 'raw': 'est, quando humor ex vase in latus inclinato prolicitur.', 'translations': []}]",1,"[est, quando humor ex vase in latus inclinato prolicitur.]","[est, quando humor ex vase in latus inclinato prolicitur.]","[est, quando humor ex vase in latus inclinato prolicitur.]",[],3,eleuatio per latus
2552,Succubus,[Succubus],succubus,[Ruland1612-Succubus],1,[S],[],[],"[{'entry_id': 'Ruland1612-Succubus', 'sense_idx': 1, 'def': 'spiritus nocturnus muliebris, illudens hominibus inter dormiendum ac si rem haberent cum Venere', 'raw': 'spiritus nocturnus muliebris, illudens hominibus inter dormiendum ac si rem haberent cum Venere. Masculus imponens mulieribus incubus dicitur, der Nacht Geist, so die Frawen plagt: Incubus, Ephialtes, faunorum in quiete ludibrium, suppressio nocturna, pnigalion, pnigamon, Epibole, Epialus, lyphe, Euopa, lemures nocturni, nigri,...",1,"[spiritus nocturnus muliebris, illudens hominibus inter dormiendum ac si rem haberent cum Venere]","[spiritus nocturnus muliebris, illudens hominibus inter dormiendum ac si rem haberent cum Venere]","[spiritus nocturnus muliebris, illudens hominibus inter dormiendum ac si rem haberent cum Venere. Masculus imponens mulieribus incubus dicitur, der Nacht Geist, so die Frawen plagt: Incubus, Ephialtes, faunorum in quiete ludibrium, suppressio nocturna, pnigalion, pnigamon, Epibole, Epialus, lyphe, Euopa, lemures nocturni, nigri, Nachtfrawzunßler-Bitebaus Bommeter Werwolff/ Nachtmaͤnnlini Schroͤtiel die Marel Nachtmaͤnule.]",[],1,succubus


In [86]:
# Show the rows whose re-lemmatized form still contains a literal "v".
# `na=False` keeps the mask strictly boolean when the form is missing
# (lemmatizer may return None); otherwise the mask contains NaN and boolean
# indexing raises. `regex=False` matches the plain character.
lexeme_df[lexeme_df["target_relemmatized"].str.contains("v", na=False, regex=False)]

Unnamed: 0,Lemma,lemmas_all,target_canonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_def_strict_all,sense_raw_all,translations_all,target_len,target_relemmatized
37,Vectis,[Vectis],vectis,"[Ruland1612-Vectis, Ruland1612-Vectis, Ruland1612-Vectis]",3,[V],[],[],"[{'entry_id': 'Ruland1612-Vectis', 'sense_idx': 1, 'def': None, 'raw': 'Handthabe', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Handthabe'}]}, {'entry_id': 'Ruland1612-Vectis', 'sense_idx': 1, 'def': None, 'raw': 'Haspelhorn', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Haspelhorn'}]}, {'entry_id': 'Ruland1612-Vectis', 'sense_idx': 1, 'def': None, 'raw': 'Schien-Schienholtz.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote'...",3,"[Handthabe, Haspelhorn, Schien-Schienholtz.]","[None, None, None]","[Handthabe, Haspelhorn, Schien-Schienholtz.]","[{'entry_id': 'Ruland1612-Vectis', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Handthabe'}, {'entry_id': 'Ruland1612-Vectis', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Haspelhorn'}, {'entry_id': 'Ruland1612-Vectis', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Schien-Schienholtz.'}]",1,vectis
38,Vrina,[Vrina],vrina,"[Ruland1612-Vrina, Ruland1612-Vrina, Ruland1612-Vrina]",3,[V],[],[],"[{'entry_id': 'Ruland1612-Vrina', 'sense_idx': 1, 'def': 'est sal resolutum, generatum in hepate descendens per suum emunctorium, vt superfluum excrementum salis a natura pulsum.', 'raw': 'est sal resolutum, generatum in hepate descendens per suum emunctorium, vt superfluum excrementum salis a natura pulsum.', 'translations': []}, {'entry_id': 'Ruland1612-Vrina', 'sense_idx': 1, 'def': 'vini est acetum, aut vrina hominis vinum perpetuo bibentis, Essig oder Harn eines Menschen, der Wein trinc...",3,"[est sal resolutum, generatum in hepate descendens per suum emunctorium, vt superfluum excrementum salis a natura pulsum., vini est acetum, aut vrina hominis vinum perpetuo bibentis, Essig oder Harn eines Menschen, der Wein trinckt., texi, Weinstein wasser]","[est sal resolutum, generatum in hepate descendens per suum emunctorium, vt superfluum excrementum salis a natura pulsum., vini est acetum, aut vrina hominis vinum perpetuo bibentis, Essig oder Harn eines Menschen, der Wein trinckt., texi, Weinstein wasser]","[est sal resolutum, generatum in hepate descendens per suum emunctorium, vt superfluum excrementum salis a natura pulsum., vini est acetum, aut vrina hominis vinum perpetuo bibentis, Essig oder Harn eines Menschen, der Wein trinckt., texi, Weinstein wasser. Vrina puerorum, Mercurius auß den Metallen gezogen. Vsfida, scoria auri, Goldt reinigung. Vsifur, Zinober auß sulphure und Mercurio. Vsifur sagen etlicheses sey Mininen / Vafur idem. Vsrub, Vrsub, Vzurup, id est, Saturnus. Vsurat, Zinn. V...","[{'entry_id': 'Ruland1612-Vrina', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Essig oder Harn eines Menschen, der Wein trinckt'}]",1,vrina
130,Vena,[Vena],vena,"[Ruland1612-Vena, Ruland1612-Vena]",2,[V],[],[],"[{'entry_id': 'Ruland1612-Vena', 'sense_idx': 1, 'def': None, 'raw': 'Fletz', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Fletz'}]}, {'entry_id': 'Ruland1612-Vena', 'sense_idx': 1, 'def': None, 'raw': 'Gang', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Gang'}]}]",2,"[Fletz, Gang]","[None, None]","[Fletz, Gang]","[{'entry_id': 'Ruland1612-Vena', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Fletz'}, {'entry_id': 'Ruland1612-Vena', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Gang'}]",1,vena
132,Venus,[Venus],venus,"[Ruland1612-Venus, Ruland1612-Venus]",2,[V],[],[],"[{'entry_id': 'Ruland1612-Venus', 'sense_idx': 1, 'def': 'cuprum murpur idem', 'raw': ', cuprum murpur idem.', 'translations': []}, {'entry_id': 'Ruland1612-Venus', 'sense_idx': 1, 'def': None, 'raw': 'der vnreine lapis, die Materi.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'der vnreine lapis, die Materi.'}]}]",2,"[cuprum murpur idem, der vnreine lapis, die Materi.]","[cuprum murpur idem, None]","[, cuprum murpur idem., der vnreine lapis, die Materi.]","[{'entry_id': 'Ruland1612-Venus', 'type': 'translation', 'xml_lang': 'de', 'quote': 'der vnreine lapis, die Materi.'}]",1,venus
199,Aduersa venae pars,[Aduersa venae pars],aduersa venae pars,"[Ruland1612-Aduersa-venae-pars-,-Gegengrund-.]",1,[A],[],[],"[{'entry_id': 'Ruland1612-Aduersa-venae-pars-,-Gegengrund-.', 'sense_idx': 1, 'def': None, 'raw': 'Gegengrund', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Gegengrund'}]}]",1,[Gegengrund],[None],[Gegengrund],"[{'entry_id': 'Ruland1612-Aduersa-venae-pars-,-Gegengrund-.', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Gegengrund'}]",3,aduerto vena pars
1369,Fornax prima vitrariorum,[Fornax prima vitrariorum],fornax prima vitrariorum,"[Ruland1612-Fornax-prima-vitrariorum-,-&-etiam-ea,-in-qua-excoquuntur-venae,-Schmeltzoffen-.]",1,[F],[],[],"[{'entry_id': 'Ruland1612-Fornax-prima-vitrariorum-,-&-etiam-ea,-in-qua-excoquuntur-venae,-Schmeltzoffen-.', 'sense_idx': 1, 'def': '& etiam ea, in qua excoquuntur venae,', 'raw': '& etiam ea, in qua excoquuntur venae, Schmeltzoffen.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Schmeltzoffen'}]}]",1,"[& etiam ea, in qua excoquuntur venae,]","[& etiam ea, in qua excoquuntur venae,]","[& etiam ea, in qua excoquuntur venae, Schmeltzoffen.]","[{'entry_id': 'Ruland1612-Fornax-prima-vitrariorum-,-&-etiam-ea,-in-qua-excoquuntur-venae,-Schmeltzoffen-.', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Schmeltzoffen'}]",3,fornax primus vitrariorum
2166,Pulverisiren,[Pulverisiren],pulverisiren,[Ruland1612-Pulverisiren],1,[P],[],[],"[{'entry_id': 'Ruland1612-Pulverisiren', 'sense_idx': 1, 'def': None, 'raw': 'zu Sand machen', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'zu Sand machen'}]}]",1,[zu Sand machen],[None],[zu Sand machen],"[{'entry_id': 'Ruland1612-Pulverisiren', 'type': 'translation', 'xml_lang': 'de', 'quote': 'zu Sand machen'}]",1,pulverisiren
2205,Qvandros lapis,[Qvandros lapis],qvandros lapis,[Ruland1612-QVandros-lapis],1,[Q],[],[],"[{'entry_id': 'Ruland1612-QVandros-lapis', 'sense_idx': 1, 'def': 'est seu gemma, quae reperitur in cerebro & capite vulturis, coloris candidi', 'raw': 'est seu gemma, quae reperitur in cerebro & capite vulturis, coloris candidi: quae replet mamillas lacte, & contra nociuos casus valere dicitur.', 'translations': []}]",1,"[est seu gemma, quae reperitur in cerebro & capite vulturis, coloris candidi]","[est seu gemma, quae reperitur in cerebro & capite vulturis, coloris candidi]","[est seu gemma, quae reperitur in cerebro & capite vulturis, coloris candidi: quae replet mamillas lacte, & contra nociuos casus valere dicitur.]",[],2,qvandrus lapis
2323,Sal vsuale,[Sal vsuale],sal vsuale,[Ruland1612-Sal-vsuale],1,[S],[],[],"[{'entry_id': 'Ruland1612-Sal-vsuale', 'sense_idx': 1, 'def': 'id est, Sal panis.', 'raw': 'id est, Sal panis.', 'translations': []}]",1,"[id est, Sal panis.]","[id est, Sal panis.]","[id est, Sal panis.]",[],2,sal vsualis
2562,Sulphur vnum,[Sulphur vnum],sulphur vnum,[Ruland1612-Sulphur-vnum],1,[S],[],[],"[{'entry_id': 'Ruland1612-Sulphur-vnum', 'sense_idx': 1, 'def': 'ex tribus illis primis omnium rerum, oleosum illud, quod ardet', 'raw': 'ex tribus illis primis omnium rerum, oleosum illud, quod ardet: der Schwefellwas Oelig, das in allen dingen brennt. Ist eins aus den Ersten.Sulphur vitriolatum, est sulphur a vitriolo decoctione tractum per aquam communem, cui innatat, Schwefel auß dem Vitriolgezogen.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Schwefel auß dem V...",1,"[ex tribus illis primis omnium rerum, oleosum illud, quod ardet]","[ex tribus illis primis omnium rerum, oleosum illud, quod ardet]","[ex tribus illis primis omnium rerum, oleosum illud, quod ardet: der Schwefellwas Oelig, das in allen dingen brennt. Ist eins aus den Ersten.Sulphur vitriolatum, est sulphur a vitriolo decoctione tractum per aquam communem, cui innatat, Schwefel auß dem Vitriolgezogen.]","[{'entry_id': 'Ruland1612-Sulphur-vnum', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Schwefel auß dem Vitriolgezogen'}]",2,sulphur vnus


In [97]:
# Normalise spelling in both lookup columns: "v" -> "u" and "ij" -> "ii".
# The vectorised `.str.replace` propagates missing values instead of raising
# AttributeError the way `lambda x: x.replace(...)` does on None/NaN.
lexeme_df["target_canonical"] = (
    lexeme_df["target_canonical"]
    .str.replace("v", "u", regex=False)
    .str.replace("ij", "ii", regex=False)
)
lexeme_df["target_relemmatized"] = (
    lexeme_df["target_relemmatized"]
    .str.replace("v", "u", regex=False)
    .str.replace("ij", "ii", regex=False)
)

In [106]:
# conservative under Google’s 50k hard limit
MAX_CELL = 48000

def _clip(s: str, n: int = MAX_CELL) -> str:
    return s if len(s) <= n else (s[:n-1] + "…")

def _stringify(v) -> str:
    """Controlled stringification so lists/dicts don't explode."""
    if v is None or (isinstance(v, float) and np.isnan(v)):
        return ""
    if isinstance(v, (list, dict)):
        # compact JSON, unicode-friendly
        return json.dumps(v, ensure_ascii=False, separators=(',', ':'))
    return str(v)

def prepare_for_sheets(df: pd.DataFrame, *, drop_heavy: bool = False) -> pd.DataFrame:
    """
    Make a sheets-safe copy:
      - optional: drop very heavy/nested cols
      - stringify lists/dicts in a compact way
      - clip any cell > MAX_CELL
    """
    if drop_heavy:
        df = df.drop(columns=[
            # keep this list lean; tweak as you like
            "senses_flat", "Senses", "Translations", "EntryLevelTranslations"
        ], errors="ignore").copy()
    else:
        df = df.copy()

    # stringify then clip per-cell
    df = df.applymap(_stringify)
    df = df.applymap(lambda s: _clip(s, MAX_CELL))
    return df

# Build the upload copy: every column is kept (drop_heavy is left at its
# default of False); cells are serialized to strings and clipped.
df_for_sheet = prepare_for_sheets(lexeme_df)

In [None]:
# Add a worksheet (created with 1 row x 1 column) to the spreadsheet and
# write the prepared frame into it.
set_with_dataframe(
    ruland_gs.add_worksheet(title="lexeme_df_2025-11-07", rows=1, cols=1),
    df_for_sheet,
)

In [None]:
# Persist the lexeme table as Parquet (path is relative to the working directory).
lexeme_df.to_parquet("../data/ruland-dictionaries.parquet")

In [None]:
# Also export as CSV; no `index` argument is passed, so the row index is
# written as the first, unnamed column.
lexeme_df.to_csv("../data/ruland-dictionaries.csv")