In [1]:
import requests
import re
import pandas as pd
import xml.etree.ElementTree as ET
import json
import os
import sys
import matplotlib.pyplot as plt
from collections import Counter
import xml.etree.ElementTree as ET
from typing import List, Dict, Any, Optional

In [2]:
# Define the URL for the XML file
url = "https://raw.githubusercontent.com/sarahalang/alchemical-dictionaries/refs/heads/main/Ruland1612/Ruland.xml"

# Fetch the XML file from the URL
response = requests.get(url)

In [3]:
xml_content = response.content
tree = ET.ElementTree(ET.fromstring(xml_content))
root = tree.getroot()
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}


In [4]:
entries = root.findall('.//tei:entry', TEI_NS)

In [78]:
import re
import pandas as pd

# --- cleaning helpers ---

def dehyphenate(s: str | None) -> str | None:
    if not isinstance(s, str):
        return s
    s = s.replace("\xad", "").replace("¬", "")
    # join likely line-wrap hyphens: letter - whitespace + lowercase letter
    s = re.sub(r'(?<=[A-Za-z])-\s+(?=[a-z])', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s or None

def norm_text(el) -> str | None:
    if el is None:
        return None
    txt = ''.join(el.itertext())
    txt = re.sub(r'\s+', ' ', txt).strip()
    return dehyphenate(txt)

def translations_under(parent):
    translations = []
    for cit in parent.findall('.//tei:cit', TEI_NS):
        q = cit.find('.//tei:quote', TEI_NS)
        quote = norm_text(q)
        if quote:
            translations.append({
                "type": cit.attrib.get('type'),
                "xml_lang": cit.attrib.get('{http://www.w3.org/XML/1998/namespace}lang'),
                "quote": quote
            })
    return translations

# --- lemma guessers ---

ID_HEAD_RE = re.compile(r'^[^-]+-(.+)$')  # after first dash
SENSE_HEAD_SPLIT = re.compile(r'\s*(?:,|;|\.|\bid est\b)\s*', flags=re.I)

def head_from_n(nval: str | None) -> str | None:
    if not nval:
        return None
    m = ID_HEAD_RE.match(nval)
    if not m:
        return None
    head = m.group(1)
    # cut at first comma-like delimiter
    head = head.split(',')[0]
    head = head.replace('-', ' ')
    return dehyphenate(head)

def head_from_sense(s_el) -> str | None:
    s = norm_text(s_el)
    if not s:
        return None
    # take leading chunk before first comma/semicolon/period or 'id est'
    head = SENSE_HEAD_SPLIT.split(s, maxsplit=1)[0]
    return dehyphenate(head)

STOP_LEMMA_TOKENS = {"vel", "id est", "idest", "i.e.", "sive"}

def pick_variant_head(variant_els):
    for v in variant_els:
        vt = norm_text(v)
        if not vt:
            continue
        # split on delimiters and pick first chunk not in stopwords
        head = re.split(r'\s*(?:,|;|\.)\s*', vt, maxsplit=1)[0].strip()
        if head and head.lower() not in STOP_LEMMA_TOKENS:
            return head
    return None

def ascii_fold(s):
    import unicodedata
    if s is None: return None
    return ''.join(
        c for c in unicodedata.normalize('NFKD', s)
        if not unicodedata.combining(c)
    ).lower()

def trim_trailing_punct(s: str | None) -> str | None:
    if not s:
        return s
    # remove only trailing commas, periods, semicolons, colons
    s = re.sub(r'[\s,;:.]+$', '', s)
    return s.strip() or None

def guess_lemma(entry, TEI_NS):
    lemma_el  = entry.find('.//tei:form[@type="lemma"]', TEI_NS)
    if (lemma := norm_text(lemma_el)):
        return lemma, "lemma"

    phrase_el = entry.find('.//tei:form[@type="phrase"]', TEI_NS)
    if (phrase := norm_text(phrase_el)):
        return phrase, "phrase"

    variant_els = entry.findall('.//tei:form[@type="variant"]', TEI_NS)
    if (v := pick_variant_head(variant_els)):
        return v, "variant"

    sense_el = entry.find('.//tei:sense', TEI_NS)
    if (sh := head_from_sense(sense_el)):
        return sh, "sense-head"

    nval = entry.attrib.get('n')
    if (nh := head_from_n(nval)):
        return nh, "n-head"

    return None, None

# --- SenseDef fallback helpers ---


CLAUSE_SPLIT = re.compile(r'\s*(?:[.;:](?:\s|$)|\bid est\b)\s*', flags=re.I)

def merge_defs_under(s_el) -> str | None:
    if s_el is None:
        return None
    defs = s_el.findall('.//tei:def', TEI_NS)
    parts = [norm_text(d) for d in defs]
    parts = [p for p in parts if p]
    if not parts:
        return None
    return dehyphenate(' ; '.join(parts))

def lead_text_before_first_cit(s_el, first_clause_only: bool = True) -> str | None:
    """
    Get plain text occurring *before* the first <cit> in a <sense>.
    Preserves comma appositions if 'id est' is present.
    """
    if s_el is None:
        return None
    pieces = []
    if s_el.text:
        pieces.append(s_el.text)
    for child in list(s_el):
        if child.tag == f"{{{TEI_NS['tei']}}}cit":
            break
        pieces.append(''.join(child.itertext()))
        if child.tail:
            pieces.append(child.tail)
    txt = re.sub(r'\s+', ' ', ''.join(pieces)).strip()
    txt = dehyphenate(txt)
    if not txt:                          # <- guard 1
        return None

    if first_clause_only:
        if re.search(r'\bid\s+est\b', txt, flags=re.I):
            txt = re.split(r'\s*[.;:](?:\s|$)', txt, maxsplit=1)[0]
        else:
            txt = CLAUSE_SPLIT.split(txt, maxsplit=1)[0]
        txt = dehyphenate(txt)
        if not txt:                      # <- guard 2
            return None

    # Strip leading punctuation ghosts
    txt = re.sub(r'^[\s,;:.]+', '', txt).strip()
    if not txt:                          # <- guard 3
        return None
    return txt

def extract_sense_def(s_el) -> str | None:
    """Prefer <def>; else the leading prose before the first <cit>, cleaned."""
    txt = merge_defs_under(s_el)
    if not txt:
        txt = lead_text_before_first_cit(s_el, first_clause_only=True)
    if not txt:                          # <- guard 4
        return None
    txt = re.sub(r'^[\s,;:.]+', '', txt).strip()
    return txt or None

# --- build one-row-per-entry with cleaned lists ---

rows = []
for entry in entries:
    entry_id   = entry.attrib.get('n')
    entry_type = entry.attrib.get('type')
    xml_id     = entry.attrib.get('{http://www.w3.org/XML/1998/namespace}id')

    lemma_el     = entry.find('.//tei:form[@type="lemma"]', TEI_NS)
    phrase_el    = entry.find('.//tei:form[@type="phrase"]', TEI_NS)
    variant_els  = entry.findall('.//tei:form[@type="variant"]', TEI_NS)
    note_els     = entry.findall('.//tei:note', TEI_NS)
    sense_els    = entry.findall('.//tei:sense', TEI_NS)

    # primary + fallback lemma + provenance
    lemma_primary = norm_text(lemma_el)
    lemma, lemma_source = (lemma_primary, "lemma") if lemma_primary else guess_lemma(entry, TEI_NS)
    lemma = trim_trailing_punct(lemma)
    phrase   = norm_text(phrase_el)
    variants = [norm_text(v) for v in variant_els]
    variants = [v for v in variants if v]
    notes    = [norm_text(n) for n in note_els]
    notes    = [n for n in notes if n]

    senses_list = []
    for i, s in enumerate(sense_els, start=1):
        sense_def  = extract_sense_def(s)   # <-- robust definition extraction
        sense_raw  = norm_text(s)
        sense_trans = translations_under(s)
        senses_list.append({
            "index": i,
            "def": sense_def,
            "raw": sense_raw,
            "translations": sense_trans
        })

    # entry-level translations (outside <sense>)
    entry_level_trans = []
    for xp in ['./tei:cit', './tei:dictScrap/tei:cit']:
        for c in entry.findall(xp, TEI_NS):
            q = c.find('.//tei:quote', TEI_NS)
            quote = norm_text(q)
            if quote:
                entry_level_trans.append({
                    "type": c.attrib.get('type'),
                    "xml_lang": c.attrib.get('{http://www.w3.org/XML/1998/namespace}lang'),
                    "quote": quote
                })

    rows.append({
        "ID": entry_id,
        "Type": entry_type,
        "XML_ID": xml_id,
        "Lemma": lemma,                               # could be guessed
        "LemmaSource": lemma_source,                  # provenance
        "LemmaCanonical": ascii_fold(lemma),
        "HasLemma": bool(lemma),
        "Phrase": phrase,
        "Variants": variants,
        "Notes": notes,
        "Senses": senses_list,
        "SenseCount": len(senses_list),
        "SenseDef": [s["def"] for s in senses_list],
        "SenseRaw": [s["raw"] for s in senses_list],
        "Translations": [s["translations"] for s in senses_list],  # list of lists
        "HasEntryLevelTranslations": bool(entry_level_trans),
        "EntryLevelTranslations": entry_level_trans
    })

entries_df = pd.DataFrame(rows)

# Optional: stable index per-lemma if you want to flag multiple entries later
entries_df["EntryIndexPerLemma"] = (
    entries_df.groupby("Lemma", dropna=False).cumcount() + 1
)

In [79]:
entries_df["Senses"].apply(len).value_counts()

Senses
1     3132
0       25
2        2
3        2
4        2
15       1
Name: count, dtype: int64

In [80]:
from collections import OrderedDict
import pandas as pd

def _uniq_order(seq):
    seen = OrderedDict()
    for x in seq:
        if x is not None and x not in seen:
            seen[x] = True
    return list(seen.keys())

def _flatten_list_series(series):
    """Flatten a Series of lists (possibly with Nones) into a single list."""
    out = []
    for v in series.tolist():
        if isinstance(v, list):
            out.extend([x for x in v if x is not None])
        elif pd.notna(v):
            out.append(v)
    return out

def _flatten_listoflists_series(series):
    """Flatten a Series of list-of-lists into a single list."""
    out = []
    for v in series.tolist():
        if isinstance(v, list):
            for inner in v:
                if isinstance(inner, list):
                    out.extend([x for x in inner if x is not None])
                elif inner is not None:
                    out.append(inner)
        elif v is not None:
            out.append(v)
    return out

def build_lexeme_df(entries_df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse entry-level rows into lexeme-level rows (one per Lemma),
    preserving senses and metadata across split entries.
    """
    # Ensure columns exist
    must_cols = ["ID","Lemma","LemmaCanonical","Type","Variants","Notes","Senses","SenseDef","SenseRaw","Translations","EntryLevelTranslations"]
    for c in must_cols:
        if c not in entries_df.columns:
            entries_df[c] = None

    def agg_types(series):
        return _uniq_order([x for x in series.tolist() if x])

    def agg_variants(series):
        out = []
        for v in series.tolist():
            if isinstance(v, list):
                out.extend([x for x in v if x])
        return _uniq_order(out)

    def agg_notes(series):
        out = []
        for v in series.tolist():
            if isinstance(v, list):
                out.extend([x for x in v if x])
        return _uniq_order(out)

    def agg_senses(group):
        senses = []
        for _, row in group.iterrows():
            eid = row["ID"]
            for sd in (row.get("Senses") or []):
                if not isinstance(sd, dict):
                    continue
                senses.append({
                    "entry_id": eid,
                    "sense_idx": sd.get("index"),
                    "def": sd.get("def"),
                    "raw": sd.get("raw"),
                    "translations": sd.get("translations"),
                })
        return senses

    def agg_translations(group):
        out = []
        for _, row in group.iterrows():
            eid = row["ID"]
            # entry-level translations
            for t in (row.get("EntryLevelTranslations") or []):
                if isinstance(t, dict):
                    out.append({"entry_id": eid, **t})
            # per-sense translations
            for sd in (row.get("Senses") or []):
                for t in (sd.get("translations") or []):
                    if isinstance(t, dict):
                        out.append({"entry_id": eid, **t})
        return out

    rows = []
    for (lemma, lemma_canon), grp in entries_df.groupby(["Lemma","LemmaCanonical"], dropna=False, as_index=False):
        entry_ids = grp["ID"].tolist()
        senses_flat = agg_senses(grp)
        rows.append({
            "Lemma": lemma,
            "LemmaCanonical": lemma_canon,
            "entry_ids": entry_ids,
            "entry_count": len(entry_ids),
            "types": agg_types(grp["Type"]),
            "variants": agg_variants(grp["Variants"]),
            "notes": agg_notes(grp["Notes"]),
            "senses_flat": senses_flat,
            "sense_count": len(senses_flat),
            # flatten Series-of-lists safely (no boolean 'or' on Series)
            "sense_def_all": _flatten_list_series(grp["SenseDef"]),
            "sense_raw_all": _flatten_list_series(grp["SenseRaw"]),
            # flattened list of dicts (entry-level + sense-level)
            "translations_all": agg_translations(grp),
        })

    lexeme_df = pd.DataFrame(rows).sort_values(
        ["sense_count","entry_count","LemmaCanonical"],
        ascending=[False, False, True]
    ).reset_index(drop=True)

    return lexeme_df

In [81]:
lexeme_df = build_lexeme_df(entries_df)
lexeme_df.head(10)


Unnamed: 0,Lemma,LemmaCanonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_raw_all,translations_all
0,Naphtha,naphtha,[Ruland1612-Naphtha],1,[N],[],[],"[{'entry_id': 'Ruland1612-Naphtha', 'sense_idx': 1, 'def': 'id est, pir, ignis. ; id est, flatus minerae, aurichalcum ; id est, nitrum, vel natron. ; est sal est ex humiditate nebulae saepius in p...",15,"[id est, pir, ignis. ; id est, flatus minerae, aurichalcum ; id est, nitrum, vel natron. ; est sal est ex humiditate nebulae saepius in pratis supra Ein saltz auß lapides decidentis, calore solis ...","[Iudaicum bitumen diximus esse speciem Naphthae, & non naphtham ipsam. Quoniam naphtha nihil aliud est, quam petroleum, oleum illud liquidum, quod ad nos, licet corruptum, ferunt agyrtae, Dioscor....","[{'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'die Steinkolen'}, {'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Mosch..."
1,Nitrum,nitrum,"[None, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum]",9,[N],[],[],"[{'entry_id': None, 'sense_idx': 1, 'def': 'Baurach, sal petrosum, nitrum à German', 'raw': 'Baurach, sal petrosum, nitrum à German. SalpeterBergsaltz /quasi sal petrae, cal. & siccum in 2. gradu....",9,"[Baurach, sal petrosum, nitrum à German, Fossile natiuum, est salsugo quaedam ex vrinarum congregationibus in sal praeparata., Sago, Tincar, id est, sal albus, scil. baurac.]","[Baurach, sal petrosum, nitrum à German. SalpeterBergsaltz /quasi sal petrae, cal. & siccum in 2. gradu. Estque nitrum res cognata sali, & quae est species salis. Hinc sal lucidum,, Fossile natiuu...","[{'entry_id': None, 'type': 'translation', 'xml_lang': 'de', 'quote': 'SalpeterBergsaltz'}, {'entry_id': 'Ruland1612-Nitrum', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist ein Geschlecht ..."
2,Baurac,baurac,"[Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac]",8,[B],[],[],"[{'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': None, 'raw': 'wird vom Schaum deß Glases oder Nitere.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'wird vom Scha...",8,"[id est, sal gemmae., i. e. id est sapphirium lithargyrum albificatum, id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est, attinckar., id est, quodlibet genus salsuginis.]","[wird vom Schaum deß Glases oder Nitere., id est, sal gemmae., i. e. id est sapphirium lithargyrum albificatum, weiß gemacht: Glett ., id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est...","[{'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'wird vom Schaum deß Glases oder Nitere.'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang'..."
3,Magnesia,magnesia,"[Ruland1612-Magnesia, Ruland1612-Magnesia, Ruland1612-Magnesia, Ruland1612-Magnesia, Ruland1612-Magnesia]",5,[M],[],[],"[{'entry_id': 'Ruland1612-Magnesia', 'sense_idx': 1, 'def': 'id est, testudo vel sulphul.', 'raw': 'id est, testudo vel sulphul.', 'translations': []}, {'entry_id': 'Ruland1612-Magnesia', 'sense_i...",5,"[id est, testudo vel sulphul., Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist., Is...","[id est, testudo vel sulphul., Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist., Is...","[{'entry_id': 'Ruland1612-Magnesia', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius,..."
4,Aquila,aquila,"[Ruland1612-Aquila, Ruland1612-Aquila, Ruland1612-Aquila, Ruland1612-Aquila]",4,[A],[],[],"[{'entry_id': 'Ruland1612-Aquila', 'sense_idx': 1, 'def': 'quae auium regina est, vsurpatur nomine pro sale Armoniaco propter leuitatem in sublimationibus', 'raw': ', quae auium regina est, vsurpa...",4,"[quae auium regina est, vsurpatur nomine pro sale Armoniaco propter leuitatem in sublimationibus, id est, Arsenicum, vel sulphur., id est, aurum guttendo, fidelo, edel, sedalo]","[, quae auium regina est, vsurpatur nomine pro sale Armoniaco propter leuitatem in sublimationibus. Verum Paracel 46 LEXICON ALCHEMIAE MART. sus in multis accipi vult pro Mercurio praecipitato. Ei...","[{'entry_id': 'Ruland1612-Aquila', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Adler ist der Vogel Leim / Zaͤhschleim / der in der ersten'}, {'entry_id': 'Ruland1612-Aquila', 'type': 'trans..."
5,Borax,borax,"[Ruland1612-Borax, Ruland1612-Borax, Ruland1612-Borax, Ruland1612-Borax]",4,[B],[],[],"[{'entry_id': 'Ruland1612-Borax', 'sense_idx': 1, 'def': None, 'raw': 'ist chrysocolla, wie batrachium.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'ist chrysocolla, wie ...",4,"[capistrum auri,, id est, effrenitrum attinckar, vel nitrone.]","[ist chrysocolla, wie batrachium., ist ein Gummi / darmit daß Silbervund Gold consolidirt wird. Et vocatur attinckar naturale., capistrum auri, Heist Arabisch Tinckar., id est, effrenitrum attinck...","[{'entry_id': 'Ruland1612-Borax', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist chrysocolla, wie batrachium.'}, {'entry_id': 'Ruland1612-Borax', 'type': 'translation', 'xml_lang': 'de', '..."
6,Digestio,digestio,"[Ruland1612-Digestio, Ruland1612-Digestio, Ruland1612-Digestio, Ruland1612-Digestio]",4,[D],[],[],"[{'entry_id': 'Ruland1612-Digestio', 'sense_idx': 1, 'def': None, 'raw': 'heist eine Enderung eines Dings in ein anders durch Ermallung vund Kochung der Natur.', 'translations': [{'type': 'transla...",4,"[est maturatio simplex, qua in calore digestorio res inconcoctae digeruntur. Id enim est digerere, ad modum digestionis naturalis ciborum in ventriculo, competente cuiuis calore, concoquere, & dis...","[heist eine Enderung eines Dings in ein anders durch Ermallung vund Kochung der Natur., est maturatio simplex, qua in calore digestorio res inconcoctae digeruntur. Id enim est digerere, ad modum d...","[{'entry_id': 'Ruland1612-Digestio', 'type': 'translation', 'xml_lang': 'de', 'quote': 'heist eine Enderung eines Dings in ein anders durch Ermallung vund Kochung der Natur.'}]"
7,Pilum,pilum,"[Ruland1612-Pilum, Ruland1612-Pilum, Ruland1612-Pilum, Ruland1612-Pilum]",4,[P],[],[],"[{'entry_id': 'Ruland1612-Pilum', 'sense_idx': 1, 'def': None, 'raw': 'Pompenstange.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'Pompenstange.'}]}, {'entry_id': 'Ruland1...",4,"[pistillum,, tignum,, vel pili caput, quo franguntur panes aerei,]","[Pompenstange., pistillum, Stoͤsel ., tignum, Gestengt ., vel pili caput, quo franguntur panes aerei, Sufferbrecher .]","[{'entry_id': 'Ruland1612-Pilum', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Pompenstange.'}, {'entry_id': 'Ruland1612-Pilum', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Stoͤsel'},..."
8,Putrefactio,putrefactio,"[Ruland1612-Putrefactio, Ruland1612-Putrefactio, Ruland1612-Putrefactio, Ruland1612-Putrefactio]",4,[P],[],[],"[{'entry_id': 'Ruland1612-Putrefactio', 'sense_idx': 1, 'def': 'est misti resolutio per putredinem naturalem in calido humido', 'raw': 'est misti resolutio per putredinem naturalem in calido humid...",4,"[est misti resolutio per putredinem naturalem in calido humido, alia est ambigens inter calcinationem corrosiuam, & putrefactionem, vocaturque putrefactio sicca & Philosophica. Imo à quibus libet ...","[est misti resolutio per putredinem naturalem in calido humido. Humorem enim necesse est vincere terminans siccum, agente calore externo: quo facto, calor cognatus cum humido suo substantiali segr...","[{'entry_id': 'Ruland1612-Putrefactio', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Faͤulung ist wan es schwartz wird als dann stincktes auch wie ein Mist ond Aaß /ond hie geschicht die war..."
9,Sulphur,sulphur,"[Ruland1612-Sulphur, Ruland1612-Sulphur, Ruland1612-Sulphur, Ruland1612-Sulphur]",4,[S],[],[],"[{'entry_id': 'Ruland1612-Sulphur', 'sense_idx': 1, 'def': 'chibur, vel, Albusao Arabice dicitur, pars lapidis Philosophorum apud Chymistas, principium, & pater metallorum, calidum & siccum quarto...",4,"[chibur, vel, Albusao Arabice dicitur, pars lapidis Philosophorum apud Chymistas, principium, & pater metallorum, calidum & siccum quarto gradu, a Germanis, est principium formatiuum, aereum parti...","[chibur, vel, Albusao Arabice dicitur, pars lapidis Philosophorum apud Chymistas, principium, & pater metallorum, calidum & siccum quarto gradu. Estque nihil aliud, quam pinguedo terrae, per tempe...","[{'entry_id': 'Ruland1612-Sulphur', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Schwefel'}, {'entry_id': 'Ruland1612-Sulphur', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist der Saa..."


In [82]:
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

In [83]:
print(lexeme_df.sample(10))

                                     Lemma  \
1898                              Magnalia   
1337                            Ferramenta   
845                      Canales recludere   
805   Callecamenon, Calcucementum casticum   
646                                  Azoch   
1488                         GAgates lapis   
1421                     Fornacis magister   
267                                 Alcadp   
1408                                   Fom   
1300                        Eurnus simplex   

                            LemmaCanonical  \
1898                              magnalia   
1337                            ferramenta   
845                      canales recludere   
805   callecamenon, calcucementum casticum   
646                                  azoch   
1488                         gagates lapis   
1421                     fornacis magister   
267                                 alcadp   
1408                                   fom   
1300                        eurnu

In [44]:
# Entries with no literal <lemma> but with guessed Lemma
entries_df[entries_df["LemmaSource"] != "lemma"][["ID","Lemma","LemmaSource"]].head(20)

# A few of the examples you pasted
mask = entries_df["ID"].isin([
    "Ruland1612-Adibisi-vel-Adebezi-,-id-est,-testudo,-Schneck-.",
    "Ruland1612-Aes-rude-natiuum,-minus-syncerum,-Vnrein-gediegen-Kupffer-.",
    "Ruland1612-Quod-iam-suum-est,-ex-Morauia,-Rein-gediegen-Kupffer-aus-Mehren-."
])
entries_df.loc[mask, ["ID","Lemma","LemmaSource","Variants","SenseDef","SenseRaw"]]

Unnamed: 0,ID,Lemma,LemmaSource,Variants,SenseDef,SenseRaw
61,"Ruland1612-Adibisi-vel-Adebezi-,-id-est,-testu...",Adibisi,variant,"[Adibisi, vel Adebezi]","[id est, testudo]","[id est, testudo, Schneck .]"
98,"Ruland1612-Aes-rude-natiuum,-minus-syncerum,-V...",Aes rude natiuum,sense-head,[],"[Aes rude natiuum, minus syncerum,]","[Aes rude natiuum, minus syncerum, Vnrein gedi..."
99,"Ruland1612-Quod-iam-suum-est,-ex-Morauia,-Rein...",Quod iam suum est,sense-head,[],"[Quod iam suum est, ex Morauia,]","[Quod iam suum est, ex Morauia, Rein gediegen ..."


In [45]:
# for preprocessing the latin texts, we will use a module located outside of the current repository, specifically at the same level as the current project.
current_working_directory = os.getcwd()
relative_path = '../../latin-preprocessing/'  # change according to your location...
module_path = os.path.abspath(os.path.join(current_working_directory, relative_path))
if module_path not in sys.path:
    sys.path.insert(0, module_path)
# Now import the module
import tomela

In [46]:
doc = tomela.nlp("Merucurius metallorum")
lemmatized_string = " ".join([t.lemma_ for t in doc if t.pos_ not in ["PUNCT"]]).lower()
lemmatized_string

'merucurius metallum'

In [47]:
def lemmatizer(string):
    if (string is not None) and (len(string.split()) > 2):
        doc = tomela.nlp(string)
        lemmatized_list = []
        for t in doc:
            if t.pos_ not in ["PUNCT"]:
                if t.lemma_ == "":
                    lemmatized_list.append(t.text.lower())
                else:
                    lemmatized_list.append(t.lemma_.lower())
        lemmatized = " ".join(lemmatized_list)
    else:
        lemmatized = string.lower()
    if lemmatized == "":
        lemmatized = None
    return lemmatized
entries_df["relemmatized"] = entries_df["Lemma"].apply(lemmatizer)

In [52]:
print(entries_df[entries_df["Lemma"]=="Mercurius"])

                        ID Type XML_ID      Lemma LemmaSource LemmaCanonical  \
2049  Ruland1612-Mercurius    M   None  Mercurius       lemma      mercurius   
2057  Ruland1612-Mercurius    M   None  Mercurius       lemma      mercurius   
2058  Ruland1612-Mercurius    M   None  Mercurius       lemma      mercurius   

     Phrase Variants Notes                                             Senses  \
2049   None       []    []  [{'index': 1, 'def': 'id est, sulphur.', 'raw'...   
2057   None       []    []  [{'index': 1, 'def': 'est principium materiale...   
2058   None       []    []  [{'index': 1, 'def': 'argentum viuum CC', 'raw...   

                                               SenseDef  \
2049                                 [id est, sulphur.]   
2057  [est principium materiale, vaporosum, naturę a...   
2058                                [argentum viuum CC]   

                                               SenseRaw  \
2049  [id est, sulphur. Mercurius. Mercurius ist in ...   

In [63]:
print(entries_df[entries_df["Senses"].apply(len) > 1])

                         ID Type XML_ID       Lemma LemmaSource  \
292     Ruland1612-Albertus    A   None    Albertus       lemma   
353           Ruland1612-An    A   None          An       lemma   
354       Ruland1612-Anaton    A   None      Anaton       lemma   
650      Ruland1612-Bitumen    B   None     Bitumen       lemma   
934   Ruland1612-Cinnabaris    C   None  Cinnabaris       lemma   
1137   Ruland1612-Descensio    D   None   Descensio       lemma   
2151     Ruland1612-Naphtha    N   None     Naphtha       lemma   

     LemmaCanonical                   Phrase            Variants  \
292        albertus                     None                  []   
353              an                     None  [Anfir, Anfirarto]   
354          anaton                     None                  []   
650         bitumen  Bitumen sulphurea terra                  []   
934      cinnabaris                     None                  []   
1137      descensio                     None           

In [18]:
entries_df[entries_df.duplicated(subset=["target_lemma"])]

Unnamed: 0,ID,Type,XML_ID,Lemma,Phrase,Variants,Notes,Senses,SenseDef,SenseRaw,Translations,EntryLevelTranslations,target_lemma
42,Ruland1612-Adamas,A,,Adamas,,[],"[Indica non nascens in auro, cognatus colore C...","[{'index': 1, 'def': 'Demanth/Demuth nascitur ...","[Demanth/Demuth nascitur & extra aurum, & in a...","[Demanth/Demuth nascitur & extra aurum, & in a...","[[{'type': 'translation', 'xml_lang': 'de', 'q...",[],adamas
44,"Ruland1612-A-,-Ein-Diamanttaffel-.",A,,,A,[],[],"[{'index': 1, 'def': None, 'raw': ', Ein Diama...",[None],"[, Ein Diamanttaffel .]","[[{'type': 'translation', 'xml_lang': 'de', 'q...",[],
61,"Ruland1612-Adibisi-vel-Adebezi-,-id-est,-testu...",A,,,,"[Adibisi, vel Adebezi]",[],"[{'index': 1, 'def': 'id est, testudo', 'raw':...","[id est, testudo]","[id est, testudo, Schneck .]","[[{'type': 'translation', 'xml_lang': 'de', 'q...",[],
72,"Ruland1612-Aduersa-venae-pars-,-Gegengrund-.",A,,,Aduersa venae pars,[],[],"[{'index': 1, 'def': None, 'raw': 'Gegengrund'...",[None],[Gegengrund],"[[{'type': 'translation', 'xml_lang': 'de', 'q...",[],
74,"Ruland1612-Aes-seu-Cuprum-à-Germanis,-Kupffer-...",A,,,,[],[],"[{'index': 1, 'def': None, 'raw': 'Aes seu Cup...",[None],"[Aes seu Cuprum à Germanis, Kupffer / a Chymis...","[[{'type': None, 'xml_lang': 'de', 'quote': 'K...",[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3050,Ruland1612-Venam,V,,Venam,,[],[],"[{'index': 1, 'def': None, 'raw': 'qua parte a...",[None],"[qua parte abundat metallo, luto oblinire, Ver...","[[{'type': 'translation', 'xml_lang': 'de', 'q...",[],uena
3056,Ruland1612-Venus,V,,Venus,,[],[],"[{'index': 1, 'def': None, 'raw': 'der vnreine...",[None],"[der vnreine lapis, die Materi.]","[[{'type': 'translation', 'xml_lang': 'de', 'q...",[],uenus
3078,"Ruland1612-Viscus-secundae-generationis-,-est-...",V,,,Viscus secundae generationis,[],[],"[{'index': 1, 'def': 'est cruor, qui descendit...","[est cruor, qui descendit a salibus.]","[est cruor, qui descendit a salibus.]",[[]],[],
3110,Ruland1612-Vrina,V,,Vrina,,[],[],"[{'index': 1, 'def': 'vini est acetum, aut vri...","[vini est acetum, aut vrina hominis vinum perp...","[vini est acetum, aut vrina hominis vinum perp...","[[{'type': 'translation', 'xml_lang': 'de', 'q...",[],urina


In [38]:
entries_df.to_parquet("../data/ruland-dictionaries.parquet")