In [94]:
import requests
import re
import pandas as pd
import xml.etree.ElementTree as ET
import json
import os
import sys
import matplotlib.pyplot as plt
from collections import Counter
import xml.etree.ElementTree as ET
from typing import List, Dict, Any, Optional
# Widen pandas' display limits so long text cells and wide frames are not truncated.
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = 500

In [2]:
# URL of the XML file to analyse
url = "https://raw.githubusercontent.com/sarahalang/alchemical-dictionaries/refs/heads/main/Ruland1612/Ruland.xml"

# Fetch the XML file from the URL.
# - timeout: keeps the cell from hanging indefinitely on a stalled connection.
# - raise_for_status(): turns HTTP errors (404, 5xx, ...) into an exception here,
#   instead of letting an error page reach the XML parser in the next cell.
response = requests.get(url, timeout=60)
response.raise_for_status()

In [3]:
# Parse the downloaded bytes once, then wrap the root element in an ElementTree.
xml_content = response.content
root = ET.fromstring(xml_content)
tree = ET.ElementTree(root)

# Namespace prefix map passed to every find()/findall() call in this notebook.
TEI_NS = dict(tei='http://www.tei-c.org/ns/1.0')


In [4]:
# Collect every <entry> element (TEI namespace) anywhere below the root, in document order.
entries = list(root.iterfind('.//tei:entry', TEI_NS))

In [109]:
import re
import unicodedata
import pandas as pd

# ---------------------------
# Compact cleaning & getters
# ---------------------------

# Leading run of whitespace and , ; : . characters.
P_LEADPUNCT = re.compile(r'^[\s,;:.]+')

# Hyphen + whitespace between a letter and a lowercase letter (ar- gentum -> argentum).
# NOTE(review): both lookarounds are ASCII-only, so a wrap next to a non-ASCII letter
# is not joined — confirm whether that is intended.
P_JOIN_WRAP = re.compile(r'(?<=[A-Za-z])-\s+(?=[a-z])')

# Any run of whitespace.
P_WS = re.compile(r'\s+')

# Clause boundary: '.', ';' or ':' followed by whitespace/end of string, or the words
# "id est" (case-insensitive), together with the surrounding whitespace.
P_CLAUSE = re.compile(
    r'\s*(?:[.;:](?:\s|$)|\bid est\b)\s*',
    flags=re.IGNORECASE,
)

def clean(s: str | None) -> str | None:
    """Dehyphenate, normalize whitespace, and tidy punctuation spacing.

    Parameters
    ----------
    s : str | None
        Raw text. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    str | None
        The cleaned string, or ``None`` when the input is not a string or nothing
        is left after cleaning.
    """
    if not isinstance(s, str):
        return None
    # Remove soft hyphens and '¬' wrap markers *together with* any whitespace that
    # follows them. Deleting only the marker left the line-break whitespace behind,
    # so a wrapped word stayed split in two ("ar¬ gentum" -> "ar gentum").
    s = re.sub(r'[\xad¬]\s*', '', s)
    # join likely line-wrap hyphens (ar- gentum -> argentum)
    s = P_JOIN_WRAP.sub('', s)
    # normalize whitespace early
    s = P_WS.sub(' ', s).strip()
    # punctuation tidy:
    #  - remove space before . , ; : ! ?
    s = re.sub(r'\s+([.,;:!?])', r'\1', s)
    #  - ensure a space after a comma that is directly followed by a letter
    #    (skipped when the comma is preceded by a digit)
    s = re.sub(r'(?<!\d),([^\W\d_])', r', \1', s)
    #  - fix bracket spacing
    s = re.sub(r'([(\[])\s+', r'\1', s)   # no space after opening
    s = re.sub(r'\s+([)\]])', r'\1', s)   # no space before closing
    # re-normalize spaces (the substitutions above can leave doubles or edge spaces)
    s = P_WS.sub(' ', s).strip()
    return s or None

def text(el) -> str | None:
    """Flatten element itertext and clean."""
    if el is None:
        return None
    return clean(''.join(el.itertext()))

def ascii_fold(s: str | None) -> str | None:
    if s is None:
        return None
    s_nfkd = unicodedata.normalize('NFKD', s)
    return ''.join(c for c in s_nfkd if not unicodedata.combining(c)).lower()

def translations_under(parent):
    """Collect one record per ``<cit>`` below ``parent`` that has quote text.

    For every ``<cit>`` descendant, the first ``<quote>`` descendant is flattened
    with ``text``; ``<cit>`` elements without non-empty quote text are skipped.

    Returns a list of dicts with the keys ``type`` (the ``type`` attribute),
    ``xml_lang`` (the ``xml:lang`` attribute) and ``quote`` (the cleaned text).
    """
    records = []
    for cit in parent.findall('.//tei:cit', TEI_NS):
        quote_text = text(cit.find('.//tei:quote', TEI_NS))
        if not quote_text:
            continue
        records.append({
            "type": cit.attrib.get('type'),
            "xml_lang": cit.attrib.get('{http://www.w3.org/XML/1998/namespace}lang'),
            "quote": quote_text,
        })
    return records

# ---------------------------
# Lemma guessers (compact)
# ---------------------------

# Captures everything after the first dash of an id such as "Ruland1612-Naphtha".
ID_HEAD_RE = re.compile(r'^[^-]+-(.+)$')

# Candidate heads (compared lower-cased) that must not be accepted as a lemma.
# NOTE(review): pick_variant_head() cuts its candidate at the first '.', so the
# "i.e." token can never be equal to a candidate there — confirm whether that is intended.
STOP_LEMMA_TOKENS = {
    "vel",
    "id est",
    "idest",
    "i.e.",
    "sive",
}

def head_from_n(nval: str | None) -> str | None:
    if not nval: return None
    m = ID_HEAD_RE.match(nval)
    if not m: return None
    head = m.group(1).split(',', 1)[0].replace('-', ' ')
    return clean(head)

def head_from_sense(s_el) -> str | None:
    """Return the leading chunk of a ``<sense>`` element's flattened text.

    The chunk ends at the first comma, semicolon, period or (case-insensitive)
    "id est". Returns ``None`` when the element has no text.
    """
    flat = text(s_el)
    if not flat:
        return None
    boundary = re.compile(r'\s*(?:,|;|\.|\bid est\b)\s*', flags=re.I)
    leading = boundary.split(flat, maxsplit=1)[0]
    return clean(leading)

def pick_variant_head(variant_els):
    """Return the first usable head among the given variant ``<form>`` elements.

    Each element's text is cut at the first comma, semicolon or period; the first
    non-empty piece whose lower-cased form is not in ``STOP_LEMMA_TOKENS`` is
    returned. Returns ``None`` when no element yields one.
    """
    boundary = re.compile(r'\s*(?:,|;|\.)\s*')
    for variant in variant_els:
        flat = text(variant)
        if flat:
            candidate = boundary.split(flat, maxsplit=1)[0].strip()
            if candidate and candidate.lower() not in STOP_LEMMA_TOKENS:
                return candidate
    return None

def guess_lemma(entry):
    """Choose a headword for ``entry`` and report which source supplied it.

    Sources are tried in this order; the first non-empty result wins:

    1. ``<form type="lemma">``      -> ``"lemma"``
    2. ``<form type="phrase">``     -> ``"phrase"``
    3. ``<form type="variant">``    -> ``"variant"`` (via ``pick_variant_head``)
    4. first ``<sense>``            -> ``"sense-head"`` (via ``head_from_sense``)
    5. the entry's ``n`` attribute  -> ``"n-head"`` (via ``head_from_n``)

    Returns a ``(lemma, source)`` tuple, or ``(None, None)`` when every source is empty.
    """
    lemma = text(entry.find('.//tei:form[@type="lemma"]', TEI_NS))
    if lemma:
        return lemma, "lemma"

    phrase = text(entry.find('.//tei:form[@type="phrase"]', TEI_NS))
    if phrase:
        return phrase, "phrase"

    variant_head = pick_variant_head(entry.findall('.//tei:form[@type="variant"]', TEI_NS))
    if variant_head:
        return variant_head, "variant"

    sense_head = head_from_sense(entry.find('.//tei:sense', TEI_NS))
    if sense_head:
        return sense_head, "sense-head"

    n_head = head_from_n(entry.attrib.get('n'))
    if n_head:
        return n_head, "n-head"

    return None, None

# ---------------------------
# Sense definition extraction
# ---------------------------

def merge_defs_under(s_el) -> str | None:
    defs = s_el.findall('.//tei:def', TEI_NS) if s_el is not None else []
    parts = [text(d) for d in defs]
    parts = [p for p in parts if p]
    if not parts:
        return None
    return clean(' ; '.join(parts))

def prose_before_first_cit(s_el) -> str | None:
    """Plain text before first <cit>; keep comma appositions when 'id est' is present."""
    if s_el is None:
        return None
    pieces = []
    if s_el.text: pieces.append(s_el.text)
    for child in list(s_el):
        if child.tag == f"{{{TEI_NS['tei']}}}cit":
            break
        pieces.append(''.join(child.itertext()))
        if child.tail:
            pieces.append(child.tail)
    s = clean(''.join(pieces))
    if not s:
        return None
    # If 'id est' present, only cut at sentence-ending punct (.;:)
    if re.search(r'\bid\s+est\b', s, flags=re.I):
        s = re.split(r'\s*[.;:](?:\s|$)', s, maxsplit=1)[0]
    else:
        s = P_CLAUSE.split(s, maxsplit=1)[0]
    s = clean(s)
    if not s:
        return None
    s = P_LEADPUNCT.sub('', s).strip()
    return s or None

def extract_sense_def(s_el) -> str | None:
    """Return the definition text of a ``<sense>`` element, or ``None``.

    Prefers the merged ``<def>`` texts; falls back to the prose that precedes the
    first ``<cit>`` when there is no ``<def>`` text.
    """
    merged = merge_defs_under(s_el)
    if merged:
        return merged
    return prose_before_first_cit(s_el) or None

# ---------------------------
# Build entries_df (1 row per TEI <entry>)
# ---------------------------

def build_entries_df(entries):
    """Build a DataFrame with one row per TEI ``<entry>`` element.

    Parameters
    ----------
    entries : iterable of xml.etree.ElementTree.Element
        The ``<entry>`` elements to flatten. May be empty.

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns ID, Type, XML_ID, Lemma, LemmaSource,
        LemmaCanonical, Phrase, Variants, Notes, Senses, SenseDef, SenseRaw,
        Translations, EntryLevelTranslations, SenseCount and EntryIndexPerLemma
        (1-based running number of the entry among the entries sharing its Lemma,
        missing lemmas forming one group). For empty input, an empty frame with
        these columns is returned.
    """
    xml_ns = '{http://www.w3.org/XML/1998/namespace}'
    columns = [
        "ID", "Type", "XML_ID", "Lemma", "LemmaSource", "LemmaCanonical", "Phrase",
        "Variants", "Notes", "Senses", "SenseDef", "SenseRaw", "Translations",
        "EntryLevelTranslations", "SenseCount",
    ]

    rows = []
    for entry in entries:
        # guess_lemma() tries <form type="lemma"> first, so no separate lookup is needed.
        lemma, lemma_src = guess_lemma(entry)
        # trim trivial trailing punctuation on the lemma only
        if lemma:
            lemma = re.sub(r'[\s,;:.]+$', '', lemma).strip() or None

        variants = [t for t in (text(v) for v in entry.findall('.//tei:form[@type="variant"]', TEI_NS)) if t]
        notes    = [t for t in (text(n) for n in entry.findall('.//tei:note', TEI_NS)) if t]

        # NOTE(review): './/tei:sense' also matches nested senses, so a sub-sense would
        # appear both inside its parent's "raw" text and as its own item — confirm
        # whether the source nests <sense> elements.
        senses_list = [
            {
                "index": i,
                "def": extract_sense_def(s),
                "raw": text(s),
                "translations": translations_under(s),
            }
            for i, s in enumerate(entry.findall('.//tei:sense', TEI_NS), start=1)
        ]

        # <cit> elements that hang directly off the entry (or off a direct <dictScrap>).
        entry_level_trans = []
        for xp in ('./tei:cit', './tei:dictScrap/tei:cit'):
            for c in entry.findall(xp, TEI_NS):
                q = text(c.find('.//tei:quote', TEI_NS))
                if q:
                    entry_level_trans.append({
                        "type": c.attrib.get('type'),
                        "xml_lang": c.attrib.get(xml_ns + 'lang'),
                        "quote": q,
                    })

        rows.append({
            "ID": entry.attrib.get('n'),
            "Type": entry.attrib.get('type'),
            "XML_ID": entry.attrib.get(xml_ns + 'id'),
            "Lemma": lemma,
            "LemmaSource": lemma_src,
            "LemmaCanonical": ascii_fold(lemma),
            "Phrase": text(entry.find('.//tei:form[@type="phrase"]', TEI_NS)),
            "Variants": variants,
            "Notes": notes,
            "Senses": senses_list,
            "SenseDef": [s["def"] for s in senses_list],
            "SenseRaw": [s["raw"] for s in senses_list],
            "Translations": [s["translations"] for s in senses_list],  # list-of-lists
            "EntryLevelTranslations": entry_level_trans,
            "SenseCount": len(senses_list),
        })

    # Passing the column list keeps the schema stable when there are no rows;
    # a column-less frame would make the groupby below fail with a KeyError.
    df = pd.DataFrame(rows, columns=columns)
    if df.empty:
        df["EntryIndexPerLemma"] = pd.Series(dtype="int64")
        return df
    df["EntryIndexPerLemma"] = df.groupby("Lemma", dropna=False).cumcount() + 1
    return df

In [110]:
# Flatten every <entry> element into one DataFrame row.
entries_df = build_entries_df(entries)

In [111]:

# ---------------------------
# Build lexeme_df (1 row per lemma)
# ---------------------------

def _uniq(seq):
    seen = set(); out=[]
    for x in seq:
        if x is not None and x not in seen:
            seen.add(x); out.append(x)
    return out

def build_lexeme_df(entries_df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the per-entry frame into one row per (Lemma, LemmaCanonical) pair.

    Parameters
    ----------
    entries_df : pandas.DataFrame
        Frame with (at least) the columns ID, Type, Lemma, LemmaCanonical, Variants,
        Notes, Senses, SenseDef, SenseRaw and EntryLevelTranslations. Rows with a
        missing lemma are kept and grouped together.

    Returns
    -------
    pandas.DataFrame
        Columns Lemma, LemmaCanonical, entry_ids, entry_count, types, variants,
        notes, senses_flat, sense_count, sense_def_all, sense_raw_all and
        translations_all, sorted by sense_count (descending), entry_count
        (descending) and LemmaCanonical (ascending). When there are no groups, an
        empty frame with these columns is returned.
    """
    columns = [
        "Lemma", "LemmaCanonical", "entry_ids", "entry_count", "types", "variants",
        "notes", "senses_flat", "sense_count", "sense_def_all", "sense_raw_all",
        "translations_all",
    ]
    g = entries_df.groupby(["Lemma", "LemmaCanonical"], dropna=False)

    rows = []
    for (lemma, lemma_canon), grp in g:
        entry_ids = grp["ID"].tolist()

        # Walk the group's rows once, tagging every sense and translation with its entry id.
        senses_flat = []
        entry_translations = []
        sense_translations = []
        for _, r in grp.iterrows():
            entry_id = r["ID"]
            for t in (r["EntryLevelTranslations"] or []):
                if isinstance(t, dict):
                    entry_translations.append({"entry_id": entry_id, **t})
            for s in (r["Senses"] or []):
                senses_flat.append({
                    "entry_id": entry_id,
                    "sense_idx": s.get("index"),
                    "def": s.get("def"),
                    "raw": s.get("raw"),
                    "translations": s.get("translations"),
                })
                for t in (s.get("translations") or []):
                    if isinstance(t, dict):
                        sense_translations.append({"entry_id": entry_id, **t})

        rows.append({
            "Lemma": lemma,
            "LemmaCanonical": lemma_canon,
            "entry_ids": entry_ids,
            "entry_count": len(entry_ids),
            "types": _uniq(grp["Type"].tolist()),
            "variants": _uniq([v for lst in grp["Variants"].tolist() if isinstance(lst, list) for v in lst]),
            "notes": _uniq([n for lst in grp["Notes"].tolist() if isinstance(lst, list) for n in lst]),
            "senses_flat": senses_flat,
            "sense_count": len(senses_flat),
            "sense_def_all": [x for lst in grp["SenseDef"].tolist() if isinstance(lst, list) for x in lst if x],
            "sense_raw_all": [x for lst in grp["SenseRaw"].tolist() if isinstance(lst, list) for x in lst if x],
            # entry-level translations of all entries first, then the sense-level ones
            "translations_all": entry_translations + sense_translations,
        })

    # Passing the column list keeps the schema stable when there are no groups;
    # a column-less frame would make sort_values fail with a KeyError.
    return pd.DataFrame(rows, columns=columns).sort_values(
        ["sense_count", "entry_count", "LemmaCanonical"], ascending=[False, False, True]
    ).reset_index(drop=True)


In [118]:
# Collapse the per-entry rows into one row per lemma, then preview the first five rows
# (build_lexeme_df sorts by sense_count, descending, so these have the most senses).
lexeme_df  = build_lexeme_df(entries_df)
lexeme_df.head(5)

Unnamed: 0,Lemma,LemmaCanonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_raw_all,translations_all
0,Naphtha,naphtha,[Ruland1612-Naphtha],1,[N],[],[],"[{'entry_id': 'Ruland1612-Naphtha', 'sense_idx': 1, 'def': 'id est, pir, ignis.; id est, flatus minerae, aurichalcum; id est, nitrum, vel natron.; est sal est ex humiditate nebulae saepius in pratis supra Ein saltz auß lapides decidentis, calore solis induratum. Feuchtigkeit deß Nebelo sich setzent vnd von der Sonnen erhartend.; sunt medicamenta mortem pellentia, & vitam conseruantia, Necrolica.; est ars illicita, quę cum mortuis operabatur olim, vt cum astra manifestabantur apud mortuos.; e...",15,"[id est, pir, ignis.; id est, flatus minerae, aurichalcum; id est, nitrum, vel natron.; est sal est ex humiditate nebulae saepius in pratis supra Ein saltz auß lapides decidentis, calore solis induratum. Feuchtigkeit deß Nebelo sich setzent vnd von der Sonnen erhartend.; sunt medicamenta mortem pellentia, & vitam conseruantia, Necrolica.; est ars illicita, quę cum mortuis operabatur olim, vt cum astra manifestabantur apud mortuos.; est pellicula, vel oculis, vel auriculis, infantum adnascens...","[Iudaicum bitumen diximus esse speciem Naphthae, & non naphtham ipsam. Quoniam naphtha nihil aliud est, quam petroleum, oleum illud liquidum, quod ad nos, licet corruptum, ferunt agyrtae, Dioscor. lib. I. cap. 84. naphtham a Babyloniis vocari dicit bituminis colamen, & esse seu inueniri candidam & nigram. Naphtha candida est petroleum nostrum. Nigra, quae est forsitan illa Amiani pisca & glutinosa, bitumme persimilis, quae flagrans nulla alia re exstingui potest, quam puluere iniecto. 
Sicuti...","[{'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'die Steinkolen'}, {'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Mosch.'}, {'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'was Schmertzen stillet vnd onempfindtlich macht'}, {'entry_id': 'Ruland1612-Naphtha', 'type': 'translation', 'xml_lang': 'de', 'quote': 'also genandt.'}, {'entry_id': 'Ruland1612-Naphtha', 'type': 'translation..."
1,Nitrum,nitrum,"[None, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum, Ruland1612-Nitrum]",9,[N],[],[],"[{'entry_id': None, 'sense_idx': 1, 'def': 'Baurach, sal petrosum, nitrum à German', 'raw': 'Baurach, sal petrosum, nitrum à German. SalpeterBergsaltz /quasi sal petrae, cal. & siccum in 2. gradu. Estque nitrum res cognata sali, & quae est species salis. Hinc sal lucidum,', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'SalpeterBergsaltz'}]}, {'entry_id': 'Ruland1612-Nitrum', 'sense_idx': 1, 'def': 'Fossile natiuum', 'raw': 'Fossile natiuum. Huius species similes sunt f...",9,"[Baurach, sal petrosum, nitrum à German, Fossile natiuum, est salsugo quaedam ex vrinarum congregationibus in sal praeparata., Sago, Tincar, id est, sal albus, scil. baurac.]","[Baurach, sal petrosum, nitrum à German. SalpeterBergsaltz /quasi sal petrae, cal. & siccum in 2. gradu. Estque nitrum res cognata sali, & quae est species salis. Hinc sal lucidum,, Fossile natiuum. Huius species similes sunt fali fossili. Sunt aut burach. Artificiale fastitiu, ficuti ipse flos parietis, seu petrae, vel flos falis. Vide denillo etiam Gal. Coquitur hoc ex sale vel aqua falis. 1. Armenum, ab armenta patria, & optimum. 2. Ab burach. album simile omnino fali fossili, saporem hab...","[{'entry_id': None, 'type': 'translation', 'xml_lang': 'de', 'quote': 'SalpeterBergsaltz'}, {'entry_id': 'Ruland1612-Nitrum', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist ein Geschlecht Saltzes auch ist es borax, vnd etli herley Geschlecht armeni, etliches schwartz reiliches roth etliches saphirinum.'}, {'entry_id': 'Ruland1612-Nitrum', 'type': 'translation', 'xml_lang': 'de', 'quote': 'vnd etli herley Geschlecht armeni, etliches schwartz reiliches roth etliches'}, {'entry_id': 'Ru..."
2,Baurac,baurac,"[Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac, Ruland1612-Baurac]",8,[B],[],[],"[{'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': None, 'raw': 'wird vom Schaum deß Glases oder Nitere.', 'translations': [{'type': 'translation', 'xml_lang': 'de', 'quote': 'wird vom Schaum deß Glases oder Nitere.'}]}, {'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': 'id est, sal gemmae.', 'raw': 'id est, sal gemmae.', 'translations': []}, {'entry_id': 'Ruland1612-Baurac', 'sense_idx': 1, 'def': 'i. e. id est sapphirium lithargyrum albificatum', 'raw': 'i. e. id est sapphiri...",8,"[id est, sal gemmae., i. e. id est sapphirium lithargyrum albificatum, id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est, attinckar., id est, quodlibet genus salsuginis.]","[wird vom Schaum deß Glases oder Nitere., id est, sal gemmae., i. e. id est sapphirium lithargyrum albificatum, weiß gemacht: Glett., id est, sal vitri; fex vitri, spuma vitri, fel vitri., id est, attinckar., id est, quodlibet genus salsuginis., ist Pflaster damit die Fugen vinbstrichen werden / als deß Golds mit Eyerclar und Mehl, ist auch gesaltzen nitrum Armenisch und schwefelisch]","[{'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'wird vom Schaum deß Glases oder Nitere.'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'weiß gemacht: Glett'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': 'ist Pflaster damit die Fugen vinbstrichen werden / als deß Golds mit Eyerclar und Mehl'}, {'entry_id': 'Ruland1612-Baurac', 'type': 'translation', 'xml_lang': 'de', 'quote': ..."
3,Magnesia,magnesia,"[Ruland1612-Magnesia, Ruland1612-Magnesia, Ruland1612-Magnesia, Ruland1612-Magnesia, Ruland1612-Magnesia]",5,[M],[],[],"[{'entry_id': 'Ruland1612-Magnesia', 'sense_idx': 1, 'def': 'id est, testudo vel sulphul.', 'raw': 'id est, testudo vel sulphul.', 'translations': []}, {'entry_id': 'Ruland1612-Magnesia', 'sense_idx': 1, 'def': 'Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist.', 'raw': 'Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mis...",5,"[id est, testudo vel sulphul., Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist., Ist ein Stein in der Krafft deß Marcasitae, oder es ist ein Stein dem haematiti gleich., id est, foemina.]","[id est, testudo vel sulphul., Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist., Ist ein Stein in der Krafft deß Marcasitae, oder es ist ein Stein dem haematiti gleich., id est, foemina., Wißmath oder taub Ertz.]","[{'entry_id': 'Ruland1612-Magnesia', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Ist das gem schte Wasser im Lufft congelirt, daß dem Fewer widerstehet die Erde deß Steine sonser mercurius, mistio substantiarum, daß ganz darinn mercurius ist.'}, {'entry_id': 'Ruland1612-Magnesia', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Ist ein Stein in der Krafft deß Marcasitae, oder es ist ein Stein dem haematiti gleich.'}, {'entry_id': 'Ruland1612-Magnesia', 'type': 'translation', 'xml_l..."
4,Aquila,aquila,"[Ruland1612-Aquila, Ruland1612-Aquila, Ruland1612-Aquila, Ruland1612-Aquila]",4,[A],[],[],"[{'entry_id': 'Ruland1612-Aquila', 'sense_idx': 1, 'def': 'quae auium regina est, vsurpatur nomine pro sale Armoniaco propter leuitatem in sublimationibus', 'raw': ', quae auium regina est, vsurpatur nomine pro sale Armoniaco propter leuitatem in sublimationibus. Verum Paracel 46 LEXICON ALCHEMIAE MART. sus in multis accipi vult pro Mercurio praecipitato. Ein Adier wirdt in der Kunst fuͤr Salmiax vnd praecipitat gebraucht.', 'translations': []}, {'entry_id': 'Ruland1612-Aquila', 'sense_idx':...",4,"[quae auium regina est, vsurpatur nomine pro sale Armoniaco propter leuitatem in sublimationibus, id est, Arsenicum, vel sulphur., id est, aurum guttendo, fidelo, edel, sedalo]","[, quae auium regina est, vsurpatur nomine pro sale Armoniaco propter leuitatem in sublimationibus. Verum Paracel 46 LEXICON ALCHEMIAE MART. sus in multis accipi vult pro Mercurio praecipitato. Ein Adier wirdt in der Kunst fuͤr Salmiax vnd praecipitat gebraucht., , Adler ist der Vogel Leim / Zaͤhschleim / der in der ersten coagulation r otlecht erscheinet vund wirdt funden in Istria. Ist auch der Geist der in Erden verwandelt das ist Mercurii spiritus, der Stein selbs. Die Turba sagt fol. 73...","[{'entry_id': 'Ruland1612-Aquila', 'type': 'translation', 'xml_lang': 'de', 'quote': 'Adler ist der Vogel Leim / Zaͤhschleim / der in der ersten'}, {'entry_id': 'Ruland1612-Aquila', 'type': 'translation', 'xml_lang': 'de', 'quote': 'otlecht erscheinet vund wirdt funden in Istria. Ist auch der Geist der in Erden verwandelt das ist'}, {'entry_id': 'Ruland1612-Aquila', 'type': 'translation', 'xml_lang': 'de', 'quote': 'der Stein selbs. Die Turba sagt fol. 73. Eines ieden Dinges volkommenheit is..."


In [115]:
# Seeded (reproducible) sample of ten lemmas that have more than one sense.
print(lexeme_df[lexeme_df["sense_count"]>1].sample(10, random_state=0))

           Lemma LemmaCanonical  \
8    Putrefactio    putrefactio   
59     Dragantum      dragantum   
92      Metallum       metallum   
104      Plumbum        plumbum   
97         Muria          muria   
30     Pompholix      pompholix   
44       Asseres        asseres   
33        Rutrum         rutrum   
63     Extractio      extractio   
26     Mercurius      mercurius   

                                                                                            entry_ids  \
8    [Ruland1612-Putrefactio, Ruland1612-Putrefactio, Ruland1612-Putrefactio, Ruland1612-Putrefactio]   
59                                                       [Ruland1612-Dragantum, Ruland1612-Dragantum]   
92                                                         [Ruland1612-Metallum, Ruland1612-Metallum]   
104                                                          [Ruland1612-Plumbum, Ruland1612-Plumbum]   
97                                                               [Ruland1612-Muria, Rulan

In [116]:
# Number of rows in lexeme_df (one per Lemma / LemmaCanonical pair).
len(lexeme_df)

2985

In [119]:
# How many lemmas have 0, 1, 2, ... senses.
lexeme_df["sense_count"].value_counts()

sense_count
1     2831
2       91
3       25
0       24
4       10
15       1
5        1
9        1
8        1
Name: count, dtype: int64

In [82]:
# NOTE(review): these two options are already set to the same values in the first
# (imports) cell; this cell only matters if they were changed in between.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

In [83]:
# Seeded sample so a re-run shows the same ten lemmas (an unseeded sample() differs
# on every run, so the output stored in the notebook could never be reproduced).
print(lexeme_df.sample(10, random_state=0))

                                     Lemma  \
1898                              Magnalia   
1337                            Ferramenta   
845                      Canales recludere   
805   Callecamenon, Calcucementum casticum   
646                                  Azoch   
1488                         GAgates lapis   
1421                     Fornacis magister   
267                                 Alcadp   
1408                                   Fom   
1300                        Eurnus simplex   

                            LemmaCanonical  \
1898                              magnalia   
1337                            ferramenta   
845                      canales recludere   
805   callecamenon, calcucementum casticum   
646                                  azoch   
1488                         gagates lapis   
1421                     fornacis magister   
267                                 alcadp   
1408                                   fom   
1300                        eurnu

In [120]:
# The Latin preprocessing module (tomela) is not part of this repository. It is
# expected in a directory reached through the relative path below, resolved against
# the current working directory of the kernel.
current_working_directory = os.getcwd()
relative_path = '../../latin-preprocessing/'  # adjust to where the module lives on your machine
module_path = os.path.abspath(os.path.join(current_working_directory, relative_path))
# Put the directory at the front of the import search path (once), so that
# `import tomela` below can find it.
if module_path not in sys.path:
    sys.path.insert(0, module_path)
# Now import the module
import tomela

In [121]:
# Smoke test: lemmatize a two-word phrase, skip punctuation tokens, lower-case the result.
doc = tomela.nlp("Merucurius metallorum")
lemmatized_string = " ".join(t.lemma_ for t in doc if t.pos_ != "PUNCT").lower()
lemmatized_string

'merucurius metallum'

In [122]:
def lemmatizer(string):
    """Return a lower-cased, lemmatized form of ``string``.

    Parameters
    ----------
    string : str | None
        A headword or phrase. Anything that is not a ``str`` (``None``, NaN, ...)
        is treated as missing.

    Returns
    -------
    str | None
        * ``None`` for a missing value or when the result is the empty string;
        * for strings of more than two whitespace-separated words: the lower-cased
          lemmas of all non-punctuation tokens from ``tomela.nlp``, joined by single
          spaces (a token with an empty lemma contributes its own text);
        * for shorter strings: the string lower-cased, not lemmatized.
    """
    # Missing values used to crash here: None reached `string.lower()` in the else
    # branch, and NaN passed the `is not None` check and failed on `.split()`.
    if not isinstance(string, str):
        return None

    # NOTE(review): only strings with MORE than two words are lemmatized, so a
    # two-word phrase is merely lower-cased — confirm the threshold is intended.
    if len(string.split()) > 2:
        doc = tomela.nlp(string)
        lemmatized = " ".join(
            (t.lemma_ if t.lemma_ != "" else t.text).lower()
            for t in doc
            if t.pos_ not in ["PUNCT"]
        )
    else:
        lemmatized = string.lower()

    return None if lemmatized == "" else lemmatized
# Add the lemmatized headword as a new column (this modifies lexeme_df in place).
lexeme_df["relemmatized"] = lexeme_df["Lemma"].apply(lemmatizer)

In [125]:
# Save the lexeme table as Parquet; the path is relative to the kernel's working directory.
lexeme_df.to_parquet("../data/ruland-dictionaries.parquet")

In [126]:
# Also save a CSV copy. The row index is written as an unnamed first column.
# NOTE(review): the list/dict-valued columns are presumably written as their string
# representation, so they would need parsing after reading the CSV back — confirm.
lexeme_df.to_csv("../data/ruland-dictionaries.csv")