In [47]:
import requests
import re
import pandas as pd
import xml.etree.ElementTree as ET
import json
import os
import sys
import matplotlib.pyplot as plt
from collections import Counter


In [2]:
# Define the URL for the XML file
url = "https://raw.githubusercontent.com/sarahalang/alchemical-dictionaries/refs/heads/main/Ruland1612/Ruland.xml"

# Fetch the XML file from the URL.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of handing an
# error page to the XML parser in the next cell.
response = requests.get(url, timeout=60)
response.raise_for_status()

In [3]:
# TEI namespace mapping used by every XPath query below
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Parse the downloaded bytes; `tree` wraps the same root element
xml_content = response.content
root = ET.fromstring(xml_content)
tree = ET.ElementTree(root)


In [4]:
# Collect every TEI <entry> element found at any depth below the root
entries = root.findall('.//tei:entry', TEI_NS)


In [13]:
def _own_text(element):
    """Return the stripped text that directly opens ``element``, or None.

    ``Element.text`` only holds the text before the first child element and is
    None when the element is empty or starts with a child, so it must be
    checked before calling ``strip()``.
    """
    if element is None or element.text is None:
        return None
    return element.text.strip()


def _flattened_text(element):
    """Return all text nested inside ``element`` as one trimmed line.

    ``itertext()`` yields character data only, so child tags such as <lb/>
    are already dropped; the '<lb/>' replacement is kept solely for a literal
    (escaped) occurrence of that string in the text.
    """
    return ''.join(element.itertext()).replace('\n', ' ').replace('<lb/>', ' ').strip()


entry_list = []
# Parse each entry into a flat dictionary (one future DataFrame row)
for entry in entries:
    entry_dict = {}
    # Extract entry attributes (missing attributes become None)
    entry_dict['ID'] = entry.attrib.get('n', None)
    entry_dict['entry_xml'] = entry
    entry_dict['Type'] = entry.attrib.get('type', None)
    entry_dict['XML_ID'] = entry.attrib.get('{http://www.w3.org/XML/1998/namespace}id', None)

    # Extract lemma
    # NOTE(review): lemma, phrase, variants and quotes use only the text before
    # the first child element, so values containing e.g. <lb/> are cut off there.
    entry_dict['Lemma'] = _own_text(entry.find('.//tei:form[@type="lemma"]', TEI_NS))

    # Extract phrase (if any)
    entry_dict['Phrase'] = _own_text(entry.find('.//tei:form[@type="phrase"]', TEI_NS))

    # Extract variants (elements without leading text are skipped)
    variant_texts = map(_own_text, entry.findall('.//tei:form[@type="variant"]', TEI_NS))
    entry_dict['Variants'] = [text for text in variant_texts if text is not None]

    # Extract notes, flattening all nested text of each <note>
    entry_dict['Notes'] = [
        _flattened_text(note) for note in entry.findall('.//tei:note', TEI_NS)
    ]

    # Extract senses and definitions, keeping only non-empty ones
    entry_dict['Definitions'] = []
    for sense in entry.findall('.//tei:sense', TEI_NS):
        sense_text = _flattened_text(sense)
        if sense_text:
            entry_dict['Definitions'].append(sense_text)

    # Extract quotes (translations); elements without leading text are skipped
    quote_texts = map(_own_text, entry.findall('.//tei:quote', TEI_NS))
    entry_dict['Translations'] = [text for text in quote_texts if text is not None]

    # Append entry dictionary to the list
    entry_list.append(entry_dict)

In [14]:
# One row per dictionary entry; columns come from the keys of the per-entry dicts
entries_df = pd.DataFrame(entry_list)

In [15]:
# Preview a slice of rows from the middle of the dictionary
entries_df[1000:1020]

Unnamed: 0,ID,entry_xml,Type,XML_ID,Lemma,Phrase,Variants,Notes,Definitions,Translations
1000,Ruland1612-Coloratio-per-calorem-solum,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Coloratio per calorem solum,,[],[],"[est modus colorandi, cum e potentia in actum,...",[]
1001,Ruland1612-Coloratio-per-ablutionem,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Coloratio per ablutionem,,[],[],"[est, cum fuscedines spiritales, aliaeque sord...",[]
1002,Ruland1612-Colica,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Colica,,[],[],"[est tartarus resolutus in intestinis, morbusq...",[Bauchgrimmen die Mutter]
1003,Ruland1612-Collecta,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Collecta,,[],[],"[, symbolum, Zubuß ...",[Zubuß]
1004,Ruland1612-Collectam,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Collectam,,[],[],"[exigere a Dominis, Z...",[Zubuß fordern]
1005,Ruland1612-Combustio,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Combustio,,[],[],"[est ignitio, corpora comburendo in caleem red...",[]
1006,Ruland1612-Cometz,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Cometz,,[],[],[est gutta semis.],[]
1007,Ruland1612-Comisdi,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Comisdi,,[],[],"[id est, gummi Arabicum.]",[]
1008,Ruland1612-Comminutio,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Comminutio,,[],[],"[est, cum in minutissimas partes per collisum ...",[]
1009,Ruland1612-Complexio,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Complexio,,[],[],[est natura partis. Alias est qualitas calidi ...,[Es ist ein Eigen chafft]


In [16]:
# The Latin preprocessing module (tomela) is located outside of the current
# repository, at the same level as the current project, so its directory has
# to be added to the import path by hand.
relative_path = '../../latin-preprocessing/'  # change according to your location...
current_working_directory = os.getcwd()
module_path = os.path.abspath(os.path.join(current_working_directory, relative_path))
if module_path not in sys.path:
    # put it first so it is searched before the other entries
    sys.path.insert(0, module_path)
# With the path in place the module can be imported
import tomela

In [22]:
# Sanity check: lemmatize a short inflected phrase, dropping punctuation tokens
doc = tomela.nlp("Mercurium metallorum,")
lemmatized_string = " ".join(t.lemma_ for t in doc if t.pos_ != "PUNCT").lower()
lemmatized_string

'mercurius metallum'

In [23]:
def lemmatizer(string):
    """Lemmatize ``string`` with tomela and return the lower-cased lemmata.

    Punctuation tokens are dropped. A token whose lemma is empty falls back to
    its surface form. The lemmata are joined with single spaces.

    Returns None when ``string`` is None or when nothing is left after
    dropping punctuation.
    """
    if string is None:
        return None
    pieces = [
        (t.text if t.lemma_ == "" else t.lemma_).lower()
        for t in tomela.nlp(string)
        if t.pos_ != "PUNCT"
    ]
    lemmatized = " ".join(pieces)
    return None if lemmatized == "" else lemmatized
# Lemmatize every dictionary headword; entries without a lemma stay None
entries_df["relemmatized"] = entries_df["Lemma"].apply(lemmatizer)

In [24]:
# Rows whose Lemma contains "Mercurius"; na=False treats missing lemmas as non-matches
entries_df[entries_df["Lemma"].str.contains("Mercurius", na=False)]

Unnamed: 0,ID,entry_xml,Type,XML_ID,Lemma,Phrase,Variants,Notes,Definitions,Translations,relemmatized
2049,Ruland1612-Mercurius,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius,,[],[],"[id est, sulphur. Mercuri...",[Mercurius ist in allen Chymistischen Buͤchern...,mercurius
2051,"Ruland1612-Mercurius-crystallinus,","[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,"Mercurius crystallinus,",,[],[],[qui saepe sublimatus est in formam cry¬stalli...,[],mercurius crystallinus
2052,Ruland1612-Mercurius-corallinus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius corallinus,,[],[],[qui per oleum ouorum & aquas al...,[],mercurius corallinus
2053,Ruland1612-Mercurius-crudus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius crudus,,[],[],"[est is, qui nondum separatus est a sua matric...",[der noch in seim Ertz liget],mercurius crudus
2054,Ruland1612-Mercurius-laxus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius laxus,,[],[],[est turbith minerale.],[],mercurius laxus
2055,Ruland1612-Mercurius-argentipigmentum,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius argentipigmentum,,[],[],"[ist Schwefel victriol, Alaun Saltz-dieweils d...","[ist Schwefel victriol, Alaun]",mercurius argentipigmentus
2056,Ruland1612-Mercurius-metallorum,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius metallorum,,[],[],[Ist darauß die Natur der Coͤrper gezo¬gen wir...,[Ist darauß die Natur der Coͤrper gezo¬],mercurius metallum
2057,Ruland1612-Mercurius,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius,,[],[],"[est principium materiale, vaporosum, naturę a...",[],mercurius
2058,Ruland1612-Mercurius,"[[[], []]]",M,,Mercurius,,[],[],[argentum viuum CC. S. Z.],[],mercurius
2059,Ruland1612-Mercurius-regeneratus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius regeneratus,,[],[],[est primum ens Mercurii.],[],mercurius regenero


In [25]:
# Load the EMLAP corpus metadata (semicolon-separated CSV) and show the first rows
emlap_metadata = pd.read_csv(
    "https://raw.githubusercontent.com/CCS-ZCU/EMLAP_ETL/refs/heads/master/data/emlap_metadata.csv",
    sep=";",
)
emlap_metadata.head()

Unnamed: 0,working_title,No.,is_done,is_noscemus,if_noscemus_id,"#if is_noscemus = True, don't transcribe",AUTHORSHIP,is_one_author,#if more than 1 author skip section and choose compendium below,is_author_known,...,link,source_of_file,origin_of_copy,REFERENCES,catalogue_reference,secondary_references,general_comments,OTHER,filename,Unnamed: 64
0,"Augurello, Chrysopoeia",100001,True,True,713324.0,,,True,,True,...,https://wiki.uibk.ac.at/noscemus/Chrysopoeia,Noscemus,Unknown,,Noscemus Wiki,Soranzo 2019,The 1518 Basel version is also in Noscemus,,"Augurello,_Giovanni_Aurelio_-_Chrysopoeia__Ven...",
1,"Pseudo-Lull, Secretis",100002,True,False,,,,True,,True,...,https://www.digitale-sammlungen.de/en/view/bsb...,MDZ,MBS,,Hirsch 1950,,"There is a prior, 1514 edition of De secretis ...",,Pseudo-Lull1518_De_secretis_naturae_MDZ.pdf,
2,"Pantheus, Ars Transmutatione",100003,True,False,,,,True,,True,...,,GB,BL,,,,This book was first published in 1518 with an ...,,Pantheus1518_Ars_Transmutationis_Metallicae_BL...,
3,"Pantheus, Commentarium",100004,True,False,,,,True,,True,...,https://www.digitale-sammlungen.de/en/view/bsb...,MDZ,MSB,,,,This 1519 book is catalogued wrongly by many l...,,Pantheus1519_Commentarium_Transmutationis_Meta...,
4,"Pantheus, Voarchadumia",100005,True,False,,,,True,,True,...,,ONB,ONB,,,,Dedicated to Leonellus Marquis of Estense,,Pantheus1530_Voarchadumia_ONB.pdf,


In [1]:
# List the metadata columns (needs the metadata-loading cell to have run in this kernel)
emlap_metadata.columns

NameError: name 'emlap_metadata' is not defined

In [26]:
# test loading individual file
work_id = 100001
# named `sents_dir` rather than `dir` so the builtin dir() is not shadowed
sents_dir = "../data/sents_data/"
filename = str(work_id) + ".json"
# open with an explicit encoding so the platform default is never used
with open(os.path.join(sents_dir, filename), "r", encoding="utf-8") as f:
    sents_data = json.load(f)
# look at a fixed sample slice of morphologically annotated and lemmatized sentences
sents_data[200:210]

[[100001,
  200,
  'Aut animam dicas:',
  [['Aut', 'aut', 'CCONJ', [0, 3], [17], [12]],
   ['animam', 'anima', 'NOUN', [4, 10], [17], [12]],
   ['dicas', 'dico', 'VERB', [11, 16], [17], [12]],
   [':', ':', 'PUNCT', [16, 17], [17], [12]]]],
 [100001,
  201,
  'sed eum:',
  [['sed', 'sed', 'CCONJ', [0, 3], [17], [12]],
   ['eum', 'is', 'PRON', [4, 7], [17], [12]],
   [':', ':', 'PUNCT', [7, 8], [17], [12]]]],
 [100001,
  202,
  'qui solus utroque Participans in idem simul haec extrema reducat.',
  [['qui', 'qui', 'PRON', [0, 3], [17], [12]],
   ['solus', 'solus', 'DET', [4, 9], [17], [12]],
   ['utroque', 'uterque', 'DET', [10, 17], [17], [12]],
   ['Participans', 'participo', 'VERB', [18, 29], [17], [13]],
   ['in', 'in', 'ADP', [30, 32], [17], [13]],
   ['idem', 'idem', 'DET', [33, 37], [17], [13]],
   ['simul', 'simul', 'ADV', [38, 43], [17], [13]],
   ['haec', 'hic', 'DET', [44, 48], [17], [13]],
   ['extrema', 'extremus', 'ADJ', [49, 56], [17], [13]],
   ['reducat', 'reduco', 'VERB

In [29]:
# function to load sentence data from any EMLAP text based on its ID
def load_sentences_data(id):
    """Download and return the sentence data of one EMLAP work.

    Parameters
    ----------
    id : int or str
        Work number; the file ``<id>.json`` is fetched from the
        ``data/sents_data`` folder of the EMLAP_ETL repository.

    Returns
    -------
    The decoded JSON content of that file.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. unknown work ID).
    """
    data_dir = "data/sents_data"
    # build the file name from the argument, not from the notebook-level
    # `work_id`, otherwise every call would fetch the same work
    filename = str(id) + ".json"
    url = "https://raw.githubusercontent.com/CCS-ZCU/EMLAP_ETL/refs/heads/master/{0}/{1}".format(data_dir, filename)
    resp = requests.get(url)
    # fail on HTTP errors instead of trying to decode an error page as JSON
    resp.raise_for_status()
    sents_data = resp.json()
    return sents_data

In [30]:
# Gather the sentence records of every work listed in the metadata into one flat list
emlap_sents_data = []
for no in emlap_metadata["No."]:
    sents_data = load_sentences_data(no)
    emlap_sents_data.extend(sents_data)


In [32]:
def get_phrase_instances(phrase):
    """Find the EMLAP sentences whose lemmata contain ``phrase``.

    The phrase and the lemmata of every sentence are lower-cased and have
    ``v`` replaced by ``u`` before matching. The phrase is matched literally
    against the space-joined lemmata of a sentence and must be delimited by
    regex word boundaries on both sides.

    Parameters
    ----------
    phrase : str or None
        Space-separated lemmata to look for. None or an empty string yields
        no instances.

    Returns
    -------
    list of tuple
        One ``(sent_data[0], sent_data[1], sent_data[2])`` tuple per matching
        record of ``emlap_sents_data``, in corpus order.
    """
    phrase_instances = []
    if not phrase:
        return phrase_instances
    normalized = phrase.lower().replace("v", "u")
    # Escape the phrase so regex metacharacters in a lemma are taken literally,
    # and compile once instead of rebuilding the pattern for every sentence.
    pattern = re.compile(r"\b" + re.escape(normalized) + r"\b")
    for sent_data in emlap_sents_data:
        try:
            lemmata_string = " ".join(t[1].lower().replace("v", "u") for t in sent_data[3])
            if pattern.search(lemmata_string) is not None:
                phrase_instances.append((sent_data[0], sent_data[1], sent_data[2]))
        except (IndexError, KeyError, TypeError, AttributeError):
            # best effort: skip records that do not have the expected layout
            continue
    return phrase_instances


In [40]:
# look at all dictionary entries whose Lemma contains "Mercurius"
# (na=False treats missing lemmas as non-matches)
entries_df[entries_df["Lemma"].str.contains("Mercurius", na=False)]

Unnamed: 0,ID,entry_xml,Type,XML_ID,Lemma,Phrase,Variants,Notes,Definitions,Translations,relemmatized
2049,Ruland1612-Mercurius,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius,,[],[],"[id est, sulphur. Mercuri...",[Mercurius ist in allen Chymistischen Buͤchern...,mercurius
2051,"Ruland1612-Mercurius-crystallinus,","[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,"Mercurius crystallinus,",,[],[],[qui saepe sublimatus est in formam cry¬stalli...,[],mercurius crystallinus
2052,Ruland1612-Mercurius-corallinus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius corallinus,,[],[],[qui per oleum ouorum & aquas al...,[],mercurius corallinus
2053,Ruland1612-Mercurius-crudus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius crudus,,[],[],"[est is, qui nondum separatus est a sua matric...",[der noch in seim Ertz liget],mercurius crudus
2054,Ruland1612-Mercurius-laxus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius laxus,,[],[],[est turbith minerale.],[],mercurius laxus
2055,Ruland1612-Mercurius-argentipigmentum,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius argentipigmentum,,[],[],"[ist Schwefel victriol, Alaun Saltz-dieweils d...","[ist Schwefel victriol, Alaun]",mercurius argentipigmentus
2056,Ruland1612-Mercurius-metallorum,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius metallorum,,[],[],[Ist darauß die Natur der Coͤrper gezo¬gen wir...,[Ist darauß die Natur der Coͤrper gezo¬],mercurius metallum
2057,Ruland1612-Mercurius,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius,,[],[],"[est principium materiale, vaporosum, naturę a...",[],mercurius
2058,Ruland1612-Mercurius,"[[[], []]]",M,,Mercurius,,[],[],[argentum viuum CC. S. Z.],[],mercurius
2059,Ruland1612-Mercurius-regeneratus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",M,,Mercurius regeneratus,,[],[],[est primum ens Mercurii.],[],mercurius regenero


In [41]:
# check with a simple one word phrase and count the matching sentences
output = get_phrase_instances("aqua")
len(output)

16821

In [42]:
# look at first 10 instances; each instance is a tuple of the first three fields of a sentence record
output[:10]

[(100001,
  530,
  'argento nec minus ipsi Uiuo etiam terrae partes miscentur in omnis Prorsus aquae partes nullo discrimine iunctae.'),
 (100001, 629, 'Tantus aquae uigor est:'),
 (100001, 668, 'Acris aquae hoc etiam dirae uiolentia rodit:'),
 (100001, 682, 'haec nanque potent ia caelo Caelestis descendit aquae:'),
 (100001, 2793, 'uina permutans Aquis?'),
 (100002,
  91,
  'Que uero mutat duram materiam ad principium nature sui ad finem, quod sit apta ad genera tionem ad recipiendum uirtutes in aquis infusas fortium mineralium uirtutum, sicuti ipsam & accipit & recipit uirtutem informatiuam embrionis.'),
 (100002,
  100,
  '& illas quoque includimus in aquis grauidis ex uirtutibus minera ibus ad hunc finem:'),
 (100002,
  145,
  'PIn primo em non consideramus nisi aquas aereas, in duobus uero ultimis aereas & terreas.'),
 (100002,
  156,
  'Et cum hoc sufficit tibi modus generalis, aquarum tam uegetabilium quam mineralium ad medicinam humanam & lapidem philosophorum, & lapides precio

In [43]:
# repeat the same with a multiword phrase (all words lemmatized) and count the matching sentences
output = get_phrase_instances("mercurius metallum")
len(output)

27

In [44]:
# look at the first 10 instances of the multiword phrase
output[:10]

[(100007,
  1188,
  'Uiuum causat metalla quamuis adhuc bene differunt unum ab alio, secundum quod plus existit uiscositate terrae infectum, cum tamen sulphur simplex uiuum causans aurum & argentum non est nisi uapor calidus & siccus generatus ex purissima siccitate terrestri, in qua omnibus moribus predominatur ignis, & illud dicitur elementum cum mercurio metallorum &c. sulphur patet supra.'),
 (100010,
  404,
  'Compraehendis igitur ex supradictis rationibus, nostrum argentum uiuum, non esse argenturm uiuum, sed sal in similitudine uerae calcis communis, quia argentum uiuum, siue mercurius metallorum per calcinationem & reductionem conuertitur in sal, ut manifeste apparet per totum librum testamenti.'),
 (100010,
  504,
  'Prius diximus, per calcinationem, mercurium metallorum, conuerti in sal, Cum uero hoc sal multum terrestreitatis habeat, inde certam fixionem acquisiuit.'),
 (100011,
  3739,
  'Uiuum causat metalla, quamuis unum differt ab alio secundum quod plus existit uiscosit

In [45]:
# Search the corpus for every relemmatized headword.
# Each call loops over all of emlap_sents_data, so this cell scans the corpus once per entry.
entries_df["emlap_instances"] = entries_df["relemmatized"].apply(get_phrase_instances)

In [50]:
# Number of matching sentences per entry
entries_df["emlap_instances_N"] = entries_df["emlap_instances"].apply(len)

In [51]:
# The 20 entries with the most matching sentences.
# NOTE(review): the saved output includes an `instances_ids` column that is only
# created by a later cell, so the cells were executed out of order.
entries_df.sort_values("emlap_instances_N", ascending=False)[:20]

Unnamed: 0,ID,entry_xml,Type,XML_ID,Lemma,Phrase,Variants,Notes,Definitions,Translations,relemmatized,emlap_instances,instances_ids,emlap_instances_N
1620,Ruland1612-Hunc,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",H,,Hunc,,[],[],"[id est, Iupiter, stannum vel hucci.]",[],hic,"[(100001, 6, & Musis hanc commendauimus almis ...","[100001, 100001, 100001, 100001, 100001, 10000...",31023
396,Ruland1612-Aqua,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",A,,Aqua,,[],[1. Dieses Wasser reiniget/maschet/meitet/mach...,"[, id est, liquor, da...","[das Wasser ist auch immer darvonnen / da¬, Au...",aqua,"[(100001, 530, argento nec minus ipsi Uiuo eti...","[100001, 100001, 100001, 100001, 100001, 10000...",16821
1044,Ruland1612-Corpus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",C,,Corpus,,[],[],[Clang. Buce. Der Coͤ...,[Der Coͤrper ist ein metallisch Wesen/dar¬],corpus,"[(100001, 101, & uasto uenas sub corpore condi...","[100001, 100001, 100001, 100001, 100001, 10000...",10750
1640,Ruland1612-Ignis,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",I,,Ignis,,[],[],[Ist nach etlicher Meinung das Oel sokauff der...,[Ist nach etlicher Meinung das Oel sokauff der...,ignis,"[(100001, 127, Tu molli e gremio surgens uxori...","[100001, 100001, 100001, 100001, 100001, 10000...",10030
1641,Ruland1612-Ignis,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",I,,Ignis,,[],[],[Ignis pro lapide philos. ...,[Bernhardus sagt: Das habe er],ignis,"[(100001, 127, Tu molli e gremio surgens uxori...","[100001, 100001, 100001, 100001, 100001, 10000...",10030
2498,Ruland1612-Rebus,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",R,,Rebus,,[],[],"[vltima rerum materia, ...",[die letzte Matery aller Ding],res,"[(100001, 4, Ut rerum inuolucris tantarum euol...","[100001, 100001, 100001, 100001, 100001, 10000...",8056
545,Ruland1612-Aurum,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",A,,Aurum,,[],"[Natiuum purum, quod a natura tale est, cuius ...","[, à Germanis Goldtsa...","[Goldtsa, Goldt ist der Coͤrper / ond ferment ...",aurum,"[(100001, 76, Interea certis hominum uis ulla ...","[100001, 100001, 100001, 100001, 100001, 10000...",6636
2222,Ruland1612-Oleum,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",O,,Oleum,,[],[],[est Destillatum aut Secretum.],[],oleum,"[(100001, 2245, Nunc oleo demissa leues:), (10...","[100001, 100002, 100002, 100002, 100002, 10000...",6292
2217,Ruland1612-Oleum,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",O,,Oleum,,[],[],[heist auch das erste Wass. ridas von sanguine...,[heist auch das erste Wass. ridas von sanguine...,oleum,"[(100001, 2245, Nunc oleo demissa leues:), (10...","[100001, 100002, 100002, 100002, 100002, 10000...",6292
2216,Ruland1612-Oleum,"[[[], [<Element '{http://www.tei-c.org/ns/1.0}...",O,,Oleum,,[],[],"[id est, ignis, wi...","[wirdt außgezogen von einer trocknen Sub¬, dis...",oleum,"[(100001, 2245, Nunc oleo demissa leues:), (10...","[100001, 100002, 100002, 100002, 100002, 10000...",6292


In [46]:
# Keep only the first field of every instance tuple (sent_data[0] as stored by get_phrase_instances)
entries_df["instances_ids"] = entries_df["emlap_instances"].apply(lambda x: [ins[0] for ins in x])

In [90]:
# Export the enriched dictionary table as JSON.
# NOTE(review): the `entry_xml` column holds ElementTree Element objects; confirm
# they serialize as intended, or drop the column before exporting.
entries_df.to_json("../data/large_files/ruland-emlap.json")