In [2]:
import pandas as pd
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors
import re
import os
import shutil
import sddk
import pickle
import json
import google_conf

In [3]:
emlap_catalogue = google_conf.setup(
    sheet_url="https://docs.google.com/spreadsheets/d/1bkHHTYc86K2IuEXqfYfkDNt5LovtvCU3gvqHIbVio88/edit?usp=sharing",
    service_account_path="../../../ServiceAccountsKey.json")
# Get the data and transpose
emlap_metadata_raw = google_conf.get_as_dataframe(emlap_catalogue.worksheet("Copy_of_Catalogue_08_04_2025"), row=1,
                                                  include_index=False)
emlap_metadata_raw.set_index("author_working", inplace=True)
emlap_metadata_raw.index.name = None
emlap_metadata = emlap_metadata_raw.T
emlap_metadata.index = emlap_metadata.index.astype(str)
emlap_metadata.head(5)

Unnamed: 0,No.,is_done,is_noscemus,if_noscemus_id,"#if is_noscemus = True, don't transcribe",AUTHORSHIP,is_one_author,#if more than 1 author skip section and choose compendium below,is_author_known,author_name,...,link,source_of_file,origin_of_copy,REFERENCES,catalogue_reference,secondary_references,general_comments,OTHER,filename,NaN
"Augurello, Chrysopoeia",100001,True,True,713324.0,,,True,,True,"Augurelli, Giovanni Aurelio",...,https://wiki.uibk.ac.at/noscemus/Chrysopoeia,Noscemus,Unknown,,Noscemus Wiki,Soranzo 2019,The 1518 Basel version is also in Noscemus,,"Augurello,_Giovanni_Aurelio_-_Chrysopoeia__Ven...",
"Pseudo-Lull, Secretis",100002,True,False,,,,True,,True,Pseudo-Lull,...,https://www.digitale-sammlungen.de/en/view/bsb...,MDZ,MBS,,Hirsch 1950,,"There is a prior, 1514 edition of De secretis ...",,Pseudo-Lull1518_De_secretis_naturae_MDZ.pdf,
"Pantheus, Ars Transmutatione",100003,True,False,,,,True,,True,"Panteo, Giovanni Agostino",...,,GB,BL,,,,This book was first published in 1518 with an ...,,Pantheus1518_Ars_Transmutationis_Metallicae_BL...,
"Pantheus, Commentarium",100004,True,False,,,,True,,True,"Panteo, Giovanni Agostino",...,https://www.digitale-sammlungen.de/en/view/bsb...,MDZ,MSB,,,,This 1519 book is catalogued wrongly by many l...,,Pantheus1519_Commentarium_Transmutationis_Meta...,
"Pantheus, Voarchadumia",100005,True,False,,,,True,,True,"Panteo, Giovanni Agostino",...,,ONB,ONB,,,,Dedicated to Leonellus Marquis of Estense,,Pantheus1530_Voarchadumia_ONB.pdf,


In [4]:
ids = emlap_metadata["No."]

In [5]:
filtered_vocab_df = pd.read_json("../data/filtered_vocab_df.json")
filtered_vocab_df.head(5)

Unnamed: 0,word,1501-1550,1551-1600,1601-1650,1651-1700,mean,in_lila_embeddings,in_lasla,in_operamaiora,transl
3299,dico,43255,122549,42457,72224,70121.25,True,True,True,"say, call, tell"
960,pars,33225,93059,44337,52385,55751.5,True,True,True,part
3871,possum,25029,82700,38391,76841,55740.25,True,True,True,"be able, can"
6039,habeo,31690,89199,39443,59770,55025.5,True,True,True,"have, hold, possess, consider, think"
2898,facio,31814,98524,36944,51438,54680.0,True,True,True,"do, make, handle"


In [6]:
# load emlap vocabulary data
with open('../../EMLAP_ETL/data/emlap_wordcounts_tups_v3-0.pkl', 'rb') as f:
    emlap_wordcounts_tups = pickle.load(f)

In [7]:
emlap_wordcounts_tups[:10]

[('aqua', 20126),
 ('facio', 16068),
 ('possum', 14366),
 ('dico', 14098),
 ('natura', 13101),
 ('corpus', 12498),
 ('ignis', 11634),
 ('habeo', 9810),
 ('res', 8935),
 ('pars', 8375)]

In [8]:
# Convert the tuples to a DataFrame for merging
emlap_df = pd.DataFrame(emlap_wordcounts_tups, columns=['word', 'emlap'])

In [9]:
# Merge the dataframes, combining filtered_vocab_df with emlap_df
# Use outer join to include words from both filtered_vocab_df and emlap_df
filtered_vocab_df = pd.merge(filtered_vocab_df, emlap_df, on='word', how='outer')

# Optional: Fill NaN values with 0 or another default value in 'emlap'
#filtered_vocab_df['emlap'] = filtered_vocab_df['emlap'].fillna(0)
filtered_vocab_df = filtered_vocab_df.fillna(0)

# View the updated dataframe
filtered_vocab_df.columns

Index(['word', '1501-1550', '1551-1600', '1601-1650', '1651-1700', 'mean',
       'in_lila_embeddings', 'in_lasla', 'in_operamaiora', 'transl', 'emlap'],
      dtype='object')

In [10]:
filtered_vocab_df["noscemus_sum"] = filtered_vocab_df["1501-1550"] + filtered_vocab_df["1551-1600"] + filtered_vocab_df["1601-1650"] + filtered_vocab_df["1651-1700"]
filtered_vocab_df.columns

Index(['word', '1501-1550', '1551-1600', '1601-1650', '1651-1700', 'mean',
       'in_lila_embeddings', 'in_lasla', 'in_operamaiora', 'transl', 'emlap',
       'noscemus_sum'],
      dtype='object')

In [11]:
filtered_vocab_df = filtered_vocab_df[['word', '1501-1550', '1551-1600', '1601-1650', '1651-1700', 'noscemus_sum', 'emlap', 'in_lila_embeddings', 'in_lasla', 'in_operamaiora', 'transl']].sort_values("emlap", ascending=False)

In [12]:
len(emlap_wordcounts_tups)

62988

In [13]:
filtered_vocab_df = filtered_vocab_df[
    (filtered_vocab_df["word"].isin(emlap_wordcounts_tups[:2000])) |
    ((filtered_vocab_df["noscemus_sum"] > 0) & (filtered_vocab_df["emlap"] >= 10))
    ]
filtered_vocab_df.tail(10)

Unnamed: 0,word,1501-1550,1551-1600,1601-1650,1651-1700,noscemus_sum,emlap,in_lila_embeddings,in_lasla,in_operamaiora,transl
10640,chorus,216.0,250.0,251.0,312.0,1029.0,10.0,True,True,True,"chorus, choral passage in a play, dancing/sing..."
13750,conseruator,22.0,58.0,14.0,40.0,134.0,10.0,True,True,True,
25549,ges,201.0,492.0,1023.0,281.0,1997.0,10.0,False,False,False,
38245,mutuo,51.0,127.0,90.0,83.0,351.0,10.0,True,True,True,"lend, exchange"
52311,scotia,46.0,256.0,252.0,501.0,1055.0,10.0,False,False,False,Scotland
52440,scutum,136.0,316.0,123.0,232.0,807.0,10.0,True,True,True,shield
13937,consultatio,33.0,97.0,213.0,189.0,532.0,10.0,True,True,True,full/mature deliberation/consideration/discuss...
24459,frenum,85.0,349.0,78.0,62.0,574.0,10.0,True,True,True,"bridle/harness/rein/bit, harnessed horses/team..."
25758,globosus,49.0,211.0,264.0,242.0,766.0,10.0,False,False,False,"round, spherical"
16399,deformitas,26.0,186.0,62.0,88.0,362.0,10.0,True,True,True,"ugliness, deformity, blemish, disfigurement, d..."


In [14]:
len(filtered_vocab_df)

4501

In [15]:
vocab_freqs = filtered_vocab_df.set_index("word")["emlap"].to_dict()

# FastText - development and testing

In [16]:
# a genrator for iteration without flooding the CPU
class SentsCorpus:
    def __iter__(self):
        source_path = "/srv/data/tome/tome-corpus/sents_data_id_jsons_v3-0/"
        for id in ids:
            f_sents_data = json.load(open(source_path + str(id) + ".json", "rb"))
            sents_n = len(f_sents_data)
            tokens_n = 0
            lemmata = []
            for (doc_id, sent_id, sent_text, sent_data) in f_sents_data:
                tokens_n += len(sent_data)
                lemmasent = []
                for wordform, lemma, tag, position, t_pages, t_textblocks in sent_data:
                    if tag in ["NOUN", "PROPN", "ADJ", "VERB"]:
                        lemmasent.append(lemma)
                lemmasent = [re.sub(r"\W*|\d*", "", t) for t in lemmasent]
                lemmasent = [l.lower() for l in lemmasent if len(l) > 2]
                try:
                    yield lemmasent
                except:
                    pass

In [17]:
# test with a small corpus from one decade
ids = emlap_metadata["No."]
vocab_freqs = filtered_vocab_df.set_index("word")["emlap"].to_dict()
corpus = SentsCorpus()

In [18]:
# how many sentences are there in the small corpus?
len([s for s in corpus])

220846

In [19]:
%%time
# let's train the model
model = FastText(vector_size=100, window=10, negative=25, ns_exponent=1, sg=1, epochs=15, workers=8, min_n=5)
model.build_vocab_from_freq(word_freq=vocab_freqs)
model.train(corpus, total_examples=len([s for s in corpus]), epochs=model.epochs)

CPU times: user 6min 44s, sys: 5.41 s, total: 6min 49s
Wall time: 2min 21s


(15403505, 22766685)

In [22]:
target_path = "/srv/data/tome/tome-corpus/vectors/"
try:
    os.mkdir(target_path)
except:
    print("already exists")
    pass

In [23]:
model.wv.save(target_path + "emlap_v1.wv")

In [24]:
# most similar words to harmonia
model.wv.most_similar("aqua")

[('acetum', 0.6966753005981445),
 ('qui', 0.6826157569885254),
 ('oleum', 0.6821884512901306),
 ('destillo', 0.6779437065124512),
 ('liquor', 0.6778212189674377),
 ('balneum', 0.6569727063179016),
 ('stillo', 0.6359564661979675),
 ('fontanus', 0.6272138357162476),
 ('sal', 0.6266695261001587),
 ('abluo', 0.6263033151626587)]

In [25]:
import copy
model_emlap = copy.deepcopy(model)

In [26]:
emlap_word_vectors = model_emlap.wv.vectors
emlap_vocabulary = model_emlap.wv.index_to_key
emlap_kv = KeyedVectors(vector_size=emlap_word_vectors.shape[1])
    # Fill the keyed vectors
for word, vector in zip(emlap_vocabulary, emlap_word_vectors):
    emlap_kv.add_vector(word, vector)

In [40]:
emlap_kv.most_similar("aqua")

[('acetum', 0.6966753005981445),
 ('qui', 0.6826157569885254),
 ('oleum', 0.6821884512901306),
 ('destillo', 0.6779437065124512),
 ('liquor', 0.6778212189674377),
 ('balneum', 0.6569727063179016),
 ('stillo', 0.6359564661979675),
 ('fontanus', 0.6272138357162476),
 ('sal', 0.6266695261001587),
 ('abluo', 0.6263033151626587)]

# Adding to comp vectors dict

In [41]:
with open("/srv/data/tome/noscemus/vectors/vectors_dict_comp.pkl", "rb") as file:
    vectors_dict_comp = pickle.load(file)

In [42]:
vectors_dict_comp["EMLAP"] = emlap_kv

In [43]:
vectors_dict_comp["NOSCEMUS - Biology"].most_similar("equus")[:10]

[('asinus', 0.7495403289794922),
 ('mulus', 0.7450479865074158),
 ('bos', 0.687466561794281),
 ('iumentum', 0.685393214225769),
 ('currus', 0.6534374356269836),
 ('camelus', 0.63750821352005),
 ('eques', 0.6206961274147034),
 ('ungula', 0.6147962808609009),
 ('ceruus', 0.576291024684906),
 ('ueho', 0.5743056535720825)]

In [44]:
with open("/srv/data/tome/noscemus/vectors/vectors_dict_comp.pkl", "wb") as file:
    pickle.dump(vectors_dict_comp, file)

In [46]:
with open("/srv/webserver/apps/iweems_app/data/vectors_dict_comp_v0-3.pkl", "wb") as file:
    pickle.dump(vectors_dict_comp, file)

In [47]:
vectors_dict_comp.keys()

dict_keys(['NOSCEMUS - 1501-1550', 'NOSCEMUS - 1551-1600', 'NOSCEMUS - 1601-1650', 'NOSCEMUS - 1651-1700', 'NOSCEMUS - Alchemy/Chemistry', 'NOSCEMUS - Astronomy/Astrology/Cosmography', 'NOSCEMUS - Biology', 'NOSCEMUS - Geography/Cartography', 'NOSCEMUS - Mathematics', 'NOSCEMUS - Medicine', 'NOSCEMUS - Meteorology/Earth sciences', 'NOSCEMUS - Physics', 'LASLA', 'Opera Maiora', 'EMLAP'])