In [22]:
import pandas as pd
import duckdb
import os
import os, json
from uuid import uuid4
import pandas as pd
# Notebook display settings: show full cell text (no column-width truncation)
# and allow up to 500 characters per line before pandas wraps the output.
display_options = {
    "display.max_colwidth": None,
    "display.width": 500,  # adjusts total line width before wrapping
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.preprocessing import normalize


In [8]:
# Load the LaBSE sentence-embedding model by its identifier; it is used further
# down to embed the free-text query.
model = SentenceTransformer("sentence-transformers/LaBSE")

In [3]:
# Open the DuckDB database file in read-only mode; every query in this notebook
# goes through `conn`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
conn = duckdb.connect('/srv/data/grela_v0-2.duckdb', read_only=True)

In [4]:
### load vulgate embeddings
# Fetch the id and stored embedding of every sentence whose grela_id starts
# with 'vulgate_'.
vulgate_query = """
    SELECT sentence_id, embedding
    FROM sentence_embeddings
    WHERE grela_id LIKE 'vulgate_%'
"""
vulgate_df = conn.execute(vulgate_query).fetchdf()

# Each stored embedding is a JSON string: decode them all and stack the result
# into one float32 array. `sentence_ids` keeps the same row order, so position i
# in `embeddings` belongs to `sentence_ids[i]`.
embeddings = np.array(list(map(json.loads, vulgate_df['embedding']))).astype(np.float32)
sentence_ids = vulgate_df['sentence_id'].tolist()

In [5]:
# Scale every embedding row to unit L2 norm so that an inner product between
# two rows equals their cosine similarity.
normalized_embeddings = normalize(embeddings, norm='l2')

In [7]:
# Build a flat inner-product FAISS index over the normalized Vulgate embeddings.
d = normalized_embeddings.shape[1]  # embedding dimensionality (768 per the original note)
index = faiss.IndexFlatIP(d)        # Inner product = cosine if normalized
index.add(normalized_embeddings)    # index position i corresponds to sentence_ids[i]

In [10]:

# Free-text query to match against the indexed Vulgate sentences.
query = "The Word became flesh and made his dwelling among us. We have seen his glory, the glory of the one and only Son, who came from the Father, full of grace and truth."

# Encode the query as a one-row batch and L2-normalize it in a single step, so
# the inner-product search that follows yields cosine similarities.
embedding = normalize(
    model.encode([query], convert_to_numpy=True),
    norm='l2',
)

In [16]:
k = 5  # number of nearest neighbors
# The search returns two arrays with one row per query vector: the similarity
# scores and the index positions of the k best matches.
scores, indices = index.search(embedding, k)

In [23]:
# Resolve every FAISS hit to its sentence text and the title of its work.
# The id is passed as a bound parameter instead of being interpolated into the
# SQL text, so an id containing a quote cannot break or alter the statement.
lookup_sql = """
    SELECT s.sentence_id, s.text, w.title
    FROM sentences s
    JOIN works w ON s.grela_id = w.grela_id
    WHERE s.sentence_id = ?
"""

results = []
for idx, score in zip(indices[0], scores[0]):
    # FAISS fills missing neighbours with index -1 (e.g. when the index holds
    # fewer than k vectors); without this guard sentence_ids[-1] would silently
    # report the last sentence as a hit.
    if idx < 0:
        continue

    sid = sentence_ids[idx]
    row = conn.execute(lookup_sql, [sid]).fetchone()

    # Hits without a matching sentences/works row are dropped.
    if row:
        sentence_id, text, title = row
        results.append({
            "score": score,
            "sentence_id": sentence_id,
            "text": text,
            "title": title
        })

# Convert to DataFrame, best match first. The explicit column list keeps the
# frame well-formed (and the sort valid) even when there are no hits at all.
df_results = (
    pd.DataFrame(results, columns=["score", "sentence_id", "text", "title"])
    .sort_values(by="score", ascending=False)
    .reset_index(drop=True)
)
df_results

Unnamed: 0,score,sentence_id,text,title
0,0.787623,vulgate_tlg0031.tlg004.obi-lat:1.14,et Verbum caro factum est et habitavit in nobis et vidimus gloriam eius gloriam quasi unigeniti a Patre plenum gratiae et veritatis,Vulgate - John
1,0.588898,vulgate_tlg0527.tlg048.obi-lat:9.6,parvulus enim natus est nobis filius datus est nobis et factus est principatus super umerum eius et vocabitur nomen eius Admirabilis consiliarius Deus fortis Pater futuri saeculi Princeps pacis,Vulgate - Isaiah
2,0.55222,vulgate_tlg0031.tlg023.obi-lat:5.20,et scimus quoniam Filius Dei venit et dedit nobis sensum ut cognoscamus verum Deum et simus in vero Filio eius hic est verus Deus et vita aeterna,Vulgate - 1 John
3,0.508574,vulgate_tlg0527.tlg005.obi-lat:5.24,ecce ostendit nobis Dominus Deus noster maiestatem et magnitudinem suam vocem eius audivimus de medio ignis et probavimus hodie quod loquente Deo cum homine vixerit homo,Vulgate - Deuteronomy
4,0.505002,vulgate_tlg0527.tlg046.obi-lat:9.7,et auferam sanguinem eius de ore eius et abominationes eius de medio dentium eius et relinquetur etiam ipse Deo nostro et erit quasi dux in Iuda et Accaron quasi Iebuseus,Vulgate - Zechariah


In [None]:
# load the register
# Fetch the id, text and stored embedding of every sentence belonging to the
# work with grela_id 'cc_10265'. The embedding column is left exactly as stored.
register_query = """
    SELECT s.sentence_id, s.text, e.embedding
    FROM sentence_embeddings e
    JOIN sentences s ON e.sentence_id = s.sentence_id
    WHERE e.grela_id = 'cc_10265'
"""
register = conn.execute(register_query).fetchdf()

In [24]:
# Release the database connection; any cell that queries `conn` must run before this one.
conn.close()