In [None]:
import os
import warnings
import requests
import umap
import numpy as np
import pandas as pd
import pickle


# Quiet TensorFlow / tokenizer logging via environment variables.
# NOTE(review): these are typically read when the library is first loaded, so they
# only help if none of the imports above has already pulled TensorFlow in — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress INFO and WARNING messages
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # Disable oneDNN optimizations
os.environ['CUDA_VISIBLE_DEVICES'] = ''   # Use CPU only (optional)
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Disable tokenizer parallelism warnings

# Suppress Python warnings. The blanket filter already ignores every category; the
# category-specific filters are kept so the intent survives if the blanket one is removed.
# (Each filter is registered once: re-adding an identical filter has no extra effect.)
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)


In [None]:
# Load the register-to-Vulgate match table; the path is relative to the notebook's directory.
REGISTER_MATCHES_PATH = '../data/register_matches_v1.parquet'
df = pd.read_parquet(REGISTER_MATCHES_PATH)
df

Unnamed: 0,sentence_id,text,vulgate_text,score,citation,vulgate_sentence_id
2412,cc_10265_2412,#posui te hodie super gentes et regna ut euell...,ecce constitui te hodie super gentes et super ...,0.899080,Jeremiah 1.10,vulgate_tlg0527.tlg049.obi-lat:1.10
3489,cc_10265_3489,"#quoniam data est a Domino potestas uobis, et ...",quoniam data est a Domino potestas vobis et vi...,0.866035,Wisdom 6.4,vulgate_tlg0527.tlg033.obi-lat:6.4
3480,cc_10265_3480,His quidem qui secundum patientiam boni operis...,his quidem qui secundum patientiam boni operis...,0.865040,Romans 2.7,vulgate_tlg0031.tlg006.obi-lat:2.7
5322,cc_10265_5322,#tu es Petrus et super hanc petram aedificabo ...,et ego dico tibi quia tu es Petrus et super ha...,0.860747,Matthew 16.18,vulgate_tlg0031.tlg001.obi-lat:16.18
3823,cc_10265_3823,#secundum multitudinem dolorum meorum in corde...,secundum multitudinem dolorum meorum in corde ...,0.856230,Psalms 93.19,vulgate_tlg0527.tlg027.obi-lat:93.19
...,...,...,...,...,...,...
3332,cc_10265_3332,De caetero rogamus te et multum admonemus ut o...,ut haec tibi verbis illius diceremus obsecro u...,0.600273,Genesis 50.17,vulgate_tlg0527.tlg001.obi-lat:50.17
2461,cc_10265_2461,"Praeterea monemus ut studeatis terram uestram,...",omnem causam quae venerit ad vos fratrum vestr...,0.600268,2 Chronicles 19.10,vulgate_tlg0527.tlg016.obi-lat:19.10
2158,cc_10265_2158,Praeterea si sancta Romana mater Ecclesia cont...,sed in testimonium inter nos et vos et subolem...,0.600197,Joshua 22.27,vulgate_tlg0527.tlg006.obi-lat:22.27
5112,cc_10265_5112,"et ejus conjugem, ut tam insigni patrono reuer...",vestes sanctas in ministerio Aaron sacerdoti e...,0.600177,Exodus 31.10,vulgate_tlg0527.tlg002.obi-lat:31.10


In [None]:
# ─────────────────────────── 2. LLM system prompt ──────────────────────────
# Instruction text sent as the "system" field of every request built by classify_desc.
# .strip() drops the leading/trailing newlines introduced by the triple-quoted layout.
# NOTE(review): the prompt contains the typo "Your will translate"; it is deliberately
# left untouched here because editing the prompt text may change the model's output.
SYSTEM = """
You are a Latin scholar, that specializes in the translation from Latin to English.

Your will translate each given Latin sentence into modern day English. Be precise and accurate.

Respond **only with the translation**. Don't output any explanations.
""".strip()

# ─────────────────────────── 3. Few-shot examples ──────────────────────────
# Prompt template: four worked Original/Translation pairs followed by an open slot.
# The single `{}` is filled with the passage via str.format() in classify_desc, and the
# template ends on "Translation:" so the model's completion is the translation itself.
# Because str.format() is used, any literal brace added to this text must be doubled.
FEWSHOT = """
Original: crescite vero in gratia et in cognitione Domini nostri et salvatoris Iesu Christi ipsi gloria et nunc et in die aeternitatis amen.
Translation: But grow in the grace and knowledge of our Lord and Savior Jesus Christ. To him be glory both now and forever! Amen.

Original: certissime ergo sciat omnis domus Israhel quia et Dominum eum et Christum Deus fecit hunc Iesum quem vos crucifixistis
Translation: Therefore let all Israel be assured of this: God has made this Jesus, whom you crucified, both Lord and Messiah.

Original: humiliamini igitur sub potenti manu Dei ut vos exaltet in tempore visitationis
Translation: Humble yourselves, therefore, under God’s mighty hand, that he may lift you up in due time.

Original: unde debuit per omnia fratribus similare ut misericors fieret et fidelis pontifex ad Deum ut repropitiaret delicta populi
Translation: For this reason he had to be made like them, fully human in every way, in order that he might become a merciful and faithful high priest in service to God, and that he might make atonement for the sins of the people.

Original: {}
Translation:
""".strip()

# ─────────────────────────── 4. Translation function ──────────────────────────
def classify_desc(passage: str,
                         *,
                         model_name: str = "llama3.1:70b",
                         url: str       = "http://localhost:11434/api/generate",
                         temperature:   float = 0.0,
                         timeout:       int   = 300) -> int | None:

    if pd.isna(passage) or passage is None:
        return None

    prompt = FEWSHOT.format(passage.strip())

    payload = {
        "model": model_name,
        "system": SYSTEM,
        "prompt": prompt,
        "options": {
            "temperature": temperature,
            "num_predict": 200
        },
        "stream": False
    }


    try:
        resp = requests.post(url, json=payload, timeout=timeout)
    except requests.RequestException as e:
        print("⛔ LLM request failed:", e)
        return None

    if resp.status_code != 200:
        print(f"⛔ LLM HTTP {resp.status_code}: {resp.text[:200]}")
        return None

    try:
        raw = resp.json().get("response", "").strip()
        return raw
    except Exception as e:
        print("⚠︎ Unexpected LLM output:", resp.json())
        return None

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

def classify_many(passages, max_workers=2, checkpoint_path="../data/translation_checkpoint.pkl"):
    """Translate many passages concurrently with ``classify_desc``, with resumable checkpoints.

    If ``checkpoint_path`` exists, its ``"translation"`` column is taken as the
    results for the first ``len(checkpoint)`` passages and only the remainder is
    submitted. Progress is printed, and a checkpoint written, every 10 completed
    passages and at the end.

    Args:
        passages: Iterable of passage strings (e.g. a pandas Series).
        max_workers: Number of worker threads issuing requests in parallel.
        checkpoint_path: Pickle file used both for resuming and for the final result.

    Returns:
        A list with one entry per passage, in input order. Entries are whatever
        ``classify_desc`` returned (``None`` for passages it could not translate;
        those are not retried on resume).

    Raises:
        ValueError: If the checkpoint holds more rows than there are passages.
    """
    passages = list(passages)
    total = len(passages)

    if os.path.exists(checkpoint_path):
        # NOTE(security): unpickling can execute arbitrary code; only load
        # checkpoint files that this notebook wrote itself.
        checkpoint = pd.read_pickle(checkpoint_path)
        results = checkpoint["translation"].tolist()
        start = len(results)
        if start > total:
            raise ValueError(
                f"Checkpoint {checkpoint_path!r} has {start} rows but only "
                f"{total} passages were given; it belongs to a different input."
            )
        print(f"Resuming from checkpoint, {start} lines already processed.")
    else:
        results = []
        start = 0

    # Pre-size the result list so out-of-order completions can be stored by index.
    results.extend([None] * (total - start))
    finished = [True] * start + [False] * (total - start)
    # Length of the contiguous run of finished passages starting at index 0.
    # Resume relies on the checkpoint being exactly such a prefix.
    prefix_done = start

    def _save(upto):
        """Write the first ``upto`` passages and their translations to the checkpoint."""
        pd.DataFrame(
            {"text": passages[:upto], "translation": results[:upto]}
        ).to_pickle(checkpoint_path)

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        fut_to_i = {ex.submit(classify_desc, p): i for i, p in enumerate(passages[start:], start)}
        completed = start
        for fut in as_completed(fut_to_i):
            i = fut_to_i[fut]
            results[i] = fut.result()
            finished[i] = True
            while prefix_done < total and finished[prefix_done]:
                prefix_done += 1
            completed += 1
            if completed % 10 == 0 or completed == total:
                print(f"Processed {completed}/{total}")
                # Futures complete out of order, so only the finished prefix is
                # saved; in-flight items must not be recorded as done.
                _save(prefix_done)

    # Save final results
    _save(total)
    return results

In [None]:
%%time
classify_many(df["text"])