In [1]:

import os
import warnings

# Runtime switches for the ML stack. They are set here, ahead of the
# third-party imports further down in this cell.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '2',        # hide TensorFlow INFO and WARNING messages
    'TF_ENABLE_ONEDNN_OPTS': '0',       # turn oneDNN optimizations off
    'CUDA_VISIBLE_DEVICES': '',         # use CPU only (optional)
    'TOKENIZERS_PARALLELISM': 'false',  # silence tokenizer parallelism warnings
})

# Silence Python-level warnings: the catch-all filter first, then the three
# categories that are listed explicitly.
for _category in (Warning, UserWarning, FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_category)
del _category

import requests

import matplotlib.pyplot as plt
import seaborn as sns

import umap, numpy as np, pandas as pd

import warnings
# Install the ignore filters once more, after the third-party imports above.
# NOTE(review): presumably repeated because imported libraries can register
# their own warning filters — confirm; otherwise this only duplicates the
# filter setup at the top of the cell.
for _category in (Warning, UserWarning, FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_category)
del _category


2025-07-16 20:53:20.195029: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752692000.206145  813944 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752692000.209302  813944 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752692000.217514  813944 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752692000.217524  813944 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752692000.217525  813944 computation_placer.cc:177] computation placer alr

In [3]:
# Load the source table; later cells read its `context_3sents` column.
seminarium_df = pd.read_parquet(
    "../../latin-contextual-embeddings/data/large_files/seminarium-in-grela.parquet"
)

In [5]:
import requests, pandas as pd

# ─────────────────────────── 1. Label map ────────────────────────────
# Numeric class ID -> sense name; the position in the tuple is the ID.
LABEL_MAP = dict(enumerate(("horticulture", "education")))

# Reverse lookup: sense name -> numeric class ID.
LABEL2ID = {name: label_id for label_id, name in LABEL_MAP.items()}

# ─────────────────────────── 2. LLM system prompt with category list ──────────────────────────
# System prompt sent with every request. The numbered categories at the end
# have to stay in step with LABEL_MAP.
SYSTEM = "\n".join((
    "You are a Latin philologist and semantic analyst.",
    "",
    "Your task is to classify how the word “seminarium” is used in a Latin passage.",
    "",
    "Choose **exactly one** of the numbered categories below and reply with the **numeric ID only** (no explanations, no punctuation).",
    "",
    "Categories:",
    "",
    "  0 - horticulture",
    "  1 - education",
))

# ─────────────────────────── 3. Few-shot examples ──────────────────────────
# Solved examples as (Latin passage, label ID) pairs, in the order they
# appear in the prompt.
_FEWSHOT_EXAMPLES = (
    ("Monasterium quoddam tamquam seminarium religionis et pietatis instituit.", 1),
    ("Agricola seminarium plantarum ex variis regionibus colligebat.", 0),
    ("Seminarium novorum sacerdotum Romae erectum est sub auspiciis Gregorii XVI.", 1),
)

# Prompt template: the solved examples followed by one open slot. `{0}` is
# replaced with the passage to classify through str.format, and the template
# ends on "Label:" so the model continues with the ID.
FEWSHOT = "\n\n".join(
    [f"Passage:\n{text}\nLabel: {label_id}" for text, label_id in _FEWSHOT_EXAMPLES]
    + ["Passage:\n{0}\nLabel:"]
)

# ─────────────────────────── 4. Classification function ──────────────────────────
def classify_seminarium(passage: str,
                         *,
                         model_name: str = "llama3.1:70b",
                         url: str       = "http://localhost:11434/api/generate",
                         temperature:   float = 0.0,
                         timeout:       int   = 60) -> int | None:
    """Return a numeric label ID (0–9) or None on failure."""
    if pd.isna(passage) or passage is None:
        return None

    prompt = FEWSHOT.format(passage.strip())

    payload = {
        "model": model_name,
        "system": SYSTEM,
        "prompt": prompt,
        "options": {
            "temperature": temperature,
            "num_predict": 3
        },
        "stream": False
    }

    try:
        resp = requests.post(url, json=payload, timeout=timeout)
    except requests.RequestException as e:
        print("⛔ LLM request failed:", e)
        return None

    if resp.status_code != 200:
        print(f"⛔ LLM HTTP {resp.status_code}: {resp.text[:200]}")
        return None

    try:
        raw = resp.json().get("response", "").strip()
        pred_id = int(raw)
        return pred_id if pred_id in LABEL_MAP else None
    except (ValueError, KeyError):
        print("⚠︎ Unexpected LLM output:", resp.json())
        return None

# Smoke test on one randomly drawn passage. The draw is unseeded, so the
# passage (and therefore the displayed label) can differ between runs.
classify_seminarium(
    seminarium_df["context_3sents"].sample().iloc[0]
)

0

In [7]:
%%time
from concurrent.futures import ThreadPoolExecutor, as_completed


def classify_many(passages, max_workers=8, classify_fn=None):
    """Classify passages concurrently and return the results in input order.

    Args:
        passages: Iterable of passages (list, pandas Series, generator, ...).
        max_workers: Upper bound on the number of worker threads.
        classify_fn: Callable applied to each passage. Defaults to
            `classify_seminarium`, looked up when this function is called.

    Returns:
        A list with one result per passage, in the order of `passages`.
        An empty input yields an empty list.

    Raises:
        Any exception raised by `classify_fn` propagates to the caller.
    """
    if classify_fn is None:
        classify_fn = classify_seminarium

    # Materialise once so that one-shot iterables (generators) work as well.
    items = list(passages)
    if not items:
        return []

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Executor.map yields results in submission order, so no manual
        # future -> index bookkeeping is required.
        return list(pool.map(classify_fn, items))


# Classify every passage and store the numeric label ID (missing where the
# classifier returned None) in a new column.
seminarium_df["classification_single"] = classify_many(
    seminarium_df["context_3sents"],
    max_workers=8,
)


⚠︎ Unexpected LLM output: {'model': 'llama3.1:70b', 'created_at': '2025-07-16T19:05:32.726997584Z', 'response': '2 is not', 'done': True, 'done_reason': 'length', 'context': [128006, 9125, 128007, 271, 2675, 527, 264, 20023, 10891, 16549, 323, 42833, 18738, 382, 7927, 3465, 374, 311, 49229, 1268, 279, 3492, 1054, 325, 1083, 34765, 863, 374, 1511, 304, 264, 20023, 21765, 382, 25017, 3146, 327, 33839, 832, 334, 315, 279, 49926, 11306, 3770, 323, 10052, 449, 279, 3146, 20173, 3110, 1193, 334, 320, 2201, 41941, 11, 912, 62603, 3677, 21645, 1473, 220, 220, 15, 482, 305, 371, 93828, 198, 220, 220, 16, 482, 6873, 128009, 128006, 882, 128007, 271, 12465, 425, 512, 11342, 2352, 2411, 934, 14898, 309, 26555, 87801, 54675, 2411, 13901, 285, 1880, 281, 3978, 3689, 7915, 3159, 627, 2535, 25, 220, 16, 271, 12465, 425, 512, 32, 911, 292, 8083, 54675, 2411, 6136, 277, 372, 506, 4244, 285, 5654, 34495, 4631, 343, 3141, 266, 627, 2535, 25, 220, 15, 271, 12465, 425, 512, 30599, 14080, 2411, 6747, 34106, 

In [11]:
# Translate numeric IDs into readable names; any value that is not a key of
# LABEL_MAP (e.g. a failed classification) becomes "unknown".
seminarium_df["classification_single_label"] = (
    seminarium_df["classification_single"]
    .apply(lambda label_id: LABEL_MAP.get(label_id, "unknown"))
)

In [12]:
# Show how many passages fall under each label name, including the "unknown"
# bucket used for values that are not keys of LABEL_MAP.
seminarium_df["classification_single_label"].value_counts()

classification_single_label
horticulture    1108
education        681
unknown           14
Name: count, dtype: int64

In [13]:
# Persist the table, now including the classification columns, as parquet.
seminarium_df.to_parquet(
    "../data/large_files/seminarium-in-grela_classified.parquet"
)