**Thai Semantic Verb Clustering from Selectional Preference**

**Co-occurrence pairs creator**

In [None]:
# Mount Google Drive so the files written under /content/drive below are stored there.
from google.colab import drive
drive.mount("/content/drive")

!pip -q install -U huggingface_hub datasets

from huggingface_hub import snapshot_download
from pathlib import Path

# Drive folder that receives the local copy of the ThaiSum dataset files.
SAVE_DIR = Path("/content/drive/MyDrive/Colab_Datasets/VV/thaisum_parquet")

# Download only the parquet shards under data/ of the dataset repository.
# NOTE(review): `local_dir_use_symlinks` is reported as deprecated in recent
# huggingface_hub releases — confirm it is still accepted by the installed version.
snapshot_download(
    repo_id="pythainlp/thaisum",
    repo_type="dataset",
    local_dir=str(SAVE_DIR),
    local_dir_use_symlinks=False,
    allow_patterns=["data/*.parquet"]
)

print("Saved to:", SAVE_DIR)
# Sanity check: count the parquet files now present in SAVE_DIR/data.
print("Parquet files found:", len(list((SAVE_DIR / "data").glob("*.parquet"))))

!pip -q install datasets pyahocorasick tqdm pandas numpy

# Mount Google Drive; every input and output path below lives under it.
from google.colab import drive
drive.mount("/content/drive")

from datasets import load_dataset
from pathlib import Path
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import ahocorasick
from collections import defaultdict

# Folder holding the ThaiSum parquet shards (read from DATA_DIR below).
SAVE_DIR = Path("/content/drive/MyDrive/Colab_Datasets/VV/thaisum_parquet")
DATA_DIR = SAVE_DIR / "data"

# Folder that receives the generated pair-count CSV files.
OUT_DIR = Path("/content/drive/MyDrive/Colab_Datasets/VV")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Word-list CSVs, one per part of speech (read as a single headerless column).
VERB_CSV_PATH = Path("/content/drive/MyDrive/Colab_Datasets/VV/Wiktionary_Thai_verb_26122025.csv")
NOUN_CSV_PATH = Path("/content/drive/MyDrive/Colab_Datasets/VV/Wiktionary_Thai_noun_27122025.csv")

ADVERB_CSV_PATH    = Path("/content/drive/MyDrive/Colab_Datasets/VV/Wiktionary_Thai_adverb_02012026.csv")
ADJECTIVE_CSV_PATH = Path("/content/drive/MyDrive/Colab_Datasets/VV/Wiktionary_Thai_adjective_02012026.csv")
PRONOUN_CSV_PATH   = Path("/content/drive/MyDrive/Colab_Datasets/VV/Wiktionary_Thai_pronoun_02012026.csv")

# Dataset split that is scanned.
SPLIT = "train"
# Row field whose text is scanned for word matches.
TEXT_FIELD = "body"
# None or -1 scans every row; any other value limits the scan to the first MAX_ROWS rows.
MAX_ROWS = -1
# Number of pair rows materialised and written to CSV per chunk.
CHUNK_WRITE = 200_000
# When True, a job whose output CSV already exists is switched off before scanning.
SKIP_IF_OUTPUT_EXISTS = True

# Job switches. Keys name the adjacency pattern that is counted; a part of
# speech in parentheses is a middle word that must sit between the two verbs.
ENABLE = {
    "verb-verb": True,
    "noun-verb": True,
    "verb-noun": True,
    "verb-(noun)-verb": True,
    "verb-(verb)-verb": True,
    "adverb-verb": True,
    "verb-adverb": True,
    "verb-(adverb)-verb": True,
    "adjective-verb": True,
    "verb-adjective": True,
    "verb-(adjective)-verb": True,
    "pronoun-verb": True,
    "verb-pronoun": True,
    "verb-(pronoun)-verb": True,
}

# For the verb-(X)-verb jobs: when False, a middle word that also appears in
# the verb list is not accepted as the middle element.
ALLOW_OVERLAP_MIDDLE = {
    "noun": True,
    "adverb": True,
    "adjective": True,
    "pronoun": True,
}

# Output CSV path for every job key in ENABLE.
OUT = {
    "verb-verb": OUT_DIR / "Thaisum_verb-verb_pairs.csv",

    "noun-verb": OUT_DIR / "Thaisum_noun-verb_pairs.csv",
    "verb-noun": OUT_DIR / "Thaisum_verb-noun_pairs.csv",
    "verb-(noun)-verb": OUT_DIR / "Thaisum_verb-(noun)-verb_pairs.csv",

    "verb-(verb)-verb": OUT_DIR / "Thaisum_verb-(verb)-verb_pairs.csv",

    "adverb-verb": OUT_DIR / "Thaisum_adverb-verb_pairs.csv",
    "verb-adverb": OUT_DIR / "Thaisum_verb-adverb_pairs.csv",
    "verb-(adverb)-verb": OUT_DIR / "Thaisum_verb-(adverb)-verb_pairs.csv",

    "adjective-verb": OUT_DIR / "Thaisum_adjective-verb_pairs.csv",
    "verb-adjective": OUT_DIR / "Thaisum_verb-adjective_pairs.csv",
    "verb-(adjective)-verb": OUT_DIR / "Thaisum_verb-(adjective)-verb_pairs.csv",

    "pronoun-verb": OUT_DIR / "Thaisum_pronoun-verb_pairs.csv",
    "verb-pronoun": OUT_DIR / "Thaisum_verb-pronoun_pairs.csv",
    "verb-(pronoun)-verb": OUT_DIR / "Thaisum_verb-(pronoun)-verb_pairs.csv",
}

# Switch off every enabled job whose output file is already on disk.
# Only existence is checked; the file's contents are not inspected.
if SKIP_IF_OUTPUT_EXISTS:
    for k, path in OUT.items():
        if ENABLE.get(k, False) and path.exists():
            print(f"[SKIP] {k} enabled but output exists: {path}")
            ENABLE[k] = False

# Report the jobs that will actually run.
print("\nEnabled jobs:")
for k, v in ENABLE.items():
    if v:
        print("  ✅", k)

# Accepts strings made only of code points U+0E00..U+0E7F and spaces.
THAI_RE = re.compile(r"^[\u0E00-\u0E7F ]+$")

def load_wordlist_onecol(csv_path: Path, label: str):
    """Read a headerless one-column CSV and return its distinct Thai entries.

    Values are stringified, stripped of BOM characters and surrounding
    whitespace, and the placeholders "", "nan" and "None" are discarded.
    Distinct values are kept in first-seen order, and only those matching
    THAI_RE are returned.

    Raises:
        ValueError: if no entry survives the filtering.
    """
    frame = pd.read_csv(csv_path, header=None, names=[label], encoding="utf-8-sig")

    cleaned = frame[label].astype(str)
    cleaned = cleaned.str.replace("\ufeff", "", regex=False).str.strip()
    cleaned = cleaned.replace({"": np.nan, "nan": np.nan, "None": np.nan}).dropna()

    words = []
    for candidate in cleaned.unique().tolist():
        if THAI_RE.match(candidate):
            words.append(candidate)

    if len(words) == 0:
        raise ValueError(f"No usable Thai {label}s found in {csv_path}")
    return words

def build_automaton(words):
    """Build an Aho-Corasick automaton over *words*.

    Each word is stored with its list index as the payload. Returns the
    finalised automaton together with an int32 array of word lengths,
    indexed by that same payload.
    """
    automaton = ahocorasick.Automaton()
    for index, word in enumerate(words):
        automaton.add_word(word, index)
    automaton.make_automaton()

    word_lengths = np.array(list(map(len, words)), dtype=np.int32)
    return automaton, word_lengths

# Collect the local parquet shards for each split, sorted by path.
data_files = {
    "train": sorted(str(p) for p in DATA_DIR.glob("train-*.parquet")),
    "validation": sorted(str(p) for p in DATA_DIR.glob("validation-*.parquet")),
    "test": sorted(str(p) for p in DATA_DIR.glob("test-*.parquet")),
}
# Only the train shards are checked here; the other two lists may be empty.
if not data_files["train"]:
    raise FileNotFoundError(f"No train parquet files found in: {DATA_DIR}")

ds = load_dataset("parquet", data_files=data_files)
print(ds)

# Verbs are always loaded; V is the verb vocabulary size used in all pair codes.
VERBS = load_wordlist_onecol(VERB_CSV_PATH, "verb")
V = len(VERBS)
verb_set = set(VERBS)
print("Loaded verbs:", V)

# Nouns are loaded only if at least one noun-related job is enabled.
need_nouns = any(ENABLE.get(k, False) for k in ["noun-verb", "verb-noun", "verb-(noun)-verb"])
if need_nouns:
    NOUNS = load_wordlist_onecol(NOUN_CSV_PATH, "noun")
    N = len(NOUNS)
    noun_set = set(NOUNS)
    # Words listed as both noun and verb; noun_in_overlap flags them per noun id.
    OVERLAP_NOUN_VERB = verb_set.intersection(noun_set)
    noun_in_overlap = np.array([w in OVERLAP_NOUN_VERB for w in NOUNS], dtype=bool)
    print("Loaded nouns:", N, " overlap(noun∩verb):", len(OVERLAP_NOUN_VERB))
else:
    NOUNS = None
    N = 0
    noun_in_overlap = None

# Same loading for adverbs, adjectives and pronouns, stored per POS in POS_LISTS.
# A POS with no enabled job gets no entry.
POS_LISTS = {}
for pos, path in [("adverb", ADVERB_CSV_PATH), ("adjective", ADJECTIVE_CSV_PATH), ("pronoun", PRONOUN_CSV_PATH)]:
    need_pos = any(ENABLE.get(k, False) for k in [f"{pos}-verb", f"verb-{pos}", f"verb-({pos})-verb"])
    if need_pos:
        words = load_wordlist_onecol(path, pos)
        overlap = verb_set.intersection(set(words))
        POS_LISTS[pos] = {
            "words": words,
            "X": len(words),
            "overlap_set": overlap,
            "x_in_overlap": np.array([w in overlap for w in words], dtype=bool)
        }
        print(f"Loaded {pos}s:", len(words), f" overlap({pos}∩verb):", len(overlap))

# One automaton (plus word-length table) per loaded vocabulary.
A_verb, lens_verb = build_automaton(VERBS)

if need_nouns:
    A_noun, lens_noun = build_automaton(NOUNS)

for pos, spec in POS_LISTS.items():
    A_x, lens_x = build_automaton(spec["words"])
    spec["A"] = A_x
    spec["lens"] = lens_x

verbs_arr = np.array(VERBS, dtype=object)

# Count arrays are flat: a (left, right) pair is stored at
# left_id * (size of the right vocabulary) + right_id.
# occ_* accumulates every match; txt_* is incremented once per text per pair.
if ENABLE.get("verb-verb", False):
    TOTAL_VV = V * V
    occ_vv = np.zeros(TOTAL_VV, dtype=np.int64)
    txt_vv = np.zeros(TOTAL_VV, dtype=np.int32)

if ENABLE.get("noun-verb", False):
    TOTAL_NV = N * V
    occ_nv = np.zeros(TOTAL_NV, dtype=np.int64)
    txt_nv = np.zeros(TOTAL_NV, dtype=np.int32)

if ENABLE.get("verb-noun", False):
    TOTAL_VN = V * N
    occ_vn = np.zeros(TOTAL_VN, dtype=np.int64)
    txt_vn = np.zeros(TOTAL_VN, dtype=np.int32)

if ENABLE.get("verb-(noun)-verb", False):
    TOTAL_VV = V * V
    occ_vnv = np.zeros(TOTAL_VV, dtype=np.int64)
    txt_vnv = np.zeros(TOTAL_VV, dtype=np.int32)

if ENABLE.get("verb-(verb)-verb", False):
    TOTAL_VV = V * V
    occ_vvv = np.zeros(TOTAL_VV, dtype=np.int64)
    txt_vvv = np.zeros(TOTAL_VV, dtype=np.int32)

# Per-POS count arrays are stored inside the POS spec dict.
for pos, spec in POS_LISTS.items():
    X = spec["X"]
    if ENABLE.get(f"{pos}-verb", False):
        spec["occ_xv"] = np.zeros(X * V, dtype=np.int64)
        spec["txt_xv"] = np.zeros(X * V, dtype=np.int32)
    if ENABLE.get(f"verb-{pos}", False):
        spec["occ_vx"] = np.zeros(V * X, dtype=np.int64)
        spec["txt_vx"] = np.zeros(V * X, dtype=np.int32)
    if ENABLE.get(f"verb-({pos})-verb", False):
        spec["occ_vxv"] = np.zeros(V * V, dtype=np.int64)
        spec["txt_vxv"] = np.zeros(V * V, dtype=np.int32)

# Stop before scanning when every job is disabled (e.g. all outputs already exist).
any_enabled = any(ENABLE.values())
if not any_enabled:
    print("\nNothing enabled. Exiting without scanning.")
    raise SystemExit

# Choose the rows to scan: the whole split, or only its first MAX_ROWS rows.
data = ds[SPLIT]
if MAX_ROWS is None or MAX_ROWS == -1:
    iterable = data
    total_for_tqdm = len(data)
else:
    # Never request more rows than the split contains.
    n_rows = min(int(MAX_ROWS), len(data))
    iterable = data.select(range(n_rows))
    total_for_tqdm = n_rows

# Main scan. For each text, every dictionary word occurrence is located with
# the Aho-Corasick automata and indexed by its start and end character
# position. Two words count as adjacent when the right one starts exactly one
# position after the left one ends (no gap, no overlap).
for row in tqdm(iterable, total=total_for_tqdm, desc=f"Scanning {SPLIT}"):
    # A missing or None field is treated as empty text.
    text = row.get(TEXT_FIELD, "") or ""

    # Verb matches: ids by start position, ids by end position, and
    # (id, end) tuples by start position for use as a middle element.
    v_starts = defaultdict(list)
    v_ends   = defaultdict(list)
    v_starts_with_end = defaultdict(list)
    for end_pos, vid in A_verb.iter(text):
        # The automaton reports the index of the match's last character.
        start_pos = end_pos - lens_verb[vid] + 1
        v_starts[start_pos].append(vid)
        v_ends[end_pos].append(vid)
        v_starts_with_end[start_pos].append((vid, end_pos))

    # Noun matches, indexed the same way.
    if need_nouns:
        n_starts = defaultdict(list)
        n_ends   = defaultdict(list)
        n_starts_with_end = defaultdict(list)
        for end_pos, nid in A_noun.iter(text):
            start_pos = end_pos - lens_noun[nid] + 1
            n_starts[start_pos].append(nid)
            n_ends[end_pos].append(nid)
            n_starts_with_end[start_pos].append((nid, end_pos))

    # verb immediately followed by verb
    if ENABLE.get("verb-verb", False):
        # `seen` collects the pair codes hit in this text so txt_* grows by at most 1.
        seen = set()
        for e, left_vs in v_ends.items():
            right_vs = v_starts.get(e + 1)
            if not right_vs:
                continue
            for v1 in left_vs:
                base = v1 * V
                for v2 in right_vs:
                    code = base + v2
                    occ_vv[code] += 1
                    seen.add(code)
        for code in seen:
            txt_vv[code] += 1

    # verb, then any verb as middle element, then verb; counted under (v1, v2)
    if ENABLE.get("verb-(verb)-verb", False):
        seen = set()
        for e, left_vs in v_ends.items():
            mids = v_starts_with_end.get(e + 1)
            if not mids:
                continue
            for v1 in left_vs:
                base = v1 * V
                for (vmid, mid_end) in mids:
                    right_vs = v_starts.get(mid_end + 1)
                    if not right_vs:
                        continue
                    for v2 in right_vs:
                        code = base + v2
                        occ_vvv[code] += 1
                        seen.add(code)
        for code in seen:
            txt_vvv[code] += 1

    # noun immediately followed by verb; nouns also listed as verbs are skipped
    if ENABLE.get("noun-verb", False):
        seen = set()
        for e, left_ns in n_ends.items():
            right_vs = v_starts.get(e + 1)
            if not right_vs:
                continue
            for nid in left_ns:
                if noun_in_overlap[nid]:
                    continue
                base = nid * V
                for vid in right_vs:
                    code = base + vid
                    occ_nv[code] += 1
                    seen.add(code)
        for code in seen:
            txt_nv[code] += 1

    # verb immediately followed by noun; nouns also listed as verbs are skipped
    if ENABLE.get("verb-noun", False):
        seen = set()
        for e, left_vs in v_ends.items():
            right_ns = n_starts.get(e + 1)
            if not right_ns:
                continue
            for vid in left_vs:
                base = vid * N
                for nid in right_ns:
                    if noun_in_overlap[nid]:
                        continue
                    code = base + nid
                    occ_vn[code] += 1
                    seen.add(code)
        for code in seen:
            txt_vn[code] += 1

    # verb, noun as middle element, verb; counted under (v1, v2)
    if ENABLE.get("verb-(noun)-verb", False):
        allow_mid = ALLOW_OVERLAP_MIDDLE.get("noun", True)
        seen = set()
        for e, left_vs in v_ends.items():
            mids = n_starts_with_end.get(e + 1)
            if not mids:
                continue
            for v1 in left_vs:
                base = v1 * V
                for (nid, n_end) in mids:
                    # Overlapping nouns are rejected as middle only when configured.
                    if (not allow_mid) and noun_in_overlap[nid]:
                        continue
                    right_vs = v_starts.get(n_end + 1)
                    if not right_vs:
                        continue
                    for v2 in right_vs:
                        code = base + v2
                        occ_vnv[code] += 1
                        seen.add(code)
        for code in seen:
            txt_vnv[code] += 1

    # Same three patterns for every additional part of speech that was loaded.
    for pos, spec in POS_LISTS.items():
        A_x = spec["A"]
        lens_x = spec["lens"]
        X = spec["X"]
        x_in_overlap = spec["x_in_overlap"]
        allow_mid = ALLOW_OVERLAP_MIDDLE.get(pos, True)

        # Matches of this POS in the current text.
        x_starts = defaultdict(list)
        x_ends   = defaultdict(list)
        x_starts_with_end = defaultdict(list)
        for end_pos, xid in A_x.iter(text):
            start_pos = end_pos - lens_x[xid] + 1
            x_starts[start_pos].append(xid)
            x_ends[end_pos].append(xid)
            x_starts_with_end[start_pos].append((xid, end_pos))

        # X immediately followed by verb; code = xid * V + vid
        if ENABLE.get(f"{pos}-verb", False):
            seen = set()
            for e, left_xs in x_ends.items():
                right_vs = v_starts.get(e + 1)
                if not right_vs:
                    continue
                for xid in left_xs:
                    if x_in_overlap[xid]:
                        continue
                    base = xid * V
                    for vid in right_vs:
                        code = base + vid
                        spec["occ_xv"][code] += 1
                        seen.add(code)
            for code in seen:
                spec["txt_xv"][code] += 1

        # verb immediately followed by X; code = vid * X + xid
        if ENABLE.get(f"verb-{pos}", False):
            seen = set()
            for e, left_vs in v_ends.items():
                right_xs = x_starts.get(e + 1)
                if not right_xs:
                    continue
                for vid in left_vs:
                    base = vid * X
                    for xid in right_xs:
                        if x_in_overlap[xid]:
                            continue
                        code = base + xid
                        spec["occ_vx"][code] += 1
                        seen.add(code)
            for code in seen:
                spec["txt_vx"][code] += 1

        # verb, X as middle element, verb; counted under (v1, v2)
        if ENABLE.get(f"verb-({pos})-verb", False):
            seen = set()
            for e, left_vs in v_ends.items():
                mids = x_starts_with_end.get(e + 1)
                if not mids:
                    continue
                for v1 in left_vs:
                    base = v1 * V
                    for (xid, x_end) in mids:
                        if (not allow_mid) and x_in_overlap[xid]:
                            continue
                        right_vs = v_starts.get(x_end + 1)
                        if not right_vs:
                            continue
                        for v2 in right_vs:
                            code = base + v2
                            spec["occ_vxv"][code] += 1
                            seen.add(code)
            for code in seen:
                spec["txt_vxv"][code] += 1

print("Done counting.")

def write_adj_left_right(left_words, right_words, occ, txt, out_path, left_name, right_name,
                         skip_left_mask=None, total_chunk=CHUNK_WRITE):
    """Write every (left, right) pair with its counts to a CSV, chunk by chunk.

    Pair codes are ``left_id * len(right_words) + right_id`` and index both
    *occ* (total occurrences) and *txt* (texts containing the pair). When
    *skip_left_mask* is given, rows whose left word is flagged are omitted.
    The first chunk that has rows creates the file with a header; later
    chunks are appended. If no chunk has rows, no file is written.
    """
    n_right = len(right_words)
    n_pairs = len(left_words) * n_right
    left_lookup = np.array(left_words, dtype=object)
    right_lookup = np.array(right_words, dtype=object)

    header_pending = True
    progress = tqdm(range(0, n_pairs, total_chunk), desc=f"Writing {left_name}-{right_name} CSV")
    for lo in progress:
        hi = min(lo + total_chunk, n_pairs)
        flat = np.arange(lo, hi, dtype=np.int64)
        left_ids = (flat // n_right).astype(np.int64)
        right_ids = (flat % n_right).astype(np.int64)

        if skip_left_mask is None:
            keep = np.ones_like(left_ids, dtype=bool)
        else:
            keep = ~skip_left_mask[left_ids]
        if not np.any(keep):
            continue

        kept_left = left_ids[keep]
        kept_right = right_ids[keep]
        codes = (kept_left * n_right + kept_right).astype(np.int64)

        left_col = left_lookup[kept_left]
        right_col = right_lookup[kept_right]

        chunk_df = pd.DataFrame({
            left_name: left_col,
            right_name: right_col,
            "needle": (left_col + right_col),
            "total_occurrences": occ[codes],
            "texts_with_match": txt[codes],
        })

        chunk_df.to_csv(
            out_path,
            mode="w" if header_pending else "a",
            header=header_pending,
            index=False,
            encoding="utf-8-sig"
        )
        header_pending = False

    print("Saved:", out_path)

def write_vv_like(occ, txt, out_path, marker):
    """Write a V x V verb-pair count table to CSV in chunks of CHUNK_WRITE rows.

    Row ``i`` describes the pair (verbs_arr[i // V], verbs_arr[i % V]). The
    "needle" column is prefix + marker + suffix. The chunk starting at row 0
    creates the file with a header; the others are appended.
    """
    n_pairs = V * V
    for lo in tqdm(range(0, n_pairs, CHUNK_WRITE), desc=f"Writing {out_path.name}"):
        hi = min(lo + CHUNK_WRITE, n_pairs)
        flat = np.arange(lo, hi, dtype=np.int64)
        prefix_ids = (flat // V).astype(np.int64)
        suffix_ids = (flat % V).astype(np.int64)

        prefix_col = verbs_arr[prefix_ids]
        suffix_col = verbs_arr[suffix_ids]

        is_first_chunk = (lo == 0)
        chunk_df = pd.DataFrame({
            "prefix": prefix_col,
            "suffix": suffix_col,
            "needle": (prefix_col + marker + suffix_col),
            "total_occurrences": occ[lo:hi],
            "texts_with_match": txt[lo:hi],
        })

        chunk_df.to_csv(
            out_path,
            mode="w" if is_first_chunk else "a",
            header=is_first_chunk,
            index=False,
            encoding="utf-8-sig"
        )
    print("Saved:", out_path)

# Direct verb-verb adjacency; the needle is the two verbs concatenated.
if ENABLE.get("verb-verb", False):
    write_vv_like(occ_vv, txt_vv, OUT["verb-verb"], marker="")

# Verb pairs with a verb in between; "<V>" marks the middle slot in the needle.
if ENABLE.get("verb-(verb)-verb", False):
    write_vv_like(occ_vvv, txt_vvv, OUT["verb-(verb)-verb"], marker="<V>")

# noun-verb rows; nouns that are also in the verb list are left out of the file.
if ENABLE.get("noun-verb", False):
    write_adj_left_right(
        left_words=NOUNS, right_words=VERBS,
        occ=occ_nv, txt=txt_nv,
        out_path=OUT["noun-verb"],
        left_name="noun", right_name="verb",
        skip_left_mask=noun_in_overlap
    )

# verb-noun rows: one CSV row per (verb, noun) pair, written in chunks of
# CHUNK_WRITE pair codes. Nouns that are also in the verb list are omitted.
if ENABLE.get("verb-noun", False):
    out_path = OUT["verb-noun"]
    total_vn = V * N
    # Build the noun lookup array once; it does not change between chunks.
    nouns_arr = np.array(NOUNS, dtype=object)

    wrote_header = False
    for start in tqdm(range(0, total_vn, CHUNK_WRITE), desc="Writing verb-noun CSV"):
        end = min(start + CHUNK_WRITE, total_vn)
        idx = np.arange(start, end, dtype=np.int64)

        # Pair code = verb_id * N + noun_id.
        v_ids = (idx // N).astype(np.int64)
        n_ids = (idx %  N).astype(np.int64)

        # Skip the whole chunk if every noun in it is an overlapping one.
        keep = ~noun_in_overlap[n_ids]
        if not np.any(keep):
            continue

        v_ids_k = v_ids[keep]
        n_ids_k = n_ids[keep]
        codes   = (v_ids_k * N + n_ids_k).astype(np.int64)

        verb_col = verbs_arr[v_ids_k]
        noun_col = nouns_arr[n_ids_k]

        df = pd.DataFrame({
            "verb": verb_col,
            "noun": noun_col,
            "needle": (verb_col + noun_col),
            "total_occurrences": occ_vn[codes],
            "texts_with_match": txt_vn[codes],
        })

        # The first chunk that has rows creates the file; later ones append.
        df.to_csv(
            out_path,
            mode="w" if not wrote_header else "a",
            header=(not wrote_header),
            index=False,
            encoding="utf-8-sig"
        )
        wrote_header = True
    print("Saved:", out_path)

# Verb pairs with a noun in between; "<N>" marks the middle slot in the needle.
if ENABLE.get("verb-(noun)-verb", False):
    write_vv_like(occ_vnv, txt_vnv, OUT["verb-(noun)-verb"], marker="<N>")

# Outputs for each additional part of speech that was loaded.
for pos, spec in POS_LISTS.items():
    # X-verb rows; X words that are also verbs are left out.
    if ENABLE.get(f"{pos}-verb", False):
        write_adj_left_right(
            left_words=spec["words"], right_words=VERBS,
            occ=spec["occ_xv"], txt=spec["txt_xv"],
            out_path=OUT[f"{pos}-verb"],
            left_name=pos, right_name="verb",
            skip_left_mask=spec["x_in_overlap"]
        )

    # verb-X rows are written inline because the skip mask applies to the
    # right-hand word here, which write_adj_left_right does not support.
    if ENABLE.get(f"verb-{pos}", False):
        out_path = OUT[f"verb-{pos}"]
        X = spec["X"]
        x_in_overlap = spec["x_in_overlap"]
        pos_arr = np.array(spec["words"], dtype=object)

        wrote_header = False
        for start in tqdm(range(0, V * X, CHUNK_WRITE), desc=f"Writing verb-{pos} CSV"):
            end = min(start + CHUNK_WRITE, V * X)
            idx = np.arange(start, end, dtype=np.int64)

            # Pair code = verb_id * X + x_id.
            v_ids = (idx // X).astype(np.int64)
            x_ids = (idx %  X).astype(np.int64)

            keep = ~x_in_overlap[x_ids]
            if not np.any(keep):
                continue

            v_ids_k = v_ids[keep]
            x_ids_k = x_ids[keep]
            codes   = (v_ids_k * X + x_ids_k).astype(np.int64)

            verb_col = verbs_arr[v_ids_k]
            pos_col  = pos_arr[x_ids_k]

            df = pd.DataFrame({
                "verb": verb_col,
                pos: pos_col,
                "needle": (verb_col + pos_col),
                "total_occurrences": spec["occ_vx"][codes],
                "texts_with_match": spec["txt_vx"][codes],
            })

            # The first chunk that has rows creates the file; later ones append.
            df.to_csv(
                out_path,
                mode="w" if not wrote_header else "a",
                header=(not wrote_header),
                index=False,
                encoding="utf-8-sig"
            )
            wrote_header = True
        print("Saved:", out_path)

    # Middle-slot marker is the upper-cased first letter of the POS name.
    # Note that "adverb" and "adjective" both yield "<A>".
    if ENABLE.get(f"verb-({pos})-verb", False):
        marker = f"<{pos[:1].upper()}>"
        write_vv_like(spec["occ_vxv"], spec["txt_vxv"], OUT[f"verb-({pos})-verb"], marker=marker)

print("\nALL DONE ✅")

**Cluster creator**

In [None]:
!pip -q install pandas numpy scipy scikit-learn tqdm pynndescent matplotlib ipywidgets

import os, re, time, json, hashlib
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.sparse as sp

from tqdm.auto import tqdm
from sklearn.preprocessing import normalize
from sklearn.manifold import SpectralEmbedding
from sklearn.metrics import normalized_mutual_info_score
from pynndescent import NNDescent

# Widgets are optional: if ipywidgets/IPython cannot be imported for any
# reason, the flag is cleared and ProgressUI falls back to plain printing.
_UI_AVAILABLE = True
try:
    import ipywidgets as widgets
    from IPython.display import display
except Exception:
    _UI_AVAILABLE = False

class ProgressUI:
    def __init__(self, enabled: bool = True, show_log: bool = False):
        self.enabled = bool(enabled) and _UI_AVAILABLE
        self.show_log = bool(show_log)
        self._overall_total = 1
        self._overall_value = 0
        self._run_total = 1
        self._run_value = 0

        if not self.enabled:
            self.box = None
            return

        self.status = widgets.HTML(value="<b>Status:</b> idle")
        self.overall = widgets.IntProgress(value=0, min=0, max=1, description="Overall", bar_style="")
        self.runbar = widgets.IntProgress(value=0, min=0, max=1, description="Run", bar_style="")
        self._log = widgets.Output(layout={"border": "1px solid #ddd", "max_height": "160px", "overflow_y": "auto"}) if self.show_log else None

        items = [self.status, self.overall, self.runbar]
        if self._log is not None:
            items.append(self._log)

        self.box = widgets.VBox(items)
        display(self.box)

    def set_status(self, text: str):
        if self.enabled:
            self.status.value = f"<b>Status:</b> {text}"
        else:
            print(text)

    def log(self, text: str):
        if not self.enabled or self._log is None:
            return
        with self._log:
            print(text)

    def start_overall(self, total_steps: int, text: str = "starting..."):
        self._overall_total = int(max(1, total_steps))
        self._overall_value = 0
        if self.enabled:
            self.overall.max = self._overall_total
            self.overall.value = 0
            self.overall.bar_style = ""
        self.set_status(text)

    def step_overall(self, n: int = 1, text: str | None = None):
        self._overall_value = min(self._overall_total, self._overall_value + int(n))
        if self.enabled:
            self.overall.value = self._overall_value
        if text is not None:
            self.set_status(text)

    def start_run(self, total_steps: int, text: str = "run starting..."):
        self._run_total = int(max(1, total_steps))
        self._run_value = 0
        if self.enabled:
            self.runbar.max = self._run_total
            self.runbar.value = 0
            self.runbar.bar_style = ""
        self.set_status(text)

    def step_run(self, n: int = 1, text: str | None = None):
        self._run_value = min(self._run_total, self._run_value + int(n))
        if self.enabled:
            self.runbar.value = self._run_value
        if text is not None:
            self.set_status(text)

    def mark_done(self, text: str = "DONE"):
        if self.enabled:
            self.overall.bar_style = "success"
            self.runbar.bar_style = "success"
        self.set_status(text)

# Default maximum length of a sanitised name component.
_MAX_COMPONENT = 140

def _safe_name(s: str, max_len: int = _MAX_COMPONENT) -> str:
    """Return *s* reduced to ``[A-Za-z0-9._-]`` characters.

    Whitespace runs and runs of any other character each become one
    underscore, repeated underscores are collapsed, an empty result becomes
    "x", and the result is cut to at most *max_len* characters.
    """
    cleaned = str(s).strip()
    for pattern in (r"\s+", r"[^A-Za-z0-9._\-]+", r"_+"):
        cleaned = re.sub(pattern, "_", cleaned)
    if not cleaned:
        cleaned = "x"
    if len(cleaned) > max_len:
        return cleaned[:max_len]
    return cleaned

def _now_str():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    stamp_format = "%Y%m%d_%H%M%S"
    return time.strftime(stamp_format, time.localtime())

def _ensure_dir(p: Path):
    """Create directory *p*, including missing parents, and return *p*."""
    p.mkdir(exist_ok=True, parents=True)
    return p

def _atomic_replace(tmp_path: Path, final_path: Path):
    """Move *tmp_path* onto *final_path* with ``os.replace``.

    The destination's parent directory is created first if needed.
    """
    destination_dir = final_path.parent
    destination_dir.mkdir(exist_ok=True, parents=True)
    source, destination = str(tmp_path), str(final_path)
    os.replace(source, destination)

def _tmp_path(final_path: Path, ext: str | None = None) -> Path:
    """
    Make a short temp filename in the same directory, to avoid Drive filename limits.
    """
    ext = ext if ext is not None else final_path.suffix
    ts = int(time.time() * 1_000_000)
    # leading dot keeps it hidden; name stays short
    return final_path.with_name(f".tmp_{ts}_{os.getpid()}{ext}")

def _atomic_write_text(path: Path, text: str):
    """Write *text* (UTF-8) to *path* via a temp file and an atomic replace.

    If writing or replacing fails, the temp file is removed before the
    original exception is re-raised, so no hidden partial file is left
    behind in the target directory.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = _tmp_path(path, ".tmp")
    try:
        tmp.write_text(text, encoding="utf-8")
        _atomic_replace(tmp, path)
    except BaseException:
        # Best-effort cleanup; never mask the original error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

def _atomic_write_json(path: Path, obj: dict):
    """Serialise *obj* as indented UTF-8 JSON and write it to *path* atomically.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). If
    serialising, writing or replacing fails, the temp file is removed before
    the original exception is re-raised.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = _tmp_path(path, ".tmp")
    try:
        tmp.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
        _atomic_replace(tmp, path)
    except BaseException:
        # Best-effort cleanup; never mask the original error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

def _read_json(path: Path) -> dict | None:
    try:
        if path.exists():
            return json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        pass
    return None

def _atomic_write_df_csv(df: pd.DataFrame, path: Path):
    """Write *df* to *path* as CSV (no index, UTF-8 with BOM) atomically.

    The frame is written to a temp file in the same directory, which then
    replaces *path*. If writing or replacing fails, the temp file is removed
    before the original exception is re-raised.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = _tmp_path(path, ".csv")
    try:
        df.to_csv(tmp, index=False, encoding="utf-8-sig")
        _atomic_replace(tmp, path)
    except BaseException:
        # Best-effort cleanup; never mask the original error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

def _safe_read_csv(path: Path) -> pd.DataFrame:
    """Best-effort CSV load.

    Returns the content of *path* read as UTF-8 (BOM tolerated), or an empty
    DataFrame when the file is missing or any error occurs while reading it.
    """
    try:
        if not path.exists():
            return pd.DataFrame()
        return pd.read_csv(path, encoding="utf-8-sig")
    except Exception:
        return pd.DataFrame()

def _atomic_save_npz_array(path: Path, **arrays):
    """Save the keyword *arrays* to *path* as a compressed ``.npz`` atomically.

    The archive is written to a temp file in the same directory, which then
    replaces *path*. If saving or replacing fails, the temp file is removed
    before the original exception is re-raised.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = _tmp_path(path, ".npz")
    try:
        np.savez_compressed(tmp, **arrays)
        _atomic_replace(tmp, path)
    except BaseException:
        # Best-effort cleanup; never mask the original error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

def _atomic_save_sparse_npz(path: Path, mat: sp.spmatrix):
    """Save sparse matrix *mat* to *path* with ``scipy.sparse.save_npz`` atomically.

    The matrix is written to a temp file in the same directory, which then
    replaces *path*. If saving or replacing fails, the temp file is removed
    before the original exception is re-raised.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = _tmp_path(path, ".npz")
    try:
        sp.save_npz(tmp, mat)
        _atomic_replace(tmp, path)
    except BaseException:
        # Best-effort cleanup; never mask the original error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

def _make_seeds(base_seed: int, n: int):
    """Return *n* reproducible seeds derived from *base_seed*.

    Seeds are Python ints drawn from ``[0, 2**31 - 1)`` by a RandomState
    initialised with *base_seed*.
    """
    generator = np.random.RandomState(int(base_seed))
    draws = generator.randint(0, 2**31 - 1, size=int(n), dtype=np.int64)
    return list(map(int, draws.tolist()))

def _fingerprint(cfg: dict, *, features: list[str], knn: int, k: int, emb_dims: int) -> str:
    """Return an MD5 hex digest identifying one run configuration.

    The digest covers the feature list, knn, k, emb_dims and the DATA, GRAPH,
    SEEDS and MNCUT sections of *cfg*, serialised as key-sorted JSON.
    """
    payload = {
        "features": list(features),
        "knn": int(knn),
        "k": int(k),
        "emb_dims": int(emb_dims),
    }
    for out_key, cfg_key in (("data", "DATA"), ("graph", "GRAPH"), ("seeds", "SEEDS"), ("mncut", "MNCUT")):
        payload[out_key] = dict(cfg[cfg_key])

    serialized = json.dumps(payload, sort_keys=True, ensure_ascii=False)
    return hashlib.md5(serialized.encode("utf-8")).hexdigest()

def _feature_tag(feature_names: list[str], *, all_feature_names: list[str] | None = None) -> str:
    """
    Short, stable tag for filenames.
    - If all features selected -> ALL
    - Else -> F{n}_{hash8}
    """
    if all_feature_names is not None and set(feature_names) == set(all_feature_names):
        return "ALL"
    s = "||".join(sorted(feature_names))
    h = hashlib.md5(s.encode("utf-8")).hexdigest()[:8]
    return f"F{len(feature_names)}_{h}"

def load_verbs(verb_list_csv: Path):
    """Load the verb vocabulary from the first column of a headerless CSV.

    Cells are stripped of BOM characters and surrounding whitespace. Missing
    cells and empty strings are dropped, and duplicates are removed keeping
    the first occurrence, so every verb has exactly one index.

    Returns:
        (verbs, vid): the list of verbs in file order and a dict mapping
        each verb to its position in that list.
    """
    column = pd.read_csv(verb_list_csv, header=None, encoding="utf-8-sig")[0]
    cleaned = (
        column.dropna()  # otherwise astype(str) turns missing cells into the verb "nan"
        .astype(str)
        .str.replace("\ufeff", "", regex=False)
        .str.strip()
    )
    cleaned = cleaned[cleaned != ""]
    # De-duplicate in first-seen order. With duplicates, `vid` would keep only
    # the last index while len(verbs) still counted every copy.
    verbs = list(dict.fromkeys(cleaned.tolist()))
    vid = {v: i for i, v in enumerate(verbs)}
    return verbs, vid

def read_verb_context_matrix(path: Path, *, vid: dict, V: int,
                            verb_col: str, ctx_col: str,
                            use_col: str, min_count: float, chunk: int,
                            weight_transform: str):
    """Build a sparse verb-by-context matrix from a pair-count CSV.

    The CSV is read in chunks of *chunk* rows. Rows with missing values, with
    ``use_col < min_count`` or with a verb absent from *vid* are ignored.
    Context words get column ids in order of first appearance. Weights come
    from *use_col*, optionally transformed ("raw", "log1p" or "sqrt").

    Returns:
        CSR float32 matrix of shape (V, number of distinct context words).

    Raises:
        FileNotFoundError: if *path* does not exist.
        ValueError: on an unknown *weight_transform*.
        RuntimeError: if no row survives the filtering.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    context_ids = {}
    row_chunks, col_chunks, weight_chunks = [], [], []
    wanted_cols = [verb_col, ctx_col, use_col]

    reader = pd.read_csv(path, usecols=wanted_cols, chunksize=int(chunk), encoding="utf-8-sig")
    for frame in tqdm(reader, desc=f"Reading {path.name}"):
        frame = frame.dropna()
        if frame.empty:
            continue
        frame = frame[frame[use_col] >= min_count]
        if frame.empty:
            continue

        # Map verbs to row ids; verbs unknown to `vid` become NaN and are masked out.
        verb_ids = frame[verb_col].astype(str).map(vid)
        known = verb_ids.notna()
        if not known.any():
            continue

        row_idx = verb_ids[known].astype(np.int32).to_numpy()
        contexts = frame.loc[known, ctx_col].astype(str).to_numpy()
        weights = frame.loc[known, use_col].to_numpy(np.float32)

        if weight_transform == "log1p":
            weights = np.log1p(weights).astype(np.float32)
        elif weight_transform == "sqrt":
            weights = np.sqrt(weights).astype(np.float32)
        elif weight_transform != "raw":
            raise ValueError(f"Unknown weight_transform: {weight_transform}")

        # Assign column ids to context words in order of first appearance.
        col_idx = np.empty_like(row_idx, dtype=np.int32)
        for position, word in enumerate(contexts):
            col_idx[position] = context_ids.setdefault(word, len(context_ids))

        row_chunks.append(row_idx)
        col_chunks.append(col_idx)
        weight_chunks.append(weights)

    if not row_chunks:
        raise RuntimeError(f"No usable edges in {path.name}.")

    all_rows = np.concatenate(row_chunks)
    all_cols = np.concatenate(col_chunks)
    all_weights = np.concatenate(weight_chunks)

    shape = (V, len(context_ids))
    X = sp.coo_matrix((all_weights, (all_rows, all_cols)), shape=shape, dtype=np.float32).tocsr()
    X.sum_duplicates()
    return X

def read_verb_verb_matrix(path: Path, *, vid: dict, V: int,
                          prefix_col: str, suffix_col: str,
                          use_col: str, min_count: float, chunk: int,
                          weight_transform: str):
    """Build a sparse V x V verb-verb matrix from a pair-count CSV.

    The CSV is read in chunks of *chunk* rows. Rows with missing values, with
    ``use_col < min_count`` or with a prefix or suffix absent from *vid* are
    ignored. Weights come from *use_col*, optionally transformed ("raw",
    "log1p" or "sqrt"). The diagonal is cleared before returning.

    Returns:
        CSR float32 matrix of shape (V, V), rows = prefix, columns = suffix.

    Raises:
        FileNotFoundError: if *path* does not exist.
        ValueError: on an unknown *weight_transform*.
        RuntimeError: if no row survives the filtering.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    row_chunks, col_chunks, weight_chunks = [], [], []
    wanted_cols = [prefix_col, suffix_col, use_col]

    reader = pd.read_csv(path, usecols=wanted_cols, chunksize=int(chunk), encoding="utf-8-sig")
    for frame in tqdm(reader, desc=f"Reading {path.name}"):
        frame = frame.dropna()
        if frame.empty:
            continue
        frame = frame[frame[use_col] >= min_count]
        if frame.empty:
            continue

        # Both ends of the pair must be known verbs.
        prefix_ids = frame[prefix_col].astype(str).map(vid)
        suffix_ids = frame[suffix_col].astype(str).map(vid)
        known = prefix_ids.notna() & suffix_ids.notna()
        if not known.any():
            continue

        row_idx = prefix_ids[known].astype(np.int32).to_numpy()
        col_idx = suffix_ids[known].astype(np.int32).to_numpy()
        weights = frame.loc[known, use_col].to_numpy(np.float32)

        if weight_transform == "log1p":
            weights = np.log1p(weights).astype(np.float32)
        elif weight_transform == "sqrt":
            weights = np.sqrt(weights).astype(np.float32)
        elif weight_transform != "raw":
            raise ValueError(f"Unknown weight_transform: {weight_transform}")

        row_chunks.append(row_idx)
        col_chunks.append(col_idx)
        weight_chunks.append(weights)

    if not row_chunks:
        raise RuntimeError(f"No usable edges in {path.name}.")

    all_rows = np.concatenate(row_chunks)
    all_cols = np.concatenate(col_chunks)
    all_weights = np.concatenate(weight_chunks)

    A = sp.coo_matrix((all_weights, (all_rows, all_cols)), shape=(V, V), dtype=np.float32).tocsr()
    A.sum_duplicates()
    # Remove self-pairs, then drop the explicit zeros this leaves behind.
    A.setdiag(0)
    A.eliminate_zeros()
    return A

def apply_verbverb_mode(A: sp.csr_matrix, mode: str):
    """Select the direction of verb-verb features.

    - "out":  *A* itself.
    - "in":   the transpose of *A* as CSR.
    - "both": *A* and its transpose stacked side by side as CSR.

    Raises:
        ValueError: for any other *mode*.
    """
    if mode not in ("out", "in", "both"):
        raise ValueError("VERBVERB_MODE must be out/in/both")
    if mode == "out":
        return A
    transposed = A.T
    if mode == "in":
        return transposed.tocsr()
    return sp.hstack([A, transposed], format="csr")

def default_feature_specs(base_dir: str):
    """Return the table of known co-occurrence feature files under ``base_dir``.

    Keys are feature names; every value holds the CSV ``path`` (as a string)
    and a ``kind``. ``verbverb`` entries are verb-to-verb pair files;
    ``verbctx`` entries additionally name the ``verb_col`` and ``ctx_col``
    columns to read. Insertion order is: the six verb-verb features, then the
    four ``<ctx>-verb`` features, then the four ``verb-<ctx>`` features.
    """
    root = Path(base_dir)

    def pairs_csv(feature: str) -> str:
        return str(root / f"Thaisum_{feature}_pairs.csv")

    verbverb_features = (
        "verb-verb",
        "verb-(noun)-verb",
        "verb-(verb)-verb",
        "verb-(adjective)-verb",
        "verb-(adverb)-verb",
        "verb-(pronoun)-verb",
    )
    context_classes = ("noun", "adjective", "adverb", "pronoun")

    specs = {
        feature: {"path": pairs_csv(feature), "kind": "verbverb"}
        for feature in verbverb_features
    }
    # Context-before-verb files first, then verb-before-context files.
    for template in ("{ctx}-verb", "verb-{ctx}"):
        for ctx in context_classes:
            feature = template.format(ctx=ctx)
            specs[feature] = {
                "path": pairs_csv(feature),
                "kind": "verbctx",
                "verb_col": "verb",
                "ctx_col": ctx,
            }
    return specs

def build_graph_and_embedding(X: sp.csr_matrix, *, cfg, knn: int, emb_dims: int, ui: ProgressUI | None = None):
    """Build a KNN affinity graph over the non-empty rows of ``X`` and embed it.

    Args:
        X: feature matrix with one row per verb.
        cfg: config dict; reads ``cfg["GRAPH"]`` (MIN_ACTIVE_VERBS, KNN_METRIC,
            N_JOBS, SIM_MIN, MAKE_SYMMETRIC, DIAG_EPS) and
            ``cfg["SEEDS"]["EMB_RANDOM_STATE"]``.
        knn: requested number of neighbours (capped at ``n - 1``).
        emb_dims: requested embedding dimension (capped at ``n - 2``).
        ui: optional progress reporter.

    Returns:
        ``(active_ids, Z, knn_idx, knn_sim, W_diag0, emb_dim_used)`` where
        ``active_ids`` are the indices of rows of ``X`` with stored entries,
        ``Z`` is the row-normalized spectral embedding of those rows,
        ``knn_idx`` / ``knn_sim`` are the full neighbour arrays (column 0
        included), and ``W_diag0`` is the affinity matrix with its diagonal
        cleared.

    Raises:
        RuntimeError: too few active rows, no usable neighbours, or fewer
            than 2 embedding dimensions available.
    """
    if ui: ui.step_run(0, "Finding active verbs...")
    # A row is "active" when it has at least one stored entry.
    active = X.getnnz(axis=1) > 0
    active_ids = np.where(active)[0].astype(np.int32)
    X_act = X[active]
    n = X_act.shape[0]
    if n < int(cfg["GRAPH"]["MIN_ACTIVE_VERBS"]):
        raise RuntimeError(f"too_few_active (n={n})")

    if ui: ui.step_run(0, "Normalizing rows...")
    # NOTE(review): `normalize` is imported outside this chunk — presumably
    # sklearn.preprocessing.normalize (L2 by default); confirm.
    Xn = normalize(X_act, axis=1)

    k_eff = int(min(int(knn), n - 1))
    if k_eff < 1:
        raise RuntimeError("too_few_active_neighbors")

    if ui: ui.step_run(0, f"Building KNN graph (k={k_eff})...")
    # One extra neighbour is requested because column 0 is discarded below.
    nn = NNDescent(
        Xn,
        n_neighbors=int(k_eff + 1),
        metric=str(cfg["GRAPH"]["KNN_METRIC"]),
        random_state=int(cfg["SEEDS"]["EMB_RANDOM_STATE"]),
        n_jobs=int(cfg["GRAPH"]["N_JOBS"]),
    )
    knn_idx, knn_dist = nn.neighbor_graph
    # NOTE(review): 1 - distance is only a similarity for metrics such as
    # cosine; revisit if KNN_METRIC is changed.
    knn_sim = 1.0 - knn_dist

    # Flatten neighbour columns 1..k_eff into (row, col, weight) triplets.
    # Column 0 is skipped — presumably the query point itself; TODO confirm
    # the index always returns self first.
    I = np.repeat(np.arange(n, dtype=np.int32), k_eff)
    J = knn_idx[:, 1:k_eff+1].reshape(-1).astype(np.int32)
    S = knn_sim[:, 1:k_eff+1].reshape(-1).astype(np.float32)

    # Keep only edges whose similarity is strictly above SIM_MIN.
    pos = S > float(cfg["GRAPH"]["SIM_MIN"])
    I, J, S = I[pos], J[pos], S[pos]

    if ui: ui.step_run(0, "Building symmetric affinity matrix...")
    W = sp.coo_matrix((S, (I, J)), shape=(n, n), dtype=np.float32).tocsr()
    if cfg["GRAPH"]["MAKE_SYMMETRIC"]:
        # W + W.T: an edge present in both directions gets the sum of both weights.
        W = (W + W.T).tocsr()
    W.sum_duplicates()

    # Diagonal-free copy returned to the caller for graph metrics.
    W_diag0 = W.copy().tocsr()
    W_diag0.setdiag(0)
    W_diag0.eliminate_zeros()

    # The embedding uses W plus a small constant DIAG_EPS on the diagonal.
    diag_eps = float(cfg["GRAPH"]["DIAG_EPS"])
    W_emb = (W + sp.eye(n, dtype=np.float32) * diag_eps).tocsr()

    emb_dim_used = int(min(int(emb_dims), n - 2))
    if emb_dim_used < 2:
        raise RuntimeError(f"n too small for embedding: n={n}, emb_dim_used={emb_dim_used}")

    if ui: ui.step_run(0, f"Spectral embedding (dim={emb_dim_used})...")
    Z = SpectralEmbedding(
        n_components=emb_dim_used,
        affinity="precomputed",
        random_state=int(cfg["SEEDS"]["EMB_RANDOM_STATE"]),
    ).fit_transform(W_emb)

    Z = normalize(Z, axis=1)
    return active_ids, Z, knn_idx, knn_sim, W_diag0, emb_dim_used

def _load_embed_cache(embed_npz: Path, wdiag_npz: Path, expected_fp: str):
    """Load a cached embedding if it exists and matches ``expected_fp``.

    Args:
        embed_npz: archive holding ``active_ids``, ``Z``, ``knn_idx``,
            ``knn_sim`` and ``emb_dim_used``; its sidecar
            ``<embed_npz>.meta.json`` holds the fingerprint.
        wdiag_npz: sparse archive holding the diagonal-free affinity matrix.
        expected_fp: fingerprint the cache must have been written with.

    Returns:
        ``(active_ids, Z, knn_idx, knn_sim, W_diag0, emb_dim_used)``, or
        ``None`` when a file is missing, the fingerprint differs, or the
        archives cannot be read (the caller then rebuilds the embedding).
    """
    if not (embed_npz.exists() and wdiag_npz.exists()):
        return None
    meta = _read_json(embed_npz.with_suffix(".meta.json"))
    if not meta or meta.get("fingerprint") != expected_fp:
        return None
    try:
        # Context manager closes the underlying zip file; the arrays are
        # materialized by astype() before the archive is closed.
        with np.load(embed_npz, allow_pickle=False) as npz:
            active_ids = npz["active_ids"].astype(np.int32)
            Z = npz["Z"].astype(np.float32)
            knn_idx = npz["knn_idx"].astype(np.int32)
            knn_sim = npz["knn_sim"].astype(np.float32)
            emb_dim_used = int(npz["emb_dim_used"][0])
        W_diag0 = sp.load_npz(wdiag_npz).tocsr()
    except Exception:
        # Deliberate best-effort: any unreadable/corrupt cache is a cache miss.
        return None
    return active_ids, Z, knn_idx, knn_sim, W_diag0, emb_dim_used

def _save_embed_cache(embed_npz: Path, wdiag_npz: Path, *, fp: str,
                      active_ids, Z, knn_idx, knn_sim, emb_dim_used, W_diag0):
    """Persist an embedding so a later run with the same fingerprint can reuse it.

    Writes, in order: the dense arrays to ``embed_npz``, the sparse affinity
    matrix to ``wdiag_npz``, and finally the fingerprint to the
    ``<embed_npz>.meta.json`` sidecar.
    """
    dense_arrays = {
        "active_ids": np.asarray(active_ids, dtype=np.int32),
        "Z": np.asarray(Z, dtype=np.float32),
        "knn_idx": np.asarray(knn_idx, dtype=np.int32),
        "knn_sim": np.asarray(knn_sim, dtype=np.float32),
        # Stored as a 1-element array so it can live inside the npz archive.
        "emb_dim_used": np.asarray([int(emb_dim_used)], dtype=np.int32),
    }
    _atomic_save_npz_array(embed_npz, **dense_arrays)
    _atomic_save_sparse_npz(wdiag_npz, W_diag0)

    # The fingerprint sidecar is written last, after both archives.
    meta_path = embed_npz.with_suffix(".meta.json")
    _atomic_write_json(meta_path, {"fingerprint": fp})

def mncut_discretize(Vk, *, k, seed, cfg):
    """Discretize a ``k``-dimensional embedding into ``k`` cluster labels.

    Starting from a random orthogonal rotation, the loop alternates between
    (1) assigning each row to the column with the largest rotated value and
    (2) re-fitting the rotation to those assignments via an SVD. It stops
    when the fraction of changed labels is at most ``TOL`` or after
    ``MAX_ITER`` iterations.

    Args:
        Vk: array of shape ``(n, k)``.
        k: number of clusters; must equal ``Vk.shape[1]``.
        seed: seed for the random initial rotation (and the repair fallback).
        cfg: config dict; reads ``cfg["MNCUT"]["MAX_ITER"]`` and
            ``cfg["MNCUT"]["TOL"]``.

    Returns:
        int32 label array of length ``n``.

    Raises:
        ValueError: ``Vk`` does not have exactly ``k`` columns.
    """
    hp = cfg["MNCUT"]
    k = int(k)
    Vk = np.asarray(Vk, dtype=np.float64)
    n, kk = Vk.shape
    if kk != k:
        raise ValueError("MNCut discretize expects Vk shape (n, k) where k == n_clusters")

    # Row-normalize; the epsilon keeps all-zero rows from dividing by zero.
    Vn = Vk / (np.linalg.norm(Vk, axis=1, keepdims=True) + 1e-12)

    # Random orthogonal start: Q factor of a Gaussian matrix.
    rng = np.random.RandomState(int(seed))
    R, _ = np.linalg.qr(rng.normal(size=(k, k)))

    tol = float(hp["TOL"])
    # Always run at least one assignment pass: with MAX_ITER <= 0 the loop
    # body would never execute and `labels` would be unbound at the return.
    max_iter = max(1, int(hp["MAX_ITER"]))
    row_ids = np.arange(n)

    labels_prev = None
    for _it in range(max_iter):
        Y = Vn @ R
        labels = np.argmax(Y, axis=1).astype(np.int32)

        # Repair empty clusters by handing each one the not-yet-used row whose
        # top-two scores are closest (its assignment is the least certain).
        counts = np.bincount(labels, minlength=k)
        empties = np.where(counts == 0)[0]
        if empties.size:
            part = np.partition(Y, -2, axis=1)
            margin = part[:, -1] - part[:, -2]
            order = np.argsort(margin)
            used = set()
            ptr = 0
            # NOTE(review): a donor row may be the only member of its own
            # cluster, in which case that cluster becomes empty instead.
            for ek in empties.tolist():
                while ptr < n and order[ptr] in used:
                    ptr += 1
                ridx = int(order[ptr]) if ptr < n else int(rng.randint(0, n))
                used.add(ridx)
                labels[ridx] = int(ek)

        if labels_prev is not None:
            change = float(np.mean(labels != labels_prev))
            if change <= tol:
                break
        labels_prev = labels.copy()

        # One-hot indicator matrix of the current assignment.
        E = np.zeros((n, k), dtype=np.float64)
        E[row_ids, labels] = 1.0

        # Orthogonal R maximizing trace(E^T Vn R): with E^T Vn = U S Vt,
        # the maximizer is Vt^T U^T.
        M = E.T @ Vn
        U, _, Vt = np.linalg.svd(M, full_matrices=False)
        R = Vt.T @ U.T

    return labels.astype(np.int32)

def _export_clusters_one_csv(
    out_dir: Path,
    *,
    verbs: list[str],
    V: int,
    active_ids: np.ndarray,
    seeds: list[int],
    labels_npz_fn,
    filename: str = "clusters.csv",
):
    """Write one CSV with a ``verb`` column and one ``cluster_<i>`` column per seed.

    For each seed the label checkpoint returned by ``labels_npz_fn(seed)`` is
    loaded; its labels are scattered to positions ``active_ids`` of a
    length-``V`` vector whose remaining entries are ``-1``.

    Args:
        out_dir: directory that receives the CSV.
        verbs: verb strings, one per row.
        V: total number of verbs.
        active_ids: row indices the checkpointed labels refer to.
        seeds: seeds to export, in column order.
        labels_npz_fn: callable mapping a seed to its ``.npz`` checkpoint path.
        filename: name of the CSV inside ``out_dir``.

    Raises:
        RuntimeError: a seed's label checkpoint does not exist.
    """
    out_dir = _ensure_dir(out_dir)
    clusters_csv = out_dir / filename

    # Collect all columns first and build the frame once; inserting ~100
    # columns one by one fragments the DataFrame (pandas PerformanceWarning).
    columns = {"verb": verbs}

    for i, seed in enumerate(seeds):
        p = labels_npz_fn(int(seed))
        if not p.exists():
            raise RuntimeError(f"Missing labels checkpoint for seed={seed}: {p}")

        # Close the archive as soon as the labels are copied out.
        with np.load(p, allow_pickle=False) as z:
            labels_active = z["labels"].astype(np.int32)

        full_labels = np.full(V, -1, dtype=np.int32)
        full_labels[active_ids] = labels_active

        columns[f"cluster_{i}"] = full_labels

    df = pd.DataFrame(columns)
    _atomic_write_df_csv(df, clusters_csv)

def _assemble_matrix(feature_names: list[str], mats: dict[str, sp.csr_matrix]):
    """Concatenate the selected feature matrices column-wise, in the given order.

    A single selected feature is returned as-is (no copy); several are
    horizontally stacked into a new CSR matrix.
    """
    selected = [mats[name] for name in feature_names]
    if len(selected) == 1:
        (only,) = selected
        return only
    return sp.hstack(selected, format="csr")

def run_one_subset_resumable(
    *,
    out_dir: Path,
    verbs: list[str],
    V: int,
    X: sp.csr_matrix,
    feature_names: list[str],
    cfg: dict,
    knn: int,
    n_clusters: int,
    emb_dims: int,
    feats_tag_override: str | None = None,
    ui: ProgressUI | None = None
) -> dict:
    """Run (or resume) MNCut clustering for one feature subset over all seeds.

    Checkpoints live under ``out_dir/_cache``: the embedding, a STATE json,
    a per-seed metrics CSV, one label ``.npz`` per seed and a DONE json.
    A seed counts as finished only when it has BOTH a metrics row and a
    readable label checkpoint; every other seed is (re)computed.

    Args:
        out_dir: output directory (receives ``clusters.csv`` and ``_cache``).
        verbs: verb strings, one per row of ``X``.
        V: total number of verbs.
        X: assembled feature matrix.
        feature_names: names of the features that make up ``X``.
        cfg: config dict (reads ``IO``, ``SEEDS`` and is passed to helpers).
        knn: number of neighbours for the KNN graph.
        n_clusters: requested number of clusters.
        emb_dims: requested embedding dimension.
        feats_tag_override: optional feature tag used in the run id.
        ui: optional progress reporter.

    Returns:
        Summary row (dict) produced by ``_summarize_from_seed_df``.

    Raises:
        RuntimeError: checkpoints exist for a different config, DONE exists
            but label checkpoints are missing, or the clustering parameters
            are infeasible for the number of active verbs.
    """
    out_dir = _ensure_dir(out_dir)

    feats_tag = feats_tag_override or _feature_tag(feature_names)
    feats_tag = _safe_name(feats_tag)

    run_id = _safe_name(f"{cfg['IO']['RUN_STAMP']}_mncut_knn{knn}_k{n_clusters}_emb{emb_dims}_{feats_tag}")

    fp = _fingerprint(cfg, features=feature_names, knn=knn, k=n_clusters, emb_dims=emb_dims)

    cache_dir = out_dir / "_cache"
    embed_npz = cache_dir / f"{run_id}__EMBED.npz"
    wdiag_npz = cache_dir / f"{run_id}__Wdiag0.npz"
    state_json = cache_dir / f"{run_id}__STATE.json"
    seed_metrics_csv = cache_dir / f"{run_id}__seed_metrics.csv"
    done_json = cache_dir / f"{run_id}__DONE.json"

    n_seeds = int(cfg["SEEDS"]["N_SEED_RUNS"])
    seeds = _make_seeds(int(cfg["SEEDS"]["BASE_SEED"]), n_seeds)

    labels_dir = cache_dir / "_labels" / run_id
    labels_dir.mkdir(parents=True, exist_ok=True)

    def labels_npz(seed: int) -> Path:
        return labels_dir / f"labels_seed{seed}.npz"

    def load_labels(seed: int):
        """Return the checkpointed labels for ``seed``, or None if missing/unreadable."""
        p = labels_npz(int(seed))
        if not p.exists():
            return None
        try:
            with np.load(p, allow_pickle=False) as z:
                return z["labels"].astype(np.int32)
        except Exception:
            # Best-effort: an unreadable checkpoint means the seed is re-run.
            return None

    def json_number(x):
        """Finite float, or None (so the JSON file never contains NaN/Infinity)."""
        x = float(x)
        return x if np.isfinite(x) else None

    active_ids_fast = np.where(X.getnnz(axis=1) > 0)[0].astype(np.int32)

    # Fast path: the run already finished with this exact configuration.
    if done_json.exists():
        saved = _read_json(done_json) or {}
        if saved.get("fingerprint") == fp:
            clusters_csv = out_dir / "clusters.csv"
            if not clusters_csv.exists():
                missing = [s for s in seeds if not labels_npz(int(s)).exists()]
                if missing:
                    raise RuntimeError(f"DONE.json exists but missing label checkpoints for seeds: {missing}")
                _export_clusters_one_csv(
                    out_dir,
                    verbs=verbs, V=V, active_ids=active_ids_fast,
                    seeds=[int(s) for s in seeds],
                    labels_npz_fn=labels_npz,
                    filename="clusters.csv",
                )

            df_seed = _safe_read_csv(seed_metrics_csv)
            if not df_seed.empty:
                # Reuse the NMI stability stored at completion time so a
                # re-run does not replace it with NaN in the summary. DONE
                # files written before this field existed yield None -> NaN.
                return _summarize_from_seed_df(
                    run_id, feature_names, knn, n_clusters, emb_dims, df_seed, fp, cfg,
                    saved.get("nmi_mean"), saved.get("nmi_std"),
                )

    st = _read_json(state_json)
    if st and st.get("fingerprint") and st.get("fingerprint") != fp:
        raise RuntimeError(
            "Found existing checkpoints for this run_id, but config changed.\n"
            "Use a new OUT_DIR or change RUN_STAMP (or delete _cache for this run_id)."
        )

    if ui:
        ui.start_run(2 + n_seeds, f"Run: {run_id}")

    cached = _load_embed_cache(embed_npz, wdiag_npz, fp)
    if cached is not None:
        active_ids, Z, knn_idx, knn_sim, W_diag0, emb_dim_used = cached
        if ui: ui.step_run(1, "Loaded cached embedding.")
    else:
        active_ids, Z, knn_idx, knn_sim, W_diag0, emb_dim_used = build_graph_and_embedding(
            X, cfg=cfg, knn=knn, emb_dims=emb_dims, ui=ui
        )
        _save_embed_cache(embed_npz, wdiag_npz, fp=fp,
                          active_ids=active_ids, Z=Z,
                          knn_idx=knn_idx, knn_sim=knn_sim,
                          emb_dim_used=emb_dim_used, W_diag0=W_diag0)
        if ui: ui.step_run(1, "Saved embedding cache.")

    n = Z.shape[0]
    k_used = int(min(int(n_clusters), n - 1))
    if k_used < 2:
        raise RuntimeError("k_used<2")
    if k_used > Z.shape[1]:
        raise RuntimeError(f"MNCut needs emb_dims >= k_used (got emb={Z.shape[1]}, k_used={k_used})")

    _atomic_write_json(state_json, {
        "run_id": run_id,
        "fingerprint": fp,
        "features": list(feature_names),
        "knn": int(knn),
        "k": int(n_clusters),
        "k_used": int(k_used),
        "emb_dims": int(emb_dims),
        "emb_dim_used": int(emb_dim_used),
        "n_seeds": int(n_seeds),
        "seeds": list(seeds),
        "updated": _now_str(),
    })

    df_seed = _safe_read_csv(seed_metrics_csv)
    recorded = set(df_seed["seed"].astype(int).tolist()) if ("seed" in df_seed.columns and not df_seed.empty) else set()

    # Only seeds with a metrics row AND a readable label file are finished.
    # Labels of any other seed are NOT preloaded: that seed is recomputed
    # below and its labels appended then, so each seed contributes exactly
    # one labeling to the NMI stability estimate.
    labels_list = []
    done_seeds = set()
    for s in seeds:
        if int(s) not in recorded:
            continue
        lab = load_labels(int(s))
        if lab is None:
            continue
        labels_list.append(lab)
        done_seeds.add(int(s))

    remaining = [s for s in seeds if int(s) not in done_seeds]

    if ui: ui.step_run(0, f"Seeds remaining: {len(remaining)}/{len(seeds)}")

    Zk = normalize(Z[:, :k_used], axis=1)  # compute once

    for si, seed in enumerate(remaining, start=1):
        if ui: ui.step_run(0, f"MNCut seed {si}/{len(remaining)}...")

        labels = mncut_discretize(Zk, k=k_used, seed=int(seed), cfg=cfg).astype(np.int32)

        _atomic_save_npz_array(labels_npz(int(seed)), labels=labels)

        purity = _neighbor_purity_from_knn(knn_idx, knn_sim, labels)
        Q = _modularity_Q(W_diag0, labels)
        ncut = _multiway_ncut(W_diag0, labels)
        row = {
            "run_id": run_id,
            "seed": int(seed),
            "active_verbs": int(n),
            "k_used": int(k_used),
            "emb_dim_used": int(emb_dim_used),
            "neighbor_purity": float(purity),
            "modularity_Q": float(Q),
            "ncut": float(ncut),
            "score_purity_plus_Q": float(purity + Q),
        }
        # A seed being re-run (its label file was lost/corrupt) may still have
        # an old metrics row; drop it so the seed is not counted twice.
        if "seed" in df_seed.columns and not df_seed.empty:
            df_seed = df_seed[df_seed["seed"].astype(int) != int(seed)]
        df_seed = pd.concat([df_seed, pd.DataFrame([row])], ignore_index=True)
        df_seed = df_seed.sort_values(["seed"]).reset_index(drop=True)
        _atomic_write_df_csv(df_seed, seed_metrics_csv)

        labels_list.append(labels.copy())
        if ui: ui.step_run(1)

    nmi_mean, nmi_std = _nmi_stability(labels_list, cfg)

    _atomic_write_json(done_json, {
        "done": True,
        "when": _now_str(),
        "fingerprint": fp,
        "nmi_mean": json_number(nmi_mean),
        "nmi_std": json_number(nmi_std),
    })

    _export_clusters_one_csv(
        out_dir,
        verbs=verbs,
        V=V,
        active_ids=active_ids,
        seeds=[int(s) for s in seeds],
        labels_npz_fn=labels_npz,
        filename="clusters.csv",
    )

    return _summarize_from_seed_df(run_id, feature_names, knn, n_clusters, emb_dims, df_seed, fp, cfg, nmi_mean, nmi_std)

def _neighbor_purity_from_knn(knn_idx, knn_sim, labels):
    """Mean similarity-weighted share of each point's neighbours in its own cluster.

    For every row, neighbours equal to the row index are ignored and negative
    similarities are clipped to 0. Rows with no remaining neighbours or a
    non-positive total weight are excluded from the mean.
    """
    count = labels.shape[0]
    # Rows that cannot be scored stay NaN and are skipped by nanmean.
    scores = np.full(count, np.nan, dtype=np.float64)

    for row in range(count):
        not_self = knn_idx[row] != row
        neighbours = knn_idx[row][not_self]
        if neighbours.size == 0:
            continue

        weights = np.maximum(knn_sim[row][not_self], 0.0)
        total = float(weights.sum())
        if total <= 0:
            continue

        same_cluster = labels[neighbours] == labels[row]
        scores[row] = float(weights[same_cluster].sum()) / total

    return float(np.nanmean(scores))

def _modularity_Q(W_diag0, labels):
    """Weighted modularity of ``labels`` on the graph ``W_diag0``.

    Each cluster contributes ``internal / 2m - (volume / 2m) ** 2`` where
    ``2m`` is the sum of all entries of ``W_diag0``, ``internal`` the sum of
    entries inside the cluster block, and ``volume`` the sum of its row sums.
    Returns NaN when the graph has no positive total weight.
    """
    degrees = np.asarray(W_diag0.sum(axis=1)).ravel().astype(np.float64)
    half_total = float(W_diag0.sum() / 2.0)
    if half_total <= 0:
        return np.nan

    norm = 2.0 * half_total
    score = 0.0
    for cluster in np.unique(labels):
        members = (labels == cluster)
        if not members.any():
            continue
        volume = float(degrees[members].sum())
        # Sum over the cluster's own block; each undirected edge appears twice.
        within = float(W_diag0[members][:, members].sum())
        score += (within / norm) - (volume / norm) ** 2
    return float(score)

def _multiway_ncut(W_diag0, labels):
    """Multiway normalized cut: sum over clusters of ``cut / volume``.

    ``volume`` is the sum of the cluster's row sums and ``cut`` is the volume
    minus the weight inside the cluster block. Clusters with non-positive
    volume contribute nothing.
    """
    degrees = np.asarray(W_diag0.sum(axis=1)).ravel().astype(np.float64)

    total = 0.0
    for cluster in np.unique(labels):
        members = (labels == cluster)
        if not np.any(members):
            continue
        volume = float(degrees[members].sum())
        if volume <= 0:
            continue
        within = float(W_diag0[members][:, members].sum())
        total += (volume - within) / volume
    return float(total)

def _nmi_stability(labels_list, cfg):
    """Mean and std of pairwise NMI between the labelings in ``labels_list``.

    With fewer than two labelings ``(nan, nan)`` is returned. When
    ``cfg["METRICS"]["NMI_MAX_PAIRS"]`` is set and smaller than the number of
    pairs, that many pairs are sampled without replacement using a generator
    seeded with ``BASE_SEED + 999``.
    """
    n_runs = len(labels_list)
    if n_runs < 2:
        return (np.nan, np.nan)

    limit = cfg["METRICS"]["NMI_MAX_PAIRS"]

    all_pairs = []
    for a in range(n_runs):
        for b in range(a + 1, n_runs):
            all_pairs.append((a, b))

    if limit is not None and len(all_pairs) > int(limit):
        sampler = np.random.RandomState(int(cfg["SEEDS"]["BASE_SEED"]) + 999)
        picked = sampler.choice(len(all_pairs), size=int(limit), replace=False)
        all_pairs = [all_pairs[t] for t in picked.tolist()]

    scores = np.array(
        [
            normalized_mutual_info_score(labels_list[a], labels_list[b], average_method="arithmetic")
            for a, b in all_pairs
        ],
        dtype=np.float64,
    )
    return float(np.mean(scores)), float(np.std(scores))

def _summarize_from_seed_df(run_id, feature_names, knn, n_clusters, emb_dims, df_seed, fp, cfg, nmi_mean=None, nmi_std=None):
    """Collapse the per-seed metrics table into one summary row (dict).

    Each metric column is reduced to its NaN-ignoring mean and std. If either
    NMI value is missing, both are reported as NaN.
    """
    if nmi_mean is None or nmi_std is None:
        nmi_mean, nmi_std = np.nan, np.nan

    has_active = (not df_seed.empty) and ("active_verbs" in df_seed.columns)

    summary = {
        "run_id": str(run_id),
        "features": " | ".join(feature_names),
        "n_features": int(len(feature_names)),
        "knn": int(knn),
        "k": int(n_clusters),
        "emb_dims": int(emb_dims),
        "active_verbs": int(df_seed["active_verbs"].iloc[0]) if has_active else np.nan,
        "n_seeds": int(len(df_seed)),
    }

    # (summary key prefix, per-seed column) in output order.
    metric_columns = (
        ("neighbor_purity", "neighbor_purity"),
        ("modularity_Q", "modularity_Q"),
        ("ncut", "ncut"),
        ("score", "score_purity_plus_Q"),
    )
    for prefix, column in metric_columns:
        summary[f"{prefix}_mean"] = float(np.nanmean(df_seed[column]))
        summary[f"{prefix}_std"] = float(np.nanstd(df_seed[column]))

    summary["nmi_stability_mean"] = float(nmi_mean)
    summary["nmi_stability_std"] = float(nmi_std)

    summary["fingerprint"] = str(fp)
    summary["run_stamp"] = str(cfg["IO"]["RUN_STAMP"])
    summary["updated"] = _now_str()
    return summary

def make_config(
    *,
    base_dir: str,
    out_dir: str,
    run_stamp: str | None = None,
    n_seed_runs: int = 10,
    base_seed: int = 0,
    emb_random_state: int = 0,
    use_col: str = "total_occurrences",
    min_count: float = 0.0,
    chunk: int = 1_000_000,
    verbverb_mode: str = "both",
    weight_transform: str = "raw",
    graph: dict | None = None,
    mncut_hp: dict | None = None,
    nmi_max_pairs: int | None = None,
    ui: dict | None = None,
):
    _graph = dict(
        KNN_METRIC="cosine",
        N_JOBS=-1,
        SIM_MIN=0.0,
        MAKE_SYMMETRIC=True,
        DIAG_EPS=1e-6,
        MIN_ACTIVE_VERBS=5,
    )
    if graph:
        _graph.update(graph)

    _mncut = dict(MAX_ITER=50, TOL=1e-6)
    if mncut_hp:
        _mncut.update(mncut_hp)

    if ui is None:
        ui = {"ENABLE": True, "SHOW_LOG": False}

    cfg = {
        "UI": ui,
        "IO": {
            "OUT_DIR": out_dir,
            "RUN_STAMP": run_stamp,
        },
        "DATA": {
            "VERB_LIST_CSV": str(Path(base_dir) / "Wiktionary_Thai_verb_26122025.csv"),
            "USE_COL": use_col,
            "MIN_COUNT": float(min_count),
            "CHUNK": int(chunk),
            "VERBVERB_MODE": verbverb_mode,
            "WEIGHT_TRANSFORM": weight_transform,
        },
        "SEEDS": {
            "BASE_SEED": int(base_seed),
            "N_SEED_RUNS": int(n_seed_runs),
            "EMB_RANDOM_STATE": int(emb_random_state),
        },
        "GRAPH": _graph,
        "MNCUT": _mncut,
        "METRICS": {"NMI_MAX_PAIRS": nmi_max_pairs},
    }
    return cfg

def load_selected_feature_mats(*, base_dir: str, vid: dict, V: int, cfg: dict,
                               feature_on: dict[str, bool]) -> dict[str, sp.csr_matrix]:
    """Load one CSR matrix per enabled feature.

    Features are taken from ``feature_on`` (in its order) when their flag is
    truthy and the name appears in ``default_feature_specs``; other names are
    ignored. ``verbverb`` files go through ``read_verb_verb_matrix`` followed
    by ``apply_verbverb_mode``; ``verbctx`` files go through
    ``read_verb_context_matrix``.

    Raises:
        RuntimeError: no known feature is enabled.
        FileNotFoundError: an enabled feature's CSV is missing.
        ValueError: a spec has an unknown ``kind``.
    """
    specs = default_feature_specs(base_dir)
    selected = [name for name, enabled in feature_on.items() if bool(enabled) and name in specs]
    if not selected:
        raise RuntimeError("No features selected. Set at least one FEATURE_ON[name]=True")

    def reader_options() -> dict:
        # Options shared by both reader functions, taken from cfg["DATA"].
        data_cfg = cfg["DATA"]
        return {
            "use_col": data_cfg["USE_COL"],
            "min_count": data_cfg["MIN_COUNT"],
            "chunk": data_cfg["CHUNK"],
            "weight_transform": data_cfg["WEIGHT_TRANSFORM"],
        }

    loaded = {}
    for name in selected:
        spec = specs[name]
        csv_path = Path(spec["path"])
        if not csv_path.exists():
            raise FileNotFoundError(f"Missing feature file: {csv_path}")

        kind = spec["kind"]
        if kind not in ("verbverb", "verbctx"):
            raise ValueError(f"Unknown feature kind: {kind}")

        if kind == "verbverb":
            pair_matrix = read_verb_verb_matrix(
                csv_path,
                vid=vid, V=V,
                prefix_col=spec.get("prefix_col", "prefix"),
                suffix_col=spec.get("suffix_col", "suffix"),
                **reader_options(),
            )
            matrix = apply_verbverb_mode(pair_matrix, cfg["DATA"]["VERBVERB_MODE"])
        else:
            matrix = read_verb_context_matrix(
                csv_path,
                vid=vid, V=V,
                verb_col=spec["verb_col"],
                ctx_col=spec["ctx_col"],
                **reader_options(),
            )

        matrix = matrix.tocsr()
        matrix.sum_duplicates()
        loaded[name] = matrix

    return loaded

def run_experiment(
    *,
    base_dir: str,
    out_dir: str,
    feature_on: dict[str, bool],
    knn: int,
    n_clusters: int,
    emb_dims: int,
    cfg: dict
) -> pd.DataFrame:
    """Run one clustering experiment end to end and update ``master_results.csv``.

    Steps: resolve the run stamp, load the verb list and the enabled feature
    matrices, assemble them into one matrix, run the resumable MNCut
    procedure, then replace/append this run's row in
    ``<out_dir>/master_results.csv``.

    Args:
        base_dir: directory containing the pair CSV files.
        out_dir: output directory (created if needed).
        feature_on: feature name -> enabled flag.
        knn: number of neighbours for the KNN graph.
        n_clusters: requested number of clusters.
        emb_dims: requested embedding dimension.
        cfg: config dict; NOTE ``cfg["IO"]["RUN_STAMP"]`` is overwritten in
            place with the resolved run stamp.

    Returns:
        The full master results table after this run's row was written.
    """
    ui_cfg = cfg.get("UI", {})
    ui = ProgressUI(enabled=bool(ui_cfg.get("ENABLE", True)), show_log=bool(ui_cfg.get("SHOW_LOG", False)))

    out_path = _ensure_dir(Path(out_dir))

    # Run-stamp resolution: an explicit stamp in cfg wins (and is written to
    # .RUN_STAMP.txt only if that file does not exist yet); otherwise the
    # stamp stored in the file is reused, or a new one is created and stored.
    stamp_file = out_path / ".RUN_STAMP.txt"
    if cfg["IO"]["RUN_STAMP"]:
        run_stamp = str(cfg["IO"]["RUN_STAMP"])
        if not stamp_file.exists():
            _atomic_write_text(stamp_file, run_stamp)
    else:
        if stamp_file.exists():
            run_stamp = stamp_file.read_text(encoding="utf-8").strip()
        else:
            run_stamp = _now_str()
            _atomic_write_text(stamp_file, run_stamp)
    cfg["IO"]["RUN_STAMP"] = run_stamp

    ui.set_status("Loading verbs...")
    verbs, vid = load_verbs(Path(cfg["DATA"]["VERB_LIST_CSV"]))
    V = len(verbs)
    print("Loaded verbs:", V)

    ui.set_status("Loading selected feature matrices...")
    mats = load_selected_feature_mats(base_dir=base_dir, vid=vid, V=V, cfg=cfg, feature_on=feature_on)
    feature_names = list(mats.keys())
    print("Selected features:", feature_names)

    ui.set_status("Assembling matrix...")
    X = _assemble_matrix(feature_names, mats)

    ui.start_overall(1, "Running MNCut (resumable)...")

    # The feature tag is computed against the full list of known features.
    all_features = list(default_feature_specs(base_dir).keys())
    feats_tag_override = _feature_tag(feature_names, all_feature_names=all_features)

    summary = run_one_subset_resumable(
        out_dir=out_path,
        verbs=verbs,
        V=V,
        X=X,
        feature_names=feature_names,
        cfg=cfg,
        knn=int(knn),
        n_clusters=int(n_clusters),
        emb_dims=int(emb_dims),
        feats_tag_override=feats_tag_override,
        ui=ui
    )

    # Upsert: drop any previous row with the same run_id, then append.
    results_csv = out_path / "master_results.csv"
    df_existing = _safe_read_csv(results_csv)
    if not df_existing.empty and "run_id" in df_existing.columns:
        df_existing = df_existing[df_existing["run_id"].astype(str) != str(summary["run_id"])].copy()
        df_out = pd.concat([df_existing, pd.DataFrame([summary])], ignore_index=True)
    else:
        df_out = pd.DataFrame([summary])

    _atomic_write_df_csv(df_out, results_csv)

    ui.step_overall(1, "Finished.")
    ui.mark_done(f"DONE. Exported: {results_csv} and {out_path / 'clusters.csv'}")
    return df_out

# --- Clustering driver cell: configure and run one experiment ---------------
# Lines ending in `###` carry the original author's marker comment.
from google.colab import drive
drive.mount("/content/drive")

BASE_DIR = "/content/drive/MyDrive/Colab_Datasets/VV"
OUT_DIR  = f"{BASE_DIR}/VV_clusters" ###

# Graph / clustering sizes passed to run_experiment below.
KNN = 10
N_CLUSTERS = 100
EMB_DIMS = 200

# Number of seeded MNCut runs and the base seed they are derived from.
N_SEED_RUNS = 100
BASE_SEED = 0

# Feature switches: only names set to True are loaded.
FEATURE_ON = {
    "verb-verb": True, ###
    "verb-(noun)-verb": False, ###
    "verb-(verb)-verb": False, ###
    "verb-(adjective)-verb": False, ###
    "verb-(adverb)-verb": False, ###
    "verb-(pronoun)-verb": False, ###
    "noun-verb": False, ###
    "adjective-verb": False, ###
    "adverb-verb": False, ###
    "pronoun-verb": False, ###
    "verb-noun": False, ###
    "verb-adjective": False, ###
    "verb-adverb": False, ###
    "verb-pronoun": False, ###
}

MNCUT_HP = {"MAX_ITER": 50, "TOL": 1e-6}
UI = {"ENABLE": True, "SHOW_LOG": False}

# run_stamp=None lets run_experiment reuse or create the stamp stored in
# OUT_DIR/.RUN_STAMP.txt.
cfg = make_config(
    base_dir=BASE_DIR,
    out_dir=OUT_DIR,
    run_stamp=None,
    n_seed_runs=N_SEED_RUNS,
    base_seed=BASE_SEED,
    emb_random_state=0,
    min_count=0.0,
    chunk=1_000_000,
    verbverb_mode="both", ###
    weight_transform="raw",
    mncut_hp=MNCUT_HP,
    nmi_max_pairs=None,
    ui=UI,
)

df_results = run_experiment(
    base_dir=BASE_DIR,
    out_dir=OUT_DIR,
    feature_on=FEATURE_ON,
    knn=KNN,
    n_clusters=N_CLUSTERS,
    emb_dims=EMB_DIMS,
    cfg=cfg
)

# `display` is the notebook built-in; it is not defined in a plain script.
display(df_results)

**Wisesight sentiment analysis downstream task**

In [None]:
!pip -q install pandas numpy scikit-learn tqdm tensorflow requests

import os
import re
import json
import pickle
import random
import hashlib
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import tensorflow as tf
from tensorflow.keras import layers

# --- Sentiment task: mount Drive and resolve input/output paths -------------
from google.colab import drive
drive.mount("/content/drive")

BASE_DIR = "/content/drive/MyDrive/Colab_Datasets/VV"
VV_DIR = Path(BASE_DIR)

# Output/cache directory for the sentiment evaluation.
SENT_DIR = VV_DIR / "sentiment_eval"
SENT_DIR.mkdir(parents=True, exist_ok=True)

# clusters.csv written by the clustering experiment above.
CLUSTER_DIR = VV_DIR / "VV_clusters" ###
CLUSTERS_CSV = CLUSTER_DIR / "clusters.csv"

# Word-list CSVs, one per part of speech.
VERB_CSV    = VV_DIR / "Wiktionary_Thai_verb_26122025.csv"
NOUN_CSV    = VV_DIR / "Wiktionary_Thai_noun_27122025.csv"
PRONOUN_CSV = VV_DIR / "Wiktionary_Thai_pronoun_02012026.csv"
ADJ_CSV     = VV_DIR / "Wiktionary_Thai_adjective_02012026.csv"
ADV_CSV     = VV_DIR / "Wiktionary_Thai_adverb_02012026.csv"

# Fail early if any dictionary file is missing.
for p in [VERB_CSV, NOUN_CSV, PRONOUN_CSV, ADJ_CSV, ADV_CSV]:
    if not p.exists():
        raise FileNotFoundError(f"Missing dictionary file: {p}")

def file_md5_8(p: Path) -> str:
    """Return the first 8 hex characters of the MD5 digest of file ``p``.

    The file is read in 1 MiB pieces so large files are never fully in memory.
    """
    digest = hashlib.md5()
    piece_size = 1024 * 1024
    with p.open("rb") as fh:
        while True:
            piece = fh.read(piece_size)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()[:8]

# --- Sentiment task: experiment configuration --------------------------------
# Prefix of this experiment's output files (see EXP_RUNS_CSV below).
EXP_PREFIX = "VV" ###

# Which baseline (b0..b4) and test (t1..t4) variants to run.
RUN_BASELINES = ["b0","b1","b2","b3","b4"]

RUN_TESTS = ["t1", "t2", "t3", "t4"]

# Normalize the selections (trim, lower-case) and drop unknown identifiers.
RUN_BASELINES = [x.strip().lower() for x in RUN_BASELINES]
RUN_TESTS     = [x.strip().lower() for x in RUN_TESTS]
RUN_BASELINES = [x for x in RUN_BASELINES if x in {"b0","b1","b2","b3","b4"}]
RUN_TESTS     = [x for x in RUN_TESTS if x in {"t1","t2","t3","t4"}]

# When True the "q" (question) label file is downloaded and loaded as well.
USE_QUESTION_CLASS = True

TEST_SIZE = 0.10
VAL_SIZE  = 0.10

# None keeps the whole dataset; an integer keeps only the first N shuffled rows.
MAX_SAMPLES = None
BATCH_SIZE = 1024
EPOCHS = 10
PATIENCE = 1
EMBED_DIM = 256
LSTM_UNITS = 256
DROPOUT = 0.25

K_RANDOM = 100

N_SEEDS = 100
BASE_SEED_FOR_LIST = 20000

# Result files: baselines share one prefix, the experiment uses EXP_PREFIX.
BASELINE_PREFIX = "baseline"
BASELINE_RUNS_CSV   = SENT_DIR / f"{BASELINE_PREFIX}__runs.csv"
BASELINE_SUMMARY_CSV= SENT_DIR / f"{BASELINE_PREFIX}__summary.csv"

EXP_RUNS_CSV        = SENT_DIR / f"{EXP_PREFIX}__runs.csv"

# Tokenization cache and the metadata used to validate it.
PARSE_CACHE = SENT_DIR / "_cache_greedy_parse.pkl"
META_CACHE  = SENT_DIR / "_cache_greedy_parse.meta.json"

WISESIGHT_DIR = SENT_DIR / "_wisesight_download"
WISESIGHT_DIR.mkdir(parents=True, exist_ok=True)

# One raw text file per label.
WISESIGHT_URLS = {
    "pos": "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/pos.txt",
    "neu": "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/neu.txt",
    "neg": "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/neg.txt",
    "q":   "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/q.txt",
}

def download_file(url: str, path: Path):
    """Download ``url`` to ``path`` unless a non-empty file is already there.

    The body is first written to a sibling ``<name>.part`` file and then
    renamed onto ``path``, so an interrupted download can never leave a
    truncated file that a later call would mistake for a complete one.

    Raises:
        requests.HTTPError: the server answered with an error status.
    """
    if path.exists() and path.stat().st_size > 0:
        return

    # Imported only when a download is actually needed.
    import requests

    r = requests.get(url, timeout=60)
    r.raise_for_status()

    partial = path.with_name(path.name + ".part")
    partial.write_bytes(r.content)
    partial.replace(path)

# Fetch each label file into WISESIGHT_DIR; the "q" file is skipped when the
# question class is disabled.
for lab, url in WISESIGHT_URLS.items():
    if lab == "q" and not USE_QUESTION_CLASS:
        continue
    download_file(url, WISESIGHT_DIR / f"{lab}.txt")

def read_lines(p: Path):
    """Return the non-empty, whitespace-stripped lines of text file ``p``.

    The file is decoded as UTF-8; undecodable bytes are dropped.
    """
    with p.open("r", encoding="utf-8", errors="ignore") as fh:
        stripped = (raw.strip() for raw in fh)
        return [text for text in stripped if text]

# Build the labelled dataset: one (text, label) row per non-empty line of
# each downloaded label file.
labels_to_load = ["pos", "neu", "neg"] + (["q"] if USE_QUESTION_CLASS else [])
data = []
for lab in labels_to_load:
    for s in read_lines(WISESIGHT_DIR / f"{lab}.txt"):
        data.append((s, lab))

df = pd.DataFrame(data, columns=["text", "label"])
# Fixed-seed shuffle; MAX_SAMPLES (if set) then keeps the first rows of the
# shuffled frame.
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
if MAX_SAMPLES is not None:
    df = df.iloc[:int(MAX_SAMPLES)].copy()

# Integer class ids follow the alphabetical order of the label names.
label2id = {lab: i for i, lab in enumerate(sorted(df["label"].unique().tolist()))}
df["y"] = df["label"].map(label2id).astype(int)

print("WiseSight size:", len(df))
print("Labels:", label2id)
print(df["label"].value_counts())

def load_dict_words(csv_path: Path) -> list[str]:
    """Read the first column of a header-less word-list CSV as clean strings.

    Cells are read verbatim as text: with pandas' default NA handling, empty
    cells and literal words such as "NA", "null" or "nan" are parsed as
    missing values and would then be stringified into the bogus word "nan".
    Stray BOM characters and surrounding whitespace are removed and empty
    entries are dropped.

    Returns:
        The words in file order (duplicates are kept).
    """
    column = pd.read_csv(
        csv_path,
        header=None,
        encoding="utf-8-sig",
        dtype=str,              # keep numeric-looking entries as written
        keep_default_na=False,  # never turn a cell into NaN
    )[0]
    column = column.str.replace("\ufeff", "", regex=False).str.strip()
    return column[column != ""].tolist()

# Load the five part-of-speech word lists.
verbs    = load_dict_words(VERB_CSV)
nouns    = load_dict_words(NOUN_CSV)
pronouns = load_dict_words(PRONOUN_CSV)
adjs     = load_dict_words(ADJ_CSV)
advs     = load_dict_words(ADV_CSV)

# verb_set: fast membership test for verbs; all_words: union of all five
# lists, used as the tokenizer vocabulary.
verb_set = set(verbs)
all_words = set(verbs) | set(nouns) | set(pronouns) | set(adjs) | set(advs)

print("Dict sizes:",
      "verbs", len(verbs),
      "nouns", len(nouns),
      "pronouns", len(pronouns),
      "adjs", len(adjs),
      "advs", len(advs),
      "ALL", len(all_words))

class TrieNode:
    """One node of a character trie.

    ``ch`` maps a character to the child node; ``end`` is True when the path
    from the root to this node spells a complete word.
    """

    __slots__ = ("ch", "end")

    def __init__(self):
        self.end = False
        self.ch = {}

def build_trie(words):
    """Build a character trie containing every string in ``words``.

    Returns the root node; the node reached by spelling a word has
    ``end`` set to True.
    """
    root = TrieNode()
    for word in words:
        cursor = root
        for char in word:
            child = cursor.ch.get(char)
            if child is None:
                child = TrieNode()
                cursor.ch[char] = child
            cursor = child
        cursor.end = True
    return root

# Trie over the union of all dictionaries; used by greedy_tokenize below.
TRIE = build_trie(all_words)

def greedy_tokenize(text: str, trie_root: TrieNode) -> list[str]:
    """Split ``text`` into dictionary words by greedy longest match.

    All whitespace is removed first. At each position the longest word found
    in the trie is emitted; if none starts there the character is skipped.
    Every maximal run of skipped characters is represented by a single
    ``"<OTHER>"`` token.
    """
    compact = re.sub(r"\s+", "", str(text))
    length = len(compact)

    out = []
    pending_unknown = False
    pos = 0
    while pos < length:
        # Walk the trie as far as the text allows, remembering the end of the
        # longest complete word seen on the way.
        cursor = trie_root
        best_end = -1
        for probe in range(pos, length):
            child = cursor.ch.get(compact[probe])
            if child is None:
                break
            cursor = child
            if cursor.end:
                best_end = probe + 1

        if best_end == -1:
            # No word starts here: extend the current unknown run.
            pending_unknown = True
            pos += 1
            continue

        if pending_unknown:
            out.append("<OTHER>")
            pending_unknown = False
        out.append(compact[pos:best_end])
        pos = best_end

    if pending_unknown:
        out.append("<OTHER>")

    return out

def make_parse_meta():
    """Describe everything the tokenization cache depends on.

    The result is a JSON-serializable dict covering the dataset size and
    labels, the modification time and size of each dictionary, the tokenizer
    identifier and the dataset-shaping options.
    """
    dict_files = (
        ("verb", VERB_CSV),
        ("noun", NOUN_CSV),
        ("pronoun", PRONOUN_CSV),
        ("adjective", ADJ_CSV),
        ("adverb", ADV_CSV),
    )
    word_collections = (
        ("verbs", verbs),
        ("nouns", nouns),
        ("pronouns", pronouns),
        ("adjs", adjs),
        ("advs", advs),
        ("all", all_words),
    )

    meta = {}
    meta["dataset_size"] = int(len(df))
    meta["labels"] = sorted(df["label"].unique().tolist())
    meta["dict_mtime"] = {name: csv.stat().st_mtime for name, csv in dict_files}
    meta["dict_sizes"] = {name: int(len(words)) for name, words in word_collections}
    meta["tokenizer"] = "greedy_trie_longest_match__collapse_unknown_to_OTHER"
    meta["use_question_class"] = bool(USE_QUESTION_CLASS)
    meta["max_samples"] = None if MAX_SAMPLES is None else int(MAX_SAMPLES)
    return meta

def parse_cache_valid() -> bool:
    """Return True when the on-disk parse cache matches the current setup.

    Both cache files must exist and every compared field of the stored
    metadata must equal the freshly computed one. Any error while reading or
    computing the metadata is treated as "cache not valid".
    """
    if not PARSE_CACHE.exists() or not META_CACHE.exists():
        return False

    compared_fields = (
        "dataset_size",
        "labels",
        "dict_mtime",
        "dict_sizes",
        "tokenizer",
        "use_question_class",
        "max_samples",
    )
    try:
        stored = json.loads(META_CACHE.read_text(encoding="utf-8"))
        current = make_parse_meta()
        for field in compared_fields:
            if not (stored.get(field) == current.get(field)):
                return False
        return True
    except Exception:
        return False

# Tokenise every text once and keep the result on disk so later runs can resume.
# Produces two parallel lists:
#   parsed_tokens[i] -- tokens of text i
#   parsed_isverb[i] -- for each token, whether it is in verb_set
print("Greedy parsing all texts (resumable)...")
if parse_cache_valid():
    # NOTE(review): pickle.load executes arbitrary code from the file; this is only
    # safe as long as PARSE_CACHE is a file written by this notebook itself.
    with PARSE_CACHE.open("rb") as f:
        cached = pickle.load(f)
    parsed_tokens = cached["parsed_tokens"]
    parsed_isverb = cached["parsed_isverb"]
    print("Loaded parse cache:", PARSE_CACHE)
else:
    parsed_tokens = []
    parsed_isverb = []
    for t in tqdm(df["text"].tolist(), desc="Tokenizing"):
        toks = greedy_tokenize(t, TRIE)
        parsed_tokens.append(toks)
        parsed_isverb.append([tok in verb_set for tok in toks])

    # The metadata is written after the pickle; parse_cache_valid() requires both files.
    with PARSE_CACHE.open("wb") as f:
        pickle.dump({"parsed_tokens": parsed_tokens, "parsed_isverb": parsed_isverb},
                    f, protocol=pickle.HIGHEST_PROTOCOL)
    META_CACHE.write_text(json.dumps(make_parse_meta(), ensure_ascii=False, indent=2), encoding="utf-8")
    print("Saved parse cache:", PARSE_CACHE)

def stable_hash(obj: dict) -> str:
    """Return a 10-hex-character fingerprint of a JSON-serialisable dict.

    Keys are sorted before hashing, so the result does not depend on the
    insertion order of *obj*. MD5 is used only as a fingerprint here.
    """
    canonical = json.dumps(obj, sort_keys=True, ensure_ascii=False)
    digest = hashlib.md5(canonical.encode("utf-8"))
    return digest.hexdigest()[:10]

# SETUP_TAG fingerprints every setting that influences a run; it is stored with each
# result row so that rows from different configurations are never mixed.
SETUP_TAG = stable_hash({
    "use_question_class": USE_QUESTION_CLASS,
    "test_size": TEST_SIZE,
    "val_size": VAL_SIZE,
    "max_samples": MAX_SAMPLES,
    "batch_size": BATCH_SIZE,
    "epochs": EPOCHS,
    "patience": PATIENCE,
    "embed_dim": EMBED_DIM,
    "lstm_units": LSTM_UNITS,
    "dropout": DROPOUT,
    "k_random": K_RANDOM,
    "n_seeds": N_SEEDS,
    "labels": sorted(label2id.keys()),
    "parse_meta": make_parse_meta(),
})
print("SETUP_TAG:", SETUP_TAG)

RUN_TESTS_SET = set(RUN_TESTS)
RUN_BASE_SET  = set(RUN_BASELINES)

# Defaults used when no test condition is requested.
CLUSTER_TAG = "NOCLUSTERS"
actual_maps = []

if RUN_TESTS_SET:
    if not CLUSTERS_CSV.exists():
        raise FileNotFoundError(f"CLUSTERS_CSV not found but tests requested: {CLUSTERS_CSV}")
    # Tag = file stem plus a content hash from file_md5_8 (defined elsewhere in the file).
    CLUSTER_TAG = f"{CLUSTERS_CSV.stem}_{file_md5_8(CLUSTERS_CSV)}"

    clusters_df = pd.read_csv(CLUSTERS_CSV, encoding="utf-8-sig")
    if "verb" not in clusters_df.columns:
        raise RuntimeError("clusters CSV must contain a 'verb' column.")

    # Order the cluster_<n> columns numerically (cluster_2 before cluster_10).
    # NOTE(review): a column such as "cluster_size" would make int() raise here — confirm
    # the CSV only ever contains numeric suffixes.
    cluster_cols = [c for c in clusters_df.columns if c.startswith("cluster_")]
    cluster_cols = sorted(cluster_cols, key=lambda x: int(x.split("_")[1]))

    if len(cluster_cols) < N_SEEDS:
        raise RuntimeError(f"clusters CSV has {len(cluster_cols)} cluster columns, need at least {N_SEEDS}.")

    # One verb -> cluster-id map per seed index; only the first N_SEEDS columns are used.
    cluster_cols = cluster_cols[:N_SEEDS]
    for col in cluster_cols:
        m = dict(zip(
            clusters_df["verb"].astype(str).tolist(),
            clusters_df[col].astype(int).tolist()
        ))
        actual_maps.append(m)

    print("Loaded cluster columns:", cluster_cols[:5], "...", cluster_cols[-1])

print("CLUSTER_TAG:", CLUSTER_TAG)

def make_random_cluster_map(seed: int, verb_list: list[str], k: int) -> dict[str, int]:
    """Assign every verb a pseudo-random cluster id in ``[0, k)``.

    The assignment is reproducible for a given *seed*. One number is drawn
    per list element, in list order; a verb that occurs more than once keeps
    the value drawn for its last occurrence.
    """
    generator = np.random.RandomState(seed)
    assignment = {}
    for verb in verb_list:
        assignment[verb] = int(generator.randint(0, k))
    return assignment

# One random verb -> cluster map per seed index i (RNG seed 10_000 + i), each with K_RANDOM clusters.
random_maps = [make_random_cluster_map(10_000 + i, verbs, K_RANDOM) for i in range(N_SEEDS)]

def vclass_token(c: int | None) -> str:
    if c is None or int(c) < 0:
        return "<VCLASS_OOV>"
    return f"<VCLASS_{int(c)}>"

def build_repr(tokens: list[str], isverb: list[bool], vmap: dict[str, int] | None, mode: str) -> list[str]:
    if mode == "baseline0_words":
        return tokens

    if vmap is None:
        raise ValueError("vmap required for this mode")

    if mode == "other_verb_vclass":
        out = []
        for tok, vb in zip(tokens, isverb):
            out.append(tok)
            if vb:
                out.append(vclass_token(vmap.get(tok)))
        return out

    if mode == "other_vclass":
        out = []
        for tok, vb in zip(tokens, isverb):
            if vb:
                out.append(vclass_token(vmap.get(tok)))
            else:
                out.append(tok)
        return out

    if mode == "verb_vclass":
        out = []
        for tok, vb in zip(tokens, isverb):
            if vb:
                out.append(tok)
                out.append(vclass_token(vmap.get(tok)))
        return out if out else ["<NO_VERB>"]

    if mode == "vclass_only":
        out = []
        for tok, vb in zip(tokens, isverb):
            if vb:
                out.append(vclass_token(vmap.get(tok)))
        return out if out else ["<NO_VERB>"]

    raise ValueError(f"Unknown mode: {mode}")

# Reserved vocabulary ids: PAD fills unused sequence positions in encode(),
# UNK is the id given to tokens that are not in the vocabulary.
PAD = 0
UNK = 1

def build_vocab(token_lists: list[list[str]], min_freq: int = 1) -> dict[str, int]:
    """Map tokens to integer ids, starting after the reserved PAD/UNK entries.

    Tokens are numbered in order of first appearance. A token is included
    only if it occurs at least *min_freq* times; the reserved names
    ``"<PAD>"`` and ``"<UNK>"`` are never reassigned.
    """
    from collections import Counter

    freq = Counter()
    for seq in token_lists:
        for tok in seq:
            freq[tok] += 1

    vocab = {"<PAD>": PAD, "<UNK>": UNK}
    for tok, count in freq.items():
        if count < min_freq or tok in vocab:
            continue
        vocab[tok] = len(vocab)
    return vocab

def encode(token_lists: list[list[str]], vocab: dict[str, int], max_len: int):
    """Turn token lists into a ``(len(token_lists), max_len)`` int32 id matrix.

    Sequences longer than *max_len* are truncated, shorter ones are
    right-padded with PAD, and tokens missing from *vocab* become UNK.
    """
    n_rows = len(token_lists)
    matrix = np.full((n_rows, max_len), PAD, dtype=np.int32)
    for row, seq in enumerate(token_lists):
        ids = [vocab.get(tok, UNK) for tok in seq[:max_len]]
        matrix[row, :len(ids)] = ids
    return matrix

def make_model(vocab_size: int, n_classes: int, max_len: int,
               embed_dim: int, lstm_units: int, dropout: float):
    """Build and compile the BiLSTM text classifier.

    Architecture: masked embedding -> bidirectional LSTM (last state only)
    -> dropout -> Dense(128, relu) -> dropout -> softmax over *n_classes*.
    Compiled with Adam (learning rate 1e-3) and sparse categorical
    cross-entropy, reporting accuracy.
    """
    token_ids = layers.Input(shape=(max_len,), dtype="int32")

    # mask_zero=True makes id 0 (padding) invisible to the LSTM.
    embedded = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(token_ids)
    encoder = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=False))
    hidden = encoder(embedded)
    hidden = layers.Dropout(dropout)(hidden)
    hidden = layers.Dense(128, activation="relu")(hidden)
    hidden = layers.Dropout(dropout)(hidden)
    probabilities = layers.Dense(n_classes, activation="softmax")(hidden)

    classifier = tf.keras.Model(token_ids, probabilities)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

def run_one_experiment(mode: str, vmap: dict[str, int] | None, seed: int):
    """Train and evaluate one classifier for a representation mode and seed.

    The same *seed* drives TensorFlow, NumPy, ``random`` and both stratified
    splits. Vocabulary and sequence length are derived from the training
    portion only. Returns a dict with ``vocab_size``, ``max_len``,
    ``accuracy`` and ``macro_f1`` measured on the held-out test split.
    """
    tf.keras.backend.clear_session()
    run_seed = int(seed)
    tf.keras.utils.set_random_seed(run_seed)
    np.random.seed(run_seed)
    random.seed(run_seed)

    sequences = [
        build_repr(text_tokens, verb_flags, vmap, mode)
        for text_tokens, verb_flags in zip(parsed_tokens, parsed_isverb)
    ]

    targets = df["y"].to_numpy()
    all_idx = np.arange(len(targets))

    # First carve out the test set, then split the remainder into train/validation.
    fit_idx, test_idx = train_test_split(
        all_idx, test_size=TEST_SIZE, random_state=run_seed, stratify=targets
    )
    train_idx, val_idx = train_test_split(
        fit_idx, test_size=VAL_SIZE, random_state=run_seed, stratify=targets[fit_idx]
    )

    train_seqs = [sequences[i] for i in train_idx]
    val_seqs = [sequences[i] for i in val_idx]
    test_seqs = [sequences[i] for i in test_idx]

    vocab = build_vocab(train_seqs, min_freq=1)

    # Sequence length: 95th percentile of training lengths, clamped to [8, 128].
    train_lengths = [len(seq) for seq in train_seqs]
    max_len = int(np.clip(np.percentile(train_lengths, 95), 8, 128))

    X_train = encode(train_seqs, vocab, max_len)
    X_valid = encode(val_seqs, vocab, max_len)
    X_test = encode(test_seqs, vocab, max_len)

    y_train = targets[train_idx]
    y_valid = targets[val_idx]
    y_test = targets[test_idx]

    classifier = make_model(
        vocab_size=len(vocab),
        n_classes=len(label2id),
        max_len=max_len,
        embed_dim=EMBED_DIM,
        lstm_units=LSTM_UNITS,
        dropout=DROPOUT,
    )

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor="val_accuracy", patience=PATIENCE, restore_best_weights=True
    )

    classifier.fit(
        X_train, y_train,
        validation_data=(X_valid, y_valid),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=0,
        callbacks=[early_stop],
        shuffle=False,
    )

    probabilities = classifier.predict(X_test, batch_size=BATCH_SIZE, verbose=0)
    predicted = np.argmax(probabilities, axis=1)

    return {
        "vocab_size": int(len(vocab)),
        "max_len": int(max_len),
        "accuracy": float(accuracy_score(y_test, predicted)),
        "macro_f1": float(f1_score(y_test, predicted, average="macro")),
    }

def read_runs_csv(path: Path) -> pd.DataFrame:
    """Load a results CSV, returning an empty DataFrame when there is nothing to load.

    Args:
        path: Location of the runs CSV (UTF-8, optional BOM).

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file is missing,
        is empty, or cannot be read.

    A missing or zero-content file is a normal "no runs yet" state and is
    handled silently. Any other read failure still yields an empty frame
    (best-effort, as before) but now emits a ``RuntimeWarning``: a caller that
    appends to the returned frame and saves it back would otherwise replace
    the unreadable file without anyone noticing.
    """
    import warnings

    if not path.exists():
        return pd.DataFrame()
    try:
        return pd.read_csv(path, encoding="utf-8-sig")
    except pd.errors.EmptyDataError:
        # File exists but has no columns/rows yet.
        return pd.DataFrame()
    except Exception as err:
        warnings.warn(
            f"Could not read runs CSV {path} ({type(err).__name__}: {err}); "
            "treating it as empty. Saving new rows to this path will replace the file.",
            RuntimeWarning,
            stacklevel=2,
        )
        return pd.DataFrame()

def write_runs_csv(df_runs: pd.DataFrame, path: Path):
    """Write *df_runs* to *path* without ever exposing a half-written file.

    The frame is first written to a sibling temporary file (suffixed with the
    process id) and then moved over *path* with ``os.replace``.

    Args:
        df_runs: Table to save (written without the index, UTF-8 with BOM).
        path: Destination CSV path.

    Raises:
        Whatever ``to_csv`` or ``os.replace`` raises. On any failure or
        interruption the temporary file is removed and *path* is left as it was.
    """
    tmp = path.with_suffix(path.suffix + f".tmp{os.getpid()}")
    try:
        df_runs.to_csv(tmp, index=False, encoding="utf-8-sig")
        os.replace(str(tmp), str(path))
    except BaseException:
        # Includes KeyboardInterrupt: do not leave an orphaned temp file behind.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

def row_key(setup_tag: str, cluster_tag: str, cond: str, seed: int) -> str:
    """Return the unique key of one run: the four fields joined by ``||``.

    *seed* is normalised with ``int()`` so that ``3`` and ``3.0`` give the
    same key.
    """
    fields = (setup_tag, cluster_tag, cond, int(seed))
    return "||".join(f"{field}" for field in fields)

# N_SEEDS consecutive training seeds starting at BASE_SEED_FOR_LIST.
TRAIN_SEEDS = [int(BASE_SEED_FOR_LIST + i) for i in range(N_SEEDS)]
SEED_SET = set(TRAIN_SEEDS)

# Each test condition is compared against the baseline that uses the same
# representation mode (see COND_INFO).
TEST_TO_BASELINE = {"t1":"b1", "t2":"b2", "t3":"b3", "t4":"b4"}

def require_baselines_complete_for_tests():
    """Refuse to run test conditions until their paired baselines are complete.

    For every requested test condition the matching baseline (see
    ``TEST_TO_BASELINE``) must have a saved run for every seed in ``SEED_SET``
    under the current ``SETUP_TAG``. Does nothing when no tests are requested.

    Raises:
        RuntimeError: if the baseline runs file is missing/empty, or if any
            required baseline lacks seeds. A runs file without the
            ``setup_tag``/``cond``/``seed`` columns counts as "no baseline
            runs" and is reported the same way.
    """
    if not RUN_TESTS_SET:
        return
    need = sorted({TEST_TO_BASELINE[t] for t in RUN_TESTS_SET})
    runs = read_runs_csv(BASELINE_RUNS_CSV)
    if runs.empty:
        raise RuntimeError(f"Tests requested {sorted(RUN_TESTS_SET)} but no baselines file found: {BASELINE_RUNS_CSV}")

    # DataFrame.get() returns None for a missing column, which would turn the
    # row mask into a scalar False and make the lookup fail with KeyError.
    # Check the columns explicitly instead.
    if {"setup_tag", "cond", "seed"}.issubset(runs.columns):
        current = runs[runs["setup_tag"] == SETUP_TAG]
    else:
        current = None

    missing_report = []
    for b in need:
        if current is None:
            have = set()
        else:
            # Unparseable/blank seeds are ignored rather than crashing astype(int).
            seeds = pd.to_numeric(current.loc[current["cond"] == b, "seed"], errors="coerce").dropna()
            have = set(seeds.astype(int).tolist())
        miss = sorted(SEED_SET - have)
        if miss:
            missing_report.append((b, len(miss), miss[:10]))

    if missing_report:
        msg = "Cannot run tests: required baselines are incomplete for this SETUP_TAG.\n"
        msg += f"SETUP_TAG={SETUP_TAG}\n"
        msg += "Missing seeds (showing first 10):\n"
        for b, nmiss, first10 in missing_report:
            msg += f"  {b}: missing {nmiss} seeds, e.g. {first10}\n"
        msg += "\nRun baselines first (at least b1..b4) until complete."
        raise RuntimeError(msg)

# Fail early, before any training, if tests were requested without complete baselines.
require_baselines_complete_for_tests()

# Condition id -> (representation mode for build_repr, cluster-map kind).
# b* use random cluster maps ("RAND"), t* the clusters loaded from CLUSTERS_CSV ("REAL");
# b0 uses plain words and needs no map.
COND_INFO = {
    "b0": ("baseline0_words", None),
    "b1": ("other_verb_vclass", "RAND"),
    "b2": ("other_vclass", "RAND"),
    "b3": ("verb_vclass", "RAND"),
    "b4": ("vclass_only", "RAND"),

    "t1": ("other_verb_vclass", "REAL"),
    "t2": ("other_vclass", "REAL"),
    "t3": ("verb_vclass", "REAL"),
    "t4": ("vclass_only", "REAL"),
}

def build_plan():
    """List every run to execute, seed by seed.

    Each entry is ``(exp_prefix, cond, mode, vmap, seed, cluster_tag)``.
    For seed index ``i`` the baselines use ``random_maps[i]`` (``b0`` uses no
    map) and the tests use ``actual_maps[i]``, so a test and its baseline
    share both the training seed and the map index.
    """
    plan = []
    for i, raw_seed in enumerate(TRAIN_SEEDS):
        seed = int(raw_seed)

        for cond in RUN_BASELINES:
            mode, _ = COND_INFO[cond]
            vmap = None if cond == "b0" else random_maps[i]
            plan.append(("BASELINE", cond, mode, vmap, seed, "BASELINE"))

        if not RUN_TESTS_SET:
            continue
        for cond in RUN_TESTS:
            mode, _ = COND_INFO[cond]
            # Paired by i with the baseline map above.
            plan.append((EXP_PREFIX, cond, mode, actual_maps[i], seed, str(CLUSTER_TAG)))
    return plan

plan = build_plan()
print("Planned runs:", len(plan))

# Previously saved results; runs already present there are skipped below.
base_df = read_runs_csv(BASELINE_RUNS_CSV)
exp_df  = read_runs_csv(EXP_RUNS_CSV)

done_base = set()
done_exp  = set()

# Index finished runs by (setup_tag, cluster_tag, cond, seed).
for d, keyset in [(base_df, done_base), (exp_df, done_exp)]:
    if not d.empty and {"setup_tag","cluster_tag","cond","seed"}.issubset(d.columns):
        for st, ct, c, s in zip(d["setup_tag"], d["cluster_tag"], d["cond"], d["seed"]):
            keyset.add(row_key(str(st), str(ct), str(c), int(s)))

for exp_prefix, cond, mode, vmap, seed, ctag in tqdm(plan, desc="Training runs (resumable)"):
    key = row_key(SETUP_TAG, ctag, cond, seed)

    # Baseline and experiment runs are tracked in separate files/key sets.
    if exp_prefix == "BASELINE":
        if key in done_base:
            continue
    else:
        if key in done_exp:
            continue

    out = run_one_experiment(mode=mode, vmap=vmap, seed=int(seed))

    row = {
        "exp_prefix": exp_prefix,
        "setup_tag": SETUP_TAG,
        "cluster_tag": ctag,
        "cond": cond,
        "mode": mode,
        "seed": int(seed),
        "accuracy": float(out["accuracy"]),
        "macro_f1": float(out["macro_f1"]),
        "vocab_size": int(out["vocab_size"]),
        "max_len": int(out["max_len"]),
        "k_random": int(K_RANDOM),
    }

    # The CSV is rewritten after every single run so an interrupted session loses at most one run.
    if exp_prefix == "BASELINE":
        base_df = pd.concat([base_df, pd.DataFrame([row])], ignore_index=True)
        write_runs_csv(base_df, BASELINE_RUNS_CSV)
        done_base.add(key)
    else:
        exp_df = pd.concat([exp_df, pd.DataFrame([row])], ignore_index=True)
        write_runs_csv(exp_df, EXP_RUNS_CSV)
        done_exp.add(key)

    print(f"{exp_prefix:10s} {cond:2s} seed={seed}  acc={row['accuracy']:.4f}  f1={row['macro_f1']:.4f}  max_len={row['max_len']}  vocab={row['vocab_size']}")

print("Saved baseline runs to:", BASELINE_RUNS_CSV)
print("Saved exp runs to     :", EXP_RUNS_CSV)

def summarize_metrics(dfsub: pd.DataFrame):
    """Summarise accuracy and macro-F1 over a set of runs.

    Returns a dict with the run count ``n`` plus mean and sample standard
    deviation (``ddof=1``) of the ``accuracy`` and ``macro_f1`` columns.
    An empty frame gives ``n=0`` and NaN statistics; a single run gives a
    standard deviation of 0.0.
    """
    if dfsub.empty:
        return dict(n=0, acc_mean=np.nan, acc_std=np.nan, f1_mean=np.nan, f1_std=np.nan)

    count = int(len(dfsub))

    def _mean_and_std(column):
        values = dfsub[column]
        spread = float(values.std(ddof=1)) if count > 1 else 0.0
        return float(values.mean()), spread

    acc_mean, acc_std = _mean_and_std("accuracy")
    f1_mean, f1_std = _mean_and_std("macro_f1")
    return {
        "n": count,
        "acc_mean": acc_mean,
        "acc_std": acc_std,
        "f1_mean": f1_mean,
        "f1_std": f1_std,
    }

def paired_delta_f1(b_df: pd.DataFrame, t_df: pd.DataFrame):
    """Compare test and baseline macro-F1 on the seeds they have in common.

    Rows are matched on ``seed`` (inner join) and the per-seed difference
    ``test - baseline`` is summarised as mean, sample standard deviation and
    a normal-approximation 95% interval (mean +/- 1.96 * std / sqrt(n)).
    With a single pair the std is 0.0 and the interval collapses to the mean;
    with no pairs all statistics are NaN and ``n_paired`` is 0.
    """
    no_pairs = dict(n_paired=0, delta_f1_mean=np.nan, delta_f1_std=np.nan, delta_f1_ci95_lo=np.nan, delta_f1_ci95_hi=np.nan)
    if b_df.empty or t_df.empty:
        return no_pairs

    baseline = b_df[["seed", "macro_f1"]].rename(columns={"macro_f1": "f1_b"}).copy()
    test = t_df[["seed", "macro_f1"]].rename(columns={"macro_f1": "f1_t"}).copy()
    joined = baseline.merge(test, on="seed", how="inner")
    if joined.empty:
        return no_pairs

    deltas = (joined["f1_t"] - joined["f1_b"]).to_numpy(dtype=float)
    n = int(deltas.shape[0])
    mean = float(np.mean(deltas))

    if n > 1:
        std = float(np.std(deltas, ddof=1))
        half_width = 1.96 * (std / np.sqrt(n))
        ci_lo = mean - half_width
        ci_hi = mean + half_width
    else:
        std = 0.0
        ci_lo = ci_hi = mean

    return {
        "n_paired": n,
        "delta_f1_mean": mean,
        "delta_f1_std": std,
        "delta_f1_ci95_lo": float(ci_lo),
        "delta_f1_ci95_hi": float(ci_hi),
    }

def _rows_where(frame: pd.DataFrame, **equals) -> pd.DataFrame:
    """Return a copy of the rows of *frame* whose columns equal the given values.

    An empty frame, or one lacking any of the requested columns, yields an
    empty copy. Filtering with ``frame[frame.get(col) == value]`` is not safe
    for those cases: ``get`` returns None for a missing column, the comparison
    becomes a scalar False, and ``frame[False]`` raises ``KeyError``. That is
    exactly what happens to the experiment frame after a baselines-only run.
    """
    if frame.empty or any(col not in frame.columns for col in equals):
        return frame.iloc[0:0].copy()
    mask = pd.Series(True, index=frame.index)
    for col, value in equals.items():
        mask &= frame[col] == value
    return frame[mask].copy()

# Restrict both result tables to the current configuration (and cluster file).
base_cur = _rows_where(base_df, setup_tag=SETUP_TAG)
exp_cur  = _rows_where(exp_df, setup_tag=SETUP_TAG, cluster_tag=CLUSTER_TAG)

# Per-baseline mean/std table.
base_rows = []
for cond in ["b0","b1","b2","b3","b4"]:
    sub = _rows_where(base_cur, cond=cond)
    s = summarize_metrics(sub)
    base_rows.append({"cond": cond, **s})
baseline_summary = pd.DataFrame(base_rows)
write_runs_csv(baseline_summary, BASELINE_SUMMARY_CSV)

print("\n=== BASELINE SUMMARY (current SETUP_TAG) ===")
print(baseline_summary.to_string(index=False))
print("Saved:", BASELINE_SUMMARY_CSV)

# Test-vs-baseline table: each test condition against the baseline with the same mode.
summary_rows = []
pair_order = [("t1","b1"), ("t2","b2"), ("t3","b3"), ("t4","b4")]

for tcond, bcond in pair_order:
    b_sub = _rows_where(base_cur, cond=bcond)
    t_sub = _rows_where(exp_cur, cond=tcond)

    sb = summarize_metrics(b_sub)
    st = summarize_metrics(t_sub)
    dd = paired_delta_f1(b_sub, t_sub)

    summary_rows.append({
        "pair": f"{tcond}-{bcond}",
        "baseline_cond": bcond,
        "test_cond": tcond,
        "baseline_n": sb["n"],
        "baseline_f1_mean": sb["f1_mean"],
        "baseline_f1_std": sb["f1_std"],
        "test_n": st["n"],
        "test_f1_mean": st["f1_mean"],
        "test_f1_std": st["f1_std"],
        "n_paired": dd["n_paired"],
        "delta_f1_mean": dd["delta_f1_mean"],
        "delta_f1_std": dd["delta_f1_std"],
        "delta_f1_ci95_lo": dd["delta_f1_ci95_lo"],
        "delta_f1_ci95_hi": dd["delta_f1_ci95_hi"],
        "setup_tag": SETUP_TAG,
        "cluster_tag": CLUSTER_TAG,
        "exp_prefix": EXP_PREFIX,
    })

exp_summary = pd.DataFrame(summary_rows)

EXP_SUMMARY_CSV = SENT_DIR / f"{EXP_PREFIX}__summary__{CLUSTER_TAG}__{SETUP_TAG}.csv"
write_runs_csv(exp_summary, EXP_SUMMARY_CSV)

print("\n=== EXP SUMMARY + PAIRED ΔF1 (test - baseline, same seed) ===")
print(exp_summary.to_string(index=False))
print("Saved:", EXP_SUMMARY_CSV)

# Sanity check: show how one sample sentence looks under every representation mode.
example = "วันนี้จะไปเที่ยวน้าาา"
toks = greedy_tokenize(example, TRIE)
isv = [t in verb_set for t in toks]

print("\nExample text:", example)
print("Tokens:", " ".join(toks))
print("b0:", " ".join(build_repr(toks, isv, None, "baseline0_words")))

# Baseline modes with the first random cluster map.
vmap_rand0 = random_maps[0]
print("b1:", " ".join(build_repr(toks, isv, vmap_rand0, "other_verb_vclass")))
print("b2:", " ".join(build_repr(toks, isv, vmap_rand0, "other_vclass")))
print("b3:", " ".join(build_repr(toks, isv, vmap_rand0, "verb_vclass")))
print("b4:", " ".join(build_repr(toks, isv, vmap_rand0, "vclass_only")))

# Test modes with the first real cluster map, only when clusters were loaded.
if RUN_TESTS_SET and actual_maps:
    vmap_real0 = actual_maps[0]
    print("t1:", " ".join(build_repr(toks, isv, vmap_real0, "other_verb_vclass")))
    print("t2:", " ".join(build_repr(toks, isv, vmap_real0, "other_vclass")))
    print("t3:", " ".join(build_repr(toks, isv, vmap_real0, "verb_vclass")))
    print("t4:", " ".join(build_repr(toks, isv, vmap_real0, "vclass_only")))

**Centroid finder**

In [None]:
!pip -q install pandas numpy scipy scikit-learn tqdm pynndescent

import os, json, hashlib
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.sparse as sp

from tqdm.auto import tqdm
from sklearn.preprocessing import normalize
from sklearn.manifold import SpectralEmbedding
from pynndescent import NNDescent

# Hide all CUDA devices from libraries loaded after this point.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

from google.colab import drive
drive.mount("/content/drive")

BASE_DIR = Path("/content/drive/MyDrive/Colab_Datasets/VV")

# Cluster assignments to analyse, given relative to BASE_DIR and resolved to an absolute path.
REL_CLUSTER_PATH = Path("../VV/clusters/VV_clusters.csv") ###
CLUSTERS_CSV = (BASE_DIR / REL_CLUSTER_PATH).resolve()

OUT_DIR = (BASE_DIR / "centroid_outputs_VV").resolve() ###
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Column of the cluster CSV holding the labels used for the centroids.
SEED_COL = "cluster_0"

# Number of verbs closest to each centroid listed in the summary.
TOP_N = 10

# KNN: neighbours per verb in the similarity graph; EMB_DIMS: spectral embedding size.
# NOTE(review): K_EXPECTED is not referenced anywhere in this cell — confirm whether it is still needed.
KNN = 10
EMB_DIMS = 200
K_EXPECTED = 100

# USE_COL: weight column read from the pair CSVs; rows with a weight below MIN_COUNT are dropped.
# CHUNK: rows per read_csv chunk.
# VERBVERB_MODE: "out", "in" or "both" (see apply_verbverb_mode).
# WEIGHT_TRANSFORM: "raw", "log1p" or "sqrt".
USE_COL = "total_occurrences"
MIN_COUNT = 0.0
CHUNK = 1_000_000
VERBVERB_MODE = "both" ###
WEIGHT_TRANSFORM = "raw"

# Feature blocks to include in the verb feature matrix; keys match default_feature_specs().
FEATURE_ON = {
    "verb-verb": True, ###
    "verb-(noun)-verb": False, ###
    "verb-(verb)-verb": False, ###
    "verb-(adjective)-verb": False, ###
    "verb-(adverb)-verb": False, ###
    "verb-(pronoun)-verb": False, ###
    "noun-verb": False, ###
    "adjective-verb": False, ###
    "adverb-verb": False, ###
    "pronoun-verb": False, ###
    "verb-noun": False, ###
    "verb-adjective": False, ###
    "verb-adverb": False, ###
    "verb-pronoun": False, ###
}

# Fail fast on missing inputs before any heavy work starts.
if not CLUSTERS_CSV.exists():
    raise FileNotFoundError(f"Cluster CSV not found: {CLUSTERS_CSV}")

VERB_LIST_CSV = BASE_DIR / "Wiktionary_Thai_verb_26122025.csv"
if not VERB_LIST_CSV.exists():
    raise FileNotFoundError(f"Verb list CSV not found: {VERB_LIST_CSV}")

print("Using clusters file:", CLUSTERS_CSV)
print("Saving outputs to:", OUT_DIR)

# Output locations: embedding cache (+ its fingerprint) and the per-SEED_COL centroid files.
embedding_cache_npz = OUT_DIR / "embedding_cache.npz"
embedding_cache_meta = OUT_DIR / "embedding_cache.meta.json"
centroids_npz = OUT_DIR / f"centroids__{SEED_COL}.npz"
centroids_summary_csv = OUT_DIR / f"centroids_summary__{SEED_COL}.csv"

def default_feature_specs(base_dir: Path):
    """Describe where each co-occurrence feature lives and how to read it.

    Returns a dict keyed by feature name. Every spec has a ``path``
    (``base_dir / "Thaisum_<feature>_pairs.csv"``) and a ``kind``:

    * ``"verbverb"`` -- verb-to-verb pairs, optionally with a bridging word
      class in parentheses.
    * ``"verbctx"``  -- verb/context pairs; the spec additionally names the
      ``verb_col`` and ``ctx_col`` columns of the CSV.
    """
    def pairs_csv(feature):
        return base_dir / f"Thaisum_{feature}_pairs.csv"

    specs = {}

    # Verb-verb features: direct, then one per bridging word class.
    for bridge in (None, "noun", "verb", "adjective", "adverb", "pronoun"):
        feature = "verb-verb" if bridge is None else f"verb-({bridge})-verb"
        specs[feature] = {"path": pairs_csv(feature), "kind": "verbverb"}

    # Verb-context features: context word before the verb, then after it.
    context_classes = ("noun", "adjective", "adverb", "pronoun")
    for template in ("{ctx}-verb", "verb-{ctx}"):
        for ctx in context_classes:
            feature = template.format(ctx=ctx)
            specs[feature] = {
                "path": pairs_csv(feature),
                "kind": "verbctx",
                "verb_col": "verb",
                "ctx_col": ctx,
            }

    return specs

def load_verbs(verb_list_csv: Path):
    """Read the verb list (first column of a header-less CSV).

    Entries are converted to ``str``, stripped of stray BOM characters and
    surrounding whitespace, and empty entries are dropped.

    Returns:
        ``(verbs, vid)`` where ``verbs`` is the cleaned list in file order and
        ``vid`` maps each verb to its index in that list (for a repeated verb,
        the index of its last occurrence).
    """
    first_column = pd.read_csv(verb_list_csv, header=None, encoding="utf-8-sig")[0]
    cleaned = first_column.astype(str).str.replace("\ufeff", "", regex=False).str.strip()
    verbs = [verb for verb in cleaned.tolist() if verb != ""]
    vid = dict(zip(verbs, range(len(verbs))))
    return verbs, vid

# verbs: ordered verb list; vid: verb -> row index; V: number of rows of every feature matrix.
verbs, vid = load_verbs(VERB_LIST_CSV)
V = len(verbs)
print("Loaded verbs:", V)

def read_verb_verb_matrix(path: Path, *, vid: dict, V: int,
                          prefix_col: str, suffix_col: str,
                          use_col: str, min_count: float, chunk: int,
                          weight_transform: str):
    """Read a verb-verb pair CSV into a ``V x V`` sparse weight matrix.

    The file is streamed in chunks of *chunk* rows. Rows with missing values,
    with a weight below *min_count*, or whose prefix/suffix is not in *vid*
    are skipped. Entry ``[i, j]`` accumulates the (optionally transformed)
    weight of all pairs with prefix verb ``i`` and suffix verb ``j``; the
    diagonal is cleared.

    Args:
        weight_transform: ``"raw"``, ``"log1p"`` or ``"sqrt"``.

    Raises:
        FileNotFoundError: if *path* does not exist.
        ValueError: for an unknown *weight_transform*.
        RuntimeError: if no usable pair was found.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    src_parts, dst_parts, weight_parts = [], [], []
    reader = pd.read_csv(
        path,
        usecols=[prefix_col, suffix_col, use_col],
        chunksize=int(chunk),
        encoding="utf-8-sig",
    )

    for part in tqdm(reader, desc=f"Reading {path.name}"):
        part = part.dropna()
        if part.empty:
            continue
        part = part[part[use_col] >= min_count]
        if part.empty:
            continue

        # Unknown verbs map to NaN and are filtered out by the mask.
        src = part[prefix_col].astype(str).map(vid)
        dst = part[suffix_col].astype(str).map(vid)
        known = src.notna() & dst.notna()
        if not known.any():
            continue

        src_ids = src[known].astype(np.int32).to_numpy()
        dst_ids = dst[known].astype(np.int32).to_numpy()
        weights = part.loc[known, use_col].to_numpy(np.float32)

        if weight_transform == "log1p":
            weights = np.log1p(weights).astype(np.float32)
        elif weight_transform == "sqrt":
            weights = np.sqrt(weights).astype(np.float32)
        elif weight_transform != "raw":
            raise ValueError(f"Unknown weight_transform: {weight_transform}")

        src_parts.append(src_ids)
        dst_parts.append(dst_ids)
        weight_parts.append(weights)

    if not src_parts:
        raise RuntimeError(f"No usable edges in {path.name}.")

    coords = (np.concatenate(src_parts), np.concatenate(dst_parts))
    values = np.concatenate(weight_parts)

    A = sp.coo_matrix((values, coords), shape=(V, V), dtype=np.float32).tocsr()
    A.sum_duplicates()
    A.setdiag(0)  # drop verb-with-itself pairs
    A.eliminate_zeros()
    return A

def read_verb_context_matrix(path: Path, *, vid: dict, V: int,
                            verb_col: str, ctx_col: str,
                            use_col: str, min_count: float, chunk: int,
                            weight_transform: str):
    """Read a verb/context pair CSV into a ``V x n_contexts`` sparse matrix.

    The file is streamed in chunks of *chunk* rows. Rows with missing values,
    with a weight below *min_count*, or whose verb is not in *vid* are
    skipped. Context words receive column ids in order of first appearance.

    Args:
        weight_transform: ``"raw"``, ``"log1p"`` or ``"sqrt"``.

    Raises:
        FileNotFoundError: if *path* does not exist.
        ValueError: for an unknown *weight_transform*.
        RuntimeError: if no usable pair was found.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    context_ids = {}
    row_parts, col_parts, weight_parts = [], [], []
    reader = pd.read_csv(
        path,
        usecols=[verb_col, ctx_col, use_col],
        chunksize=int(chunk),
        encoding="utf-8-sig",
    )

    for part in tqdm(reader, desc=f"Reading {path.name}"):
        part = part.dropna()
        if part.empty:
            continue
        part = part[part[use_col] >= min_count]
        if part.empty:
            continue

        verb_ids = part[verb_col].astype(str).map(vid)
        known = verb_ids.notna()
        if not known.any():
            continue

        row_ids = verb_ids[known].astype(np.int32).to_numpy()
        context_words = part.loc[known, ctx_col].astype(str).to_numpy()
        weights = part.loc[known, use_col].to_numpy(np.float32)

        if weight_transform == "log1p":
            weights = np.log1p(weights).astype(np.float32)
        elif weight_transform == "sqrt":
            weights = np.sqrt(weights).astype(np.float32)
        elif weight_transform != "raw":
            raise ValueError(f"Unknown weight_transform: {weight_transform}")

        # setdefault evaluates len(context_ids) before inserting, so a new
        # word gets the next free column id; the mapping persists across chunks.
        col_ids = np.array(
            [context_ids.setdefault(word, len(context_ids)) for word in context_words],
            dtype=np.int32,
        )

        row_parts.append(row_ids)
        col_parts.append(col_ids)
        weight_parts.append(weights)

    if not row_parts:
        raise RuntimeError(f"No usable edges in {path.name}.")

    coords = (np.concatenate(row_parts), np.concatenate(col_parts))
    values = np.concatenate(weight_parts)

    X = sp.coo_matrix((values, coords), shape=(V, len(context_ids)), dtype=np.float32).tocsr()
    X.sum_duplicates()
    return X

def apply_verbverb_mode(A: sp.csr_matrix, mode: str):
    """Select which direction of a verb-verb matrix is used as features.

    * ``"out"``  -- *A* itself (returned as is, not copied).
    * ``"in"``   -- the transpose of *A*, in CSR format.
    * ``"both"`` -- *A* and its transpose side by side, in CSR format.

    Raises:
        ValueError: for any other *mode*.
    """
    if mode == "both":
        return sp.hstack([A, A.T], format="csr")
    if mode == "in":
        return A.T.tocsr()
    if mode == "out":
        return A
    raise ValueError("VERBVERB_MODE must be out/in/both")

def assemble_matrix(feature_names: list[str], mats: dict[str, sp.csr_matrix]):
    """Concatenate the selected feature blocks column-wise.

    Blocks are taken from *mats* in the order of *feature_names*. A single
    block is returned as is; several blocks are stacked into one CSR matrix.
    """
    selected = [mats[name] for name in feature_names]
    first = selected[0]
    if len(selected) == 1:
        return first
    return sp.hstack(selected, format="csr")

# Load every feature switched on in FEATURE_ON and stack them into one verb x feature matrix X.
specs = default_feature_specs(BASE_DIR)
feature_names = [k for k, v in FEATURE_ON.items() if v]
if not feature_names:
    raise RuntimeError("No features selected in FEATURE_ON")

mats = {}
for f in feature_names:
    spec = specs[f]
    p = Path(spec["path"])
    if not p.exists():
        raise FileNotFoundError(f"Missing feature file: {p}")

    if spec["kind"] == "verbverb":
        # Verb-verb files are read with "prefix"/"suffix" columns, then oriented by VERBVERB_MODE.
        A = read_verb_verb_matrix(
            p, vid=vid, V=V,
            prefix_col="prefix", suffix_col="suffix",
            use_col=USE_COL, min_count=MIN_COUNT, chunk=CHUNK,
            weight_transform=WEIGHT_TRANSFORM
        )
        Xf = apply_verbverb_mode(A, VERBVERB_MODE)
    else:
        # Verb-context files name their own verb/context columns in the spec.
        Xf = read_verb_context_matrix(
            p, vid=vid, V=V,
            verb_col=spec["verb_col"], ctx_col=spec["ctx_col"],
            use_col=USE_COL, min_count=MIN_COUNT, chunk=CHUNK,
            weight_transform=WEIGHT_TRANSFORM
        )

    Xf = Xf.tocsr()
    Xf.sum_duplicates()
    mats[f] = Xf

X = assemble_matrix(feature_names, mats).tocsr()
X.sum_duplicates()
print("Assembled X shape:", X.shape, "nnz:", X.nnz)
print("Features used:", feature_names)

def fingerprint():
    """Fingerprint the settings that determine the cached embedding.

    Returns ``(md5_hex, config)`` where ``config`` lists the selected
    features, graph/embedding sizes, weight options and the verb count, and
    ``md5_hex`` is the MD5 of its key-sorted JSON form. Only these settings
    are hashed; the contents of the feature files are not.
    """
    config = {}
    config["features"] = feature_names
    config["knn"] = int(KNN)
    config["emb_dims"] = int(EMB_DIMS)
    config["use_col"] = USE_COL
    config["min_count"] = float(MIN_COUNT)
    config["verbverb_mode"] = VERBVERB_MODE
    config["weight_transform"] = WEIGHT_TRANSFORM
    config["V"] = int(V)

    canonical = json.dumps(config, sort_keys=True)
    digest = hashlib.md5(canonical.encode("utf-8")).hexdigest()
    return digest, config

# Compute (or reload) a spectral embedding Z of the verbs that have at least one feature.
# active_ids[k] is the index into `verbs` of the verb stored in row k of Z.
fp, fp_obj = fingerprint()

Z = None
active_ids = None

# Reuse the cached embedding only when it was produced with the same fingerprint.
if embedding_cache_npz.exists() and embedding_cache_meta.exists():
    try:
        meta = json.loads(embedding_cache_meta.read_text(encoding="utf-8"))
        if meta.get("fingerprint") == fp:
            npz = np.load(embedding_cache_npz, allow_pickle=False)
            active_ids = npz["active_ids"].astype(np.int32)
            Z = npz["Z"].astype(np.float32)
            print("Loaded cached embedding:", embedding_cache_npz, "Z:", Z.shape)
    except Exception:
        # Unreadable cache: fall through and recompute.
        Z = None

if Z is None:
    print("Computing embedding on CPU...")

    # Keep only verbs with at least one non-zero feature.
    active = X.getnnz(axis=1) > 0
    active_ids = np.where(active)[0].astype(np.int32)
    X_act = X[active]
    n = X_act.shape[0]
    if n < 5:
        raise RuntimeError(f"Too few active verbs: {n}")

    print("Active verbs:", n)

    Xn = normalize(X_act, axis=1)

    k_eff = int(min(int(KNN), n - 1))
    if k_eff < 1:
        raise RuntimeError("k_eff < 1")

    # Ask for one extra neighbour because column 0 of the graph is dropped below.
    nn = NNDescent(
        Xn,
        n_neighbors=k_eff + 1,
        metric="cosine",
        random_state=0,
        n_jobs=-1,
    )
    knn_idx, knn_dist = nn.neighbor_graph
    knn_sim = 1.0 - knn_dist

    # NOTE(review): this assumes column 0 is always the query point itself; with
    # duplicate feature rows that may not hold — confirm against pynndescent.
    I = np.repeat(np.arange(n, dtype=np.int32), k_eff)
    J = knn_idx[:, 1:k_eff+1].reshape(-1).astype(np.int32)
    S = knn_sim[:, 1:k_eff+1].reshape(-1).astype(np.float32)

    # Keep only edges with strictly positive similarity.
    keep = S > 0.0
    I, J, S = I[keep], J[keep], S[keep]

    # Symmetrise by adding the transpose (an edge found in both directions gets both weights summed).
    W = sp.coo_matrix((S, (I, J)), shape=(n, n), dtype=np.float32).tocsr()
    W = (W + W.T).tocsr()
    W.sum_duplicates()

    # Small value on the diagonal — presumably for numerical stability of the embedding; TODO confirm.
    W_emb = (W + sp.eye(n, dtype=np.float32) * 1e-6).tocsr()

    emb_dim_used = int(min(int(EMB_DIMS), n - 2))
    if emb_dim_used < 2:
        raise RuntimeError(f"emb_dim_used too small: {emb_dim_used}")

    Z = SpectralEmbedding(
        n_components=emb_dim_used,
        affinity="precomputed",
        random_state=0,
    ).fit_transform(W_emb)

    # Row-normalise so that dot products between rows are cosine similarities.
    Z = normalize(Z, axis=1).astype(np.float32)

    np.savez_compressed(embedding_cache_npz, active_ids=active_ids, Z=Z)
    embedding_cache_meta.write_text(json.dumps({"fingerprint": fp, "config": fp_obj}, ensure_ascii=False, indent=2),
                                    encoding="utf-8")
    print("Saved embedding cache:", embedding_cache_npz)

print("Embedding ready. Z:", Z.shape, "active_ids:", active_ids.shape)

# Compute one unit-length centroid per cluster of SEED_COL in the embedding space,
# and list the TOP_N member verbs closest to each centroid.
dfc = pd.read_csv(CLUSTERS_CSV, encoding="utf-8-sig")
if "verb" not in dfc.columns:
    raise ValueError("Cluster CSV must contain 'verb' column")
if SEED_COL not in dfc.columns:
    raise ValueError(f"{SEED_COL} not found in cluster CSV")

dfc["verb"] = dfc["verb"].astype(str).str.strip()

# verb -> label; non-numeric or missing labels become -1, as do verbs absent from the CSV.
dfc_map = dict(zip(dfc["verb"].tolist(), pd.to_numeric(dfc[SEED_COL], errors="coerce").fillna(-1).astype(int).tolist()))
labels_full = np.array([dfc_map.get(v, -1) for v in verbs], dtype=np.int32)

# Labels aligned with the rows of Z.
labels_active = labels_full[active_ids]
valid = labels_active >= 0
if not np.any(valid):
    raise RuntimeError("No labeled active verbs found (all -1?)")

# Cluster ids are taken to run from 0 to the largest label seen.
K_detected = int(labels_active[valid].max() + 1)
print("Detected K (from labels):", K_detected)

verbs_active = np.array([verbs[i] for i in active_ids], dtype=object)

centroids = np.zeros((K_detected, Z.shape[1]), dtype=np.float32)
sizes = np.zeros((K_detected,), dtype=np.int32)

rows = []
for c in range(K_detected):
    idx = np.where(labels_active == c)[0]
    sizes[c] = int(idx.size)

    if idx.size == 0:
        # Cluster id with no active member: zero centroid, empty verb list.
        centroids[c] = 0.0
        top_verbs = ""
    else:
        # Mean of the member embeddings, rescaled to unit length (epsilon guards a zero mean).
        cen = Z[idx].mean(axis=0)
        cen = cen / (np.linalg.norm(cen) + 1e-12)
        centroids[c] = cen.astype(np.float32)

        sims = Z @ centroids[c]  # cosine sim
        # Rank only the cluster's own members by similarity to the centroid.
        sims_in = sims[idx]
        top_local = idx[np.argsort(-sims_in)[:TOP_N]]
        top_verbs = ", ".join(verbs_active[top_local].tolist())

    rows.append({
        "cluster": int(c),
        "size": int(sizes[c]),
        "top_verbs": top_verbs
    })

# Largest clusters first.
summary = pd.DataFrame(rows).sort_values("size", ascending=False).reset_index(drop=True)

np.savez_compressed(centroids_npz, centroids=centroids, sizes=sizes, clusters=np.arange(K_detected, dtype=np.int32))
summary.to_csv(centroids_summary_csv, index=False, encoding="utf-8-sig")

print("Saved centroids:", centroids_npz)
print("Saved summary:", centroids_summary_csv)
summary.head(20)