<a href="https://colab.research.google.com/github/Agnieszkachr/DH-AI/blob/main/cosine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ╔══════════════════════════════════════════════════════════╗
# ║ Validation Metrics — robust, no CLTK, Python 3.11       ║
# ╚══════════════════════════════════════════════════════════╝
!pip -q install numpy pandas scipy matplotlib

import json, itertools, math, re, unicodedata
from pathlib import Path
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# ──────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────
def strip_accents(text: str) -> str:
    """Return *text* with all combining diacritical marks removed.

    The string is first decomposed to Unicode NFD so that accented
    characters become a base character followed by combining marks; every
    character whose Unicode category is ``Mn`` (non-spacing mark) is then
    dropped.  The result is left in decomposed form (it is not recomposed).
    """
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

def load_json(fname):
    """Read the UTF-8 encoded file *fname* and return its parsed JSON content.

    Raises whatever ``Path.read_text`` raises for a missing/unreadable file
    (e.g. ``FileNotFoundError``) and ``json.JSONDecodeError`` for bad JSON.
    """
    source = Path(fname)
    payload = source.read_text(encoding="utf-8")
    return json.loads(payload)

def build_runs_df(raw):
    """Flatten one model's validation output into one row per (unit, run).

    Parameters
    ----------
    raw : dict
        Must contain ``"narrative_units"``, an iterable of dicts that each
        carry ``"unit_id"`` and ``"validation_stats"``.  ``validation_stats``
        maps a category name to a dict whose ``"results"`` sequence holds one
        value per run.

    Returns
    -------
    pandas.DataFrame
        Columns ``unit``, ``run`` (``"r00"``, ``"r01"``, ...) followed by one
        column per category.  The ``unit`` and ``run`` columns are present
        even when no rows were produced.

    Notes
    -----
    The number of runs of a unit is taken from its first category; every
    other category is indexed with the same run numbers, so a category with
    fewer results raises ``IndexError``.
    """
    rows = []
    for unit in raw["narrative_units"]:
        uid = unit["unit_id"]
        stats = unit["validation_stats"]
        if not stats:
            # No categories at all: nothing to emit for this unit.  (Without
            # this guard next(iter(...)) below would raise StopIteration.)
            continue
        n_iter = len(next(iter(stats.values()))["results"])
        for i in range(n_iter):
            vec = {cat: stats[cat]["results"][i] for cat in stats}
            rows.append({"unit": uid, "run": f"r{i:02d}", **vec})
    if not rows:
        # Keep the key columns so callers can still address df["unit"] / df["run"].
        return pd.DataFrame(columns=["unit", "run"])
    return pd.DataFrame(rows)

def normalise_uid(u):
    """Canonicalise a unit identifier to the form ``unit_NNN``.

    The first run of digits found in ``str(u)`` is read as an integer and
    zero-padded to at least three places, so ``'unit_001'``, ``'Unit1'``,
    ``'1'`` and ``1`` all become ``'unit_001'``.  If the value contains no
    digit, its string form is returned with surrounding whitespace removed.
    """
    text = str(u)
    digits = re.search(r'\d+', text)
    if digits is None:
        return text.strip()
    number = int(digits.group())
    return f"unit_{number:03d}"

# Score categories, in the column order used for every vector below.
# Kept as a list because it is used directly as a DataFrame column selector.
cats = [
    "Worship & Praise",
    "Judicial Wrath & Punitive Action",
    "Lament, Persecution & Endurance",
    "Victory, Consolation & New-Creation Hope",
    "Cosmic Warfare & Deception",
    "Prophetic Exhortation & Warning",
    "Theophanic Awe & Terror",
]

# ──────────────────────────────────────────────────────────
# Load data
# ──────────────────────────────────────────────────────────
# Expand each model's JSON dump into one row per (unit, run).
gem_runs = build_runs_df(load_json("gemini_flash.json"))
mis_runs = build_runs_df(load_json("mistral_saba24.json"))

# Canonicalise unit ids so both models are keyed identically.
gem_runs["unit"] = gem_runs["unit"].map(normalise_uid)
mis_runs["unit"] = mis_runs["unit"].map(normalise_uid)

# Warn if any unit lacks 10 runs.  Both models feed the metrics computed
# later, so both are checked; the per-unit run count is computed once per model.
for model_label, model_runs in (("Gemini-Flash", gem_runs),
                                ("Mistral", mis_runs)):
    runs_per_unit = model_runs.groupby("unit")["run"].nunique()
    bad = runs_per_unit[runs_per_unit.ne(10)]
    if not bad.empty:
        print(f"⚠️ Units with ≠10 runs ({model_label}):", bad.to_dict())

# ──────────────────────────────────────────────────────────
# 1️⃣  Intra-model stability (Gemini)
# ──────────────────────────────────────────────────────────
def mean_pairwise_cos(df, columns=None):
    """Return the mean, over units, of the mean pair-wise cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per run, with a ``unit`` column and one column per score.
    columns : list of str, optional
        Score columns that make up each run's vector.  Defaults to the
        module-level ``cats`` list.

    Returns
    -------
    float
        For every unit the cosine similarity (``1 - scipy cosine distance``)
        of each pair of runs is averaged; those per-unit means are averaged
        again.  Units with fewer than two runs have no pairs and are left
        out instead of contributing NaN.  ``nan`` is returned when no unit
        has at least two runs.
    """
    if columns is None:
        columns = cats
    per_unit = []
    for _, sub in df.groupby("unit"):
        vecs = sub[columns].to_numpy()
        if len(vecs) < 2:
            # A single run cannot be compared with anything; np.mean([])
            # would be NaN and would turn the overall mean into NaN too.
            continue
        sims = [1 - cosine(a, b) for a, b in itertools.combinations(vecs, 2)]
        per_unit.append(np.mean(sims))
    if not per_unit:
        return np.nan
    return np.mean(per_unit)

# Run-to-run stability of the Gemini scores: per-unit mean pair-wise cosine
# similarity between runs, averaged over units and reported to 3 decimals.
intra_cos = mean_pairwise_cos(gem_runs)
print(f"Mean pair-wise cosine (Gemini-Flash): {intra_cos:.3f}")

# ──────────────────────────────────────────────────────────
# 2️⃣  Cross-model replication
# ──────────────────────────────────────────────────────────
def unit_mean(d):
    """Return the per-unit mean of every category column, sorted by unit id."""
    return d.groupby("unit")[cats].mean().sort_index()


gem_mean = unit_mean(gem_runs)
mis_mean = unit_mean(mis_runs)

# Correlate only units scored by BOTH models, selected in the same order from
# each table.  Flattening the two tables independently would pair values of
# different units (or fail on unequal lengths) whenever the unit sets differ.
shared_units = gem_mean.index.intersection(mis_mean.index)
if len(shared_units) < max(len(gem_mean), len(mis_mean)):
    print(f"⚠️ Cross-model comparison restricted to {len(shared_units)} shared units "
          f"(Gemini: {len(gem_mean)}, Mistral: {len(mis_mean)}).")

if shared_units.empty:
    # Nothing to compare; report NaN rather than calling spearmanr on empty input.
    rho = p = np.nan
else:
    rho, p = spearmanr(gem_mean.loc[shared_units, cats].to_numpy().flatten(),
                       mis_mean.loc[shared_units, cats].to_numpy().flatten())
print(f"Spearman ρ (Gemini vs Mistral): {rho:.3f}  (p = {p:.4g})")

# ──────────────────────────────────────────────────────────
# 3️⃣  Lemma-frequency proxy  (regex only, no CLTK)
# ──────────────────────────────────────────────────────────
try:
    greek_texts = load_json("unit_texts_greek.json")
except FileNotFoundError:
    print("unit_texts_greek.json not found → skipping lemma proxy test.")
else:
    token_re = re.compile(r"[\u0370-\u03FF\u1F00-\u1FFF]+")

    def _fold_greek(text):
        """Lower-case, strip diacritics and map final sigma (ς) to σ."""
        return strip_accents(text.lower()).replace("\u03c2", "\u03c3")

    # The targets go through the same folding as the corpus so that both
    # sides are compared in one canonical spelling.
    # NOTE(review): matching is by exact folded word form, so only surface
    # forms identical to these dictionary forms are counted; inflected forms
    # presumably need a lemmatiser or stem matching — confirm intent.
    target_lemmas = {_fold_greek(lemma) for lemma in
                     ("αξιος", "προσκυνεω", "ευλογια", "δοξα", "τιμη", "κρατος")}

    lemma_counts = {}
    for uid, txt in greek_texts.items():
        # Flatten lists / dict values into one string
        if not isinstance(txt, str):
            txt = " ".join(map(str, txt if isinstance(txt, list)
                               else txt.values()))
        # Fold BEFORE tokenising.  Combining accents (U+03xx) lie outside the
        # token character class, so tokenising decomposed text first would
        # split words at every accent; and U+0387 / U+037E (Greek punctuation
        # inside the class) would stay glued to the preceding word.
        tokens = token_re.findall(_fold_greek(txt))
        lemma_counts[normalise_uid(uid)] = sum(t in target_lemmas for t in tokens)

    lemma_df = pd.DataFrame(lemma_counts.items(),
                             columns=["unit", "lemma_count"])
    # Keep the original id as `unit_id` and join on the normalised `unit`.
    merged   = gem_mean.reset_index().rename(columns={"unit": "unit_id"}) \
                       .assign(unit=lambda df: df["unit_id"].map(normalise_uid)) \
                       .merge(lemma_df, on="unit", how="inner")

    if merged.empty:
        # Distinguish "no overlapping units" from "constant counts".
        print("No units shared between model scores and Greek texts → correlation undefined.")
        r = p_l = np.nan
    elif merged["lemma_count"].nunique() < 2:
        print("All lemma counts identical → correlation undefined.")
        r = p_l = np.nan
    else:
        r, p_l = spearmanr(merged["lemma_count"], merged["Worship & Praise"])
        print(f"Lemma proxy vs ‘Worship & Praise’: r = {r:.3f}  (p = {p_l:.4g})")

    # Scatter plot if we have >0 points
    if not merged.empty:
        plt.scatter(merged["lemma_count"], merged["Worship & Praise"], alpha=.7)
        plt.xlabel("Cultic lemma count per unit")
        plt.ylabel("Gemini ‘Worship & Praise’ score")
        plt.title("Lemma proxy vs model score")
        plt.grid(True); plt.show()

# ──────────────────────────────────────────────────────────
# 4️⃣  Summary
# ──────────────────────────────────────────────────────────
# Fixed interpretation guide for the three metrics printed above.
summary_lines = (
    "\nSummary:",
    " • Internal stability: cosine ≥ 0.90 is excellent.",
    " • Cross-model agreement: |ρ| ≥ 0.70 is strong (p ≪ .05).",
    " • A positive, significant lemma-proxy r suggests the model’s "
    "‘Worship & Praise’ vector tracks real cultic vocabulary.",
)
for summary_line in summary_lines:
    print(summary_line)

Mean pair-wise cosine (Gemini-Flash): 1.000
Spearman ρ (Gemini vs Mistral): 0.786  (p = 4.577e-32)
All lemma counts identical → correlation undefined.

Summary:
 • Internal stability: cosine ≥ 0.90 is excellent.
 • Cross-model agreement: |ρ| ≥ 0.70 is strong (p ≪ .05).
 • A positive, significant lemma-proxy r suggests the model’s ‘Worship & Praise’ vector tracks real cultic vocabulary.
