In [1]:
"""
f_5_average_diversity.ipynb
───────────────────────────────────────────────────────────────────────────────
Average diversity vs augmentation size (fold-agnostic) and correlation with
performance gains for linear and tree-based MTRs.

This script:
1) Computes **Average diversity** of the actual training set geometry used at each
   augmentation level, measured as the **mean pairwise cosine dissimilarity**
   (1 − cosine) in the **e5_base** embedding space. We use a fold-agnostic setup:
   the set is **all 96 seeds** plus the first K synthetics from the ordered
   Script-A finals (method = `gemma`).
   • Efficient computation uses a streaming identity to avoid n×n Gram matrices:
     AvgDiv = 1 − (‖∑u_i‖² − n) / (n(n − 1)) with unit vectors u_i.
2) Loads per-fold RRMSE results from Script-C (student scoring) and aggregates the
   **global median RRMSE** (median over all folds×domains) for the baseline (0%)
   and each augmentation size.
3) Builds two report tables (CSV):
   • Linear models: {chain_lr, local_lasso}.
   • Tree-based models: {local_rf, global_rf, chain_rf}.
   Each table mirrors the article’s layout (columns = {0,10,20,50,100,200,400}),
   with model rows showing global-median RRMSE and a bottom row showing
   **Avg. diversity (1−cos) [E5]** as absolute values, with **relative change vs 0%**
   in parentheses.
4) Computes **Spearman ρ** between AvgDiv and **ΔRRMSE** (baseline − full) for
   the linear group and for the tree-based group (family-level Δ is the mean of
   member models at each %K).
5) Performs a **monotonicity check** for AvgDiv across {0,10,20,50,100,200,400}.

Inputs:
- outputs/e_1_synth_augmentation/g_final_n3072_gemma.csv
- outputs/results/e5_base_vectors.npy (or fallback to outputs/e_3_student_scoring/cache/X_seed_e5_base.npy)
- outputs/e_2_teacher_labeling/cache/synth_embeds/g_final_n3072_gemma__e5_base.npy
  and companion index CSV (or rebuilt on the fly if missing)
- outputs/e_3_student_scoring/results/
    rrmse_perfold_e5_base__{reg}__gemma__pct{P}_K{K}__Mmax{M}__baseline.csv
    rrmse_perfold_e5_base__{reg}__gemma__pct{P}_K{K}__Mmax{M}__full.csv

Outputs:
- outputs/f_final_report/f_5_average_diversity/
    run.log
    tables/
      linear_rrmse_plus_diversity.csv
      tree_rrmse_plus_diversity.csv
      diversity_values.csv
    stats/
      spearman_and_monotonicity.txt

Notes:
- Embedding space is fixed to **e5_base** for primary reporting.
- Method is fixed to **gemma** (match your Script-A/Script-C runs).
- Augmentation sizes follow the percent-based protocol used in Script-C:
  %K ∈ {10, 20, 50, 100, 200, 400} with K = round(% × 96 / 100) = {10,19,48,96,192,384}.
- The baseline (0%) diversity is computed on the 96 seeds only.
- The baseline RRMSE shown in column 0 is taken from the baseline variant files
  (these are seeds-only predictions; Script-C already writes them per %K for
  naming symmetry and they should be identical across %K).

"""

# ─────────────────────────────────────────────────────────────────────────────
# Imports
# ─────────────────────────────────────────────────────────────────────────────
from __future__ import annotations

import logging
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from scipy.stats import spearmanr

try:
    # Optional, only needed if synth caches are missing and we need to rebuild.
    from sentence_transformers import SentenceTransformer
except Exception:
    SentenceTransformer = None  # guard for environments where not installed

# ─────────────────────────────────────────────────────────────────────────────
# Paths & constants (same style as other scripts)
# ─────────────────────────────────────────────────────────────────────────────

def project_root(marker: str = "LICENSE") -> Path:
    """Locate the project root by searching upward for a marker file.

    The search starts at the resolved current working directory and then
    visits each of its ancestors in turn.

    Args:
        marker: Name of a regular file whose presence identifies the root.

    Returns:
        The nearest directory (cwd first) that contains ``marker``. When no
        directory on the way up contains it, the resolved cwd is returned.
    """
    start = Path.cwd().resolve()
    candidates = [start, *start.parents]
    found = next((folder for folder in candidates if (folder / marker).is_file()), None)
    if found is None:
        return Path.cwd().resolve()
    return found

# Resolve the project root once and make it the working directory, so the
# process runs from ROOT no matter where the notebook was launched.
ROOT = project_root()
os.chdir(ROOT)

# Input data and this script's own output tree.
DATA_DIR = ROOT / "data"
OUT_DIR  = ROOT / "outputs" / "f_final_report" / "f_5_average_diversity"
TABLES_DIR = OUT_DIR / "tables"
STATS_DIR  = OUT_DIR / "stats"

# Output directories of the upstream pipeline stages that are read here.
G1_DIR  = ROOT / "outputs" / "e_1_synth_augmentation"
G2_DIR  = ROOT / "outputs" / "e_2_teacher_labeling"
G3_DIR  = ROOT / "outputs" / "e_3_student_scoring"  
RES_DIR = G3_DIR / "results"

# parents=True also creates OUT_DIR, which must exist before the log file
# handler below opens run.log.
for p in (TABLES_DIR, STATS_DIR):
    p.mkdir(parents=True, exist_ok=True)

LOG_FILE = OUT_DIR / "run.log"
# Drop handlers left on the root logger (e.g. from a previous run of this cell)
# so messages are not emitted twice; force=True below has the same effect.
for h in list(logging.root.handlers):
    logging.root.removeHandler(h)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[
        # Append to run.log so successive runs accumulate in one file.
        logging.FileHandler(str(LOG_FILE), mode="a", encoding="utf-8"),
        # Use sys.__stdout__ when it is set, otherwise the current sys.stdout.
        # NOTE(review): presumably chosen to bypass a replaced notebook stdout — confirm.
        logging.StreamHandler(getattr(__import__('sys'), "__stdout__", None) or __import__('sys').stdout),
    ],
    force=True,
)
log = logging.getLogger(__name__)

# ─────────────────────────────────────────────────────────────────────────────
# Config
# ─────────────────────────────────────────────────────────────────────────────
# Generation method tag; it appears in the finals/labels/results file names.
METHOD = "gemma"
# Embedding-space key used in cache and results file names.
EMB_KEY = "e5_base"
# Model identifier passed to SentenceTransformer when a cache must be rebuilt.
EMBEDDING_REPO = "embaas/sentence-transformers-multilingual-e5-base"

N_SEEDS = 96
# Augmentation levels, expressed as a percentage of the seed count.
PCTS = [10, 20, 50, 100, 200, 400]
# Number of synthetics per level: K = round(pct * N_SEEDS / 100), at least 1.
PCT_TO_K = {p: max(1, int(round(p * N_SEEDS / 100.0))) for p in PCTS}  # {10: 10, 20: 19, 50: 48, 100: 96, 200: 192, 400: 384}
K_LIST = [PCT_TO_K[p] for p in PCTS]
COLS = [0] + PCTS  # table columns (0% baseline + augmented levels)

# Expected per-domain column names in the per-fold files.
# NOTE(review): not referenced elsewhere in this cell; the RRMSE loader selects
# columns by the "rrmse_domain" prefix instead.
TARGET_COLS = [f"rrmse_domain{i}" for i in range(1, 15)]  # columns in per-fold files

# Model groups (map file keys → display names to match article style)
LINEAR_MODELS = ["chain_ERCcv_lr", "local_lasso"]
TREE_MODELS   = ["local_rf", "global_rf", "chain_ERCcv_rf"]
# Keys are the regressor tags used in results file names; values are the row
# labels written to the report tables.
DISPLAY_NAME = {
    "chain_ERCcv_lr": "chain_lr",
    "local_lasso": "local_lasso",
    "local_rf": "local_rf",
    "global_rf": "global_rf",
    "chain_ERCcv_rf": "chain_rf",
}

# ─────────────────────────────────────────────────────────────────────────────
# Embedding utilities (normalized vectors and streaming diversity)
# ─────────────────────────────────────────────────────────────────────────────

def _unit_rows(X: np.ndarray) -> np.ndarray:
    X = np.asarray(X, dtype=np.float32)
    nrm = np.linalg.norm(X, axis=1, keepdims=True)
    nrm = np.clip(nrm, 1e-9, None)
    return X / nrm


def avg_div_from_sum(s: np.ndarray, n: int) -> float:
    """Mean pairwise cosine dissimilarity computed from a vector sum alone.

    For unit vectors u_1..u_n with sum ``s``, ||s||^2 = n + sum_{i != j} u_i·u_j.
    The mean cosine over the n(n−1) ordered pairs is therefore
    (||s||^2 − n) / (n(n−1)), and the result is one minus that value.

    Args:
        s: Sum of the ``n`` unit vectors.
        n: Number of vectors that were summed.

    Returns:
        Average (1 − cosine) over all distinct pairs; 0.0 when ``n <= 1``,
        because no pair exists.
    """
    if n <= 1:
        return 0.0
    squared_norm = float(np.dot(s, s))
    ordered_pairs = n * (n - 1)
    mean_cosine = (squared_norm - n) / ordered_pairs
    return 1.0 - mean_cosine


# ─────────────────────────────────────────────────────────────────────────────
# Loading seeds and synthetics (reusing Script-C conventions where possible)
# ─────────────────────────────────────────────────────────────────────────────

def load_seed_vectors_e5() -> np.ndarray:
    """Load the seed sentence embeddings for ``EMB_KEY`` as a float32 array.

    Lookup order:
      1. ``ROOT/outputs/results/{EMB_KEY}_vectors.npy``
      2. ``G3_DIR/cache/X_seed_{EMB_KEY}.npy``
      3. Re-encode the ``question`` column of ``DATA_DIR/activities.csv``
         (rows sorted by ``activity_id``) with ``EMBEDDING_REPO`` and write
         the result to both cache locations above.

    A cached array whose row count differs from ``N_SEEDS`` is still returned,
    but a warning is logged (for either cache file).

    Returns:
        The embedding matrix with one row per seed text.

    Raises:
        RuntimeError: if no cache exists and SentenceTransformer is not
            importable, or if the activities file does not contain exactly
            ``N_SEEDS`` texts.
    """
    pref = ROOT / "outputs" / "results" / f"{EMB_KEY}_vectors.npy"
    alt = G3_DIR / "cache" / f"X_seed_{EMB_KEY}.npy"

    for cached in (pref, alt):
        if cached.exists():
            X = np.load(cached).astype(np.float32, copy=False)
            if X.shape[0] != N_SEEDS:
                log.warning("Seed vectors found but unexpected row count: %s", X.shape)
            return X

    # As a last resort, rebuild via SentenceTransformer
    if SentenceTransformer is None:
        raise RuntimeError("SentenceTransformer not available to rebuild seed vectors.")
    log.info("Recomputing seed embeddings with %s …", EMBEDDING_REPO)
    acts = pd.read_csv(DATA_DIR / "activities.csv")
    acts = acts.sort_values("activity_id")
    texts = acts["question"].astype(str).tolist()
    if len(texts) != N_SEEDS:
        raise RuntimeError(f"Expected {N_SEEDS} seed texts, got {len(texts)}")
    mdl = SentenceTransformer(EMBEDDING_REPO)
    X = mdl.encode(texts, batch_size=64, show_progress_bar=False, convert_to_numpy=True)
    X = X.astype(np.float32, copy=False)

    # np.save does not create missing directories, so make sure BOTH cache
    # folders exist; otherwise the freshly computed embeddings would be lost
    # to a FileNotFoundError on the second save.
    for target in (pref, alt):
        target.parent.mkdir(parents=True, exist_ok=True)
        np.save(target, X)
    return X


def g1_source_csv(M: int) -> Path:
    """Resolve the finals CSV ``g_final_n{M}_{METHOD}.csv`` under ``G1_DIR``.

    Args:
        M: Size tag embedded in the file name.

    Returns:
        Path of the existing finals CSV.

    Raises:
        FileNotFoundError: if that file does not exist.
    """
    candidate = G1_DIR / f"g_final_n{M}_{METHOD}.csv"
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"Missing Script-A source: {candidate}")


def synth_cache_paths(M: int) -> Tuple[Path, Path]:
    """Return the (embeddings ``.npy``, index ``.csv``) cache pair for size ``M``.

    Both files live in ``G2_DIR/cache/synth_embeds``. If either is missing,
    the ``text`` column of the finals CSV is encoded with ``EMBEDDING_REPO``
    and both files are (re)written before the paths are returned.

    Args:
        M: Size tag of the finals CSV to use.

    Returns:
        Tuple ``(npy_path, index_csv_path)``.

    Raises:
        FileNotFoundError: if the finals CSV is missing, or if the cache is
            incomplete and SentenceTransformer is not importable.
        ValueError: if the finals CSV has no ``text`` column.
    """
    finals_csv = g1_source_csv(M)
    tag = finals_csv.stem.replace("g_final_", "")  # n{M}_{method}
    embeds_dir = G2_DIR / "cache" / "synth_embeds"
    npy = embeds_dir / f"g_final_{tag}__{EMB_KEY}.npy"
    idx = embeds_dir / f"g_final_{tag}__index.csv"

    cache_complete = npy.exists() and idx.exists()
    if cache_complete:
        return npy, idx

    # Cache is missing or partial: rebuild both files from the finals CSV.
    if SentenceTransformer is None:
        raise FileNotFoundError(f"Synth cache missing and no ST model to rebuild: {npy.name}")
    log.info("Building synth cache for %s, M=%d …", METHOD, M)

    finals = pd.read_csv(finals_csv)
    if "text" not in finals.columns:
        raise ValueError("Finals CSV missing 'text' column")
    texts = finals["text"].astype(str).tolist()

    encoder = SentenceTransformer(EMBEDDING_REPO)
    vectors = encoder.encode(
        texts, batch_size=64, show_progress_bar=False, convert_to_numpy=True
    ).astype(np.float32, copy=False)

    embeds_dir.mkdir(parents=True, exist_ok=True)
    np.save(npy, vectors)
    pd.DataFrame({"text": texts}).to_csv(idx, index=False)
    log.info("✔ Saved synth cache → %s ; %s", npy.relative_to(ROOT), idx.relative_to(ROOT))
    return npy, idx


def discover_M_max_for_method() -> int:
    """Discover the largest synthetic-set size M available for ``METHOD``.

    Primary source: fold-00 label files in ``G2_DIR`` named
    ``g2f_labels_fold00_n{M}_{METHOD}__{EMB_KEY}__*.csv``; the largest ``M``
    found is returned.

    Fallback: when no such label file exists, the sizes 3072, 1536 and 768 are
    tried in that order and the first one with a finals CSV
    ``G1_DIR/g_final_n{M}_{METHOD}.csv`` is returned.

    Returns:
        The discovered M.

    Raises:
        RuntimeError: if neither label files nor a fallback finals CSV exist.
    """
    # `\.` (single backslash) is the regex for a literal dot inside a raw
    # string; METHOD and EMB_KEY are escaped so they are matched literally.
    name_re = re.compile(
        rf"g2f_labels_fold00_n(\d+)_{re.escape(METHOD)}__{re.escape(EMB_KEY)}__.*\.csv$"
    )
    label_files = sorted(G2_DIR.glob(f"g2f_labels_fold00_n*_{METHOD}__{EMB_KEY}__*.csv"))
    matches = (name_re.match(path.name) for path in label_files)
    M_vals = [int(m.group(1)) for m in matches if m]
    if M_vals:
        return max(M_vals)

    # fallback if labels not present but finals exist
    for M in (3072, 1536, 768):
        if (G1_DIR / f"g_final_n{M}_{METHOD}.csv").exists():
            return M
    raise RuntimeError("Could not discover M_max; ensure Script-A/B/C outputs exist.")


# ─────────────────────────────────────────────────────────────────────────────
# RRMSE aggregation utilities (global median over folds×domains)
# ─────────────────────────────────────────────────────────────────────────────

def rrmse_file_path(reg: str, pct: int, K: int, Mmax: int, variant: str) -> Path:
    """Compose the path of a per-fold RRMSE results file inside ``RES_DIR``.

    Args:
        reg: Regressor tag as used in the file name.
        pct: Augmentation level in percent.
        K: Number of synthetics for that level.
        Mmax: Size tag of the synthetic pool.
        variant: File-name suffix; the callers in this file pass
            ``"baseline"`` or ``"full"``.

    Returns:
        The path; the file is not checked for existence.
    """
    file_name = f"rrmse_perfold_{EMB_KEY}__{reg}__{METHOD}__pct{pct}_K{K}__Mmax{Mmax}__{variant}.csv"
    return RES_DIR / file_name


def global_median_rrmse_from_file(p: Path) -> float:
    """Return the global median RRMSE stored in one per-fold results CSV.

    When the file has columns whose names start with ``rrmse_domain``, all of
    their cells are pooled and the median is taken over the pool. Otherwise
    the median of the ``median_rrmse_fold`` column is used as an
    approximation. Values are read as float32.

    Args:
        p: Path of the CSV file.

    Returns:
        The median as a Python float.

    Raises:
        ValueError: if neither kind of column is present.
    """
    table = pd.read_csv(p)
    domain_cols = [name for name in table.columns if name.startswith("rrmse_domain")]

    if not domain_cols:
        # Fallback: median over per-fold medians (approximation)
        if "median_rrmse_fold" not in table.columns:
            raise ValueError(f"{p.name}: no RRMSE columns found.")
        fold_medians = table["median_rrmse_fold"].to_numpy(dtype=np.float32)
        return float(np.median(fold_medians))

    pooled = table[domain_cols].to_numpy(dtype=np.float32).ravel()
    return float(np.median(pooled))


# ─────────────────────────────────────────────────────────────────────────────
# Monotonicity check
# ─────────────────────────────────────────────────────────────────────────────

def monotone_non_decreasing(values: List[float], tol: float = 1e-9) -> Tuple[bool, int | None]:
    """Check that a sequence never drops by more than ``tol`` between neighbours.

    Args:
        values: Sequence to test, in the order of interest.
        tol: A step counts as a decrease only when
            ``values[i] + tol < values[i - 1]``.

    Returns:
        ``(True, None)`` when no such decrease exists; otherwise
        ``(False, i)`` where ``i`` is the position of the first element that
        is lower than its predecessor.
    """
    for position, (previous, current) in enumerate(zip(values, values[1:]), start=1):
        if current + tol < previous:
            return False, position
    return True, None


# ─────────────────────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────────────────────

def main():
    """Run the average-diversity report end to end.

    Steps:
      1. Load the seed embeddings, row-normalise them and sum the unit rows.
      2. Load the synthetic embeddings for the discovered ``Mmax`` and build
         prefix sums over their unit rows.
      3. Compute AvgDiv for the seeds alone (column 0) and for seeds plus the
         first K synthetics at every level in ``PCTS``.
      4. Build the linear and tree-based tables of global-median RRMSE with a
         diversity row appended.
      5. Compute Spearman rho between AvgDiv and the family-mean ΔRRMSE
         (baseline − full) over the augmented levels.
      6. Check that AvgDiv is non-decreasing across ``COLS``.
      7. Write the CSV tables and the stats text file, then print the report.

    Raises:
        FileNotFoundError: if a required baseline/full RRMSE file is missing.
        RuntimeError: if fewer synthetic embeddings are available than the
            largest K in ``K_LIST``.
    """
    log.info("=== f_5_average_diversity started ===")

    div_label = "Avg. diversity (1−cos) [E5]"

    # 1) Load seed vectors and compute sum of unit vectors
    X_seed = load_seed_vectors_e5()
    U_seed = _unit_rows(X_seed)
    s_seed = U_seed.sum(axis=0)
    n_seed = U_seed.shape[0]
    if n_seed != N_SEEDS:
        log.warning("Seed vector count = %d (expected %d)", n_seed, N_SEEDS)

    # 2) Load synthetics at M_max and build prefix sums over unit vectors
    Mmax = discover_M_max_for_method()
    npy, _index_csv = synth_cache_paths(Mmax)  # the index CSV is not needed here
    X_syn = np.load(npy).astype(np.float32, copy=False)
    U_syn = _unit_rows(X_syn)

    # Fail with a clear message instead of an IndexError on S_prefix[K] below.
    k_needed = max(K_LIST)
    n_syn = U_syn.shape[0]
    if n_syn < k_needed:
        raise RuntimeError(
            f"Need at least {k_needed} synthetic embeddings for the largest "
            f"augmentation level, but {npy.name} holds only {n_syn}."
        )

    # S_prefix[k] = sum of the first k synthetic unit rows (S_prefix[0] = 0).
    # Only the first k_needed rows can ever be indexed, so the rest is skipped.
    S_prefix = np.vstack([
        np.zeros((1, U_syn.shape[1]), dtype=np.float32),
        np.cumsum(U_syn[:k_needed], axis=0),
    ])

    # 3) Diversity per column (0 + augmented levels)
    div_abs: Dict[int, float] = {}
    div_rel: Dict[int, float] = {}

    # 0% baseline: seeds only
    d0 = avg_div_from_sum(s_seed, n_seed)
    div_abs[0] = d0

    for pct in PCTS:
        K = PCT_TO_K[pct]
        s = s_seed + S_prefix[K]
        n = n_seed + K
        div_abs[pct] = avg_div_from_sum(s, n)

    # Relative change vs the seeds-only value, in percent.
    for c in COLS:
        div_rel[c] = 0.0 if c == 0 else 100.0 * (div_abs[c] - d0) / max(d0, 1e-12)

    # 4) Build RRMSE tables (global median) for linear and tree-based models
    def build_table_for(models: List[str]) -> pd.DataFrame:
        """One row per model (global-median RRMSE per column) + diversity row."""
        rows = []
        for reg in models:
            disp = DISPLAY_NAME.get(reg, reg)
            row = {"Model": disp}
            # Column 0 from any baseline file (use first PCTS entry)
            p0 = rrmse_file_path(reg, PCTS[0], PCT_TO_K[PCTS[0]], Mmax, variant="baseline")
            if not p0.exists():
                raise FileNotFoundError(f"Baseline file missing: {p0.name}")
            row[0] = global_median_rrmse_from_file(p0)
            # Augmented columns from FULL files
            for pct in PCTS:
                p_full = rrmse_file_path(reg, pct, PCT_TO_K[pct], Mmax, variant="full")
                if not p_full.exists():
                    raise FileNotFoundError(f"Full file missing: {p_full.name}")
                row[pct] = global_median_rrmse_from_file(p_full)
            rows.append(row)
        # Diversity bottom row with absolute value and (+rel%) per cell
        div_row = {"Model": div_label}
        for c in COLS:
            div_row[c] = f"{div_abs[c]:.6f} ({div_rel[c]:+.1f}%)"
        df = pd.DataFrame(rows)
        df = df[["Model", *COLS]]
        df = pd.concat([df, pd.DataFrame([div_row])], axis=0, ignore_index=True)
        return df

    df_linear = build_table_for(LINEAR_MODELS)
    df_tree   = build_table_for(TREE_MODELS)

    # 5) Spearman ρ between AvgDiv and ΔRRMSE per family (mean Δ across models)
    # Compute Δ per model at each pct (baseline − full). Baseline: use column 0 values.
    def deltas_by_family(table: pd.DataFrame) -> List[float]:
        """Mean (baseline − full) across the model rows of an already-built table."""
        model_rows = table[table["Model"] != div_label]
        deltas_per_pct: Dict[int, List[float]] = {pct: [] for pct in PCTS}
        for _, row in model_rows.iterrows():
            base_val = float(row.loc[0])
            for pct in PCTS:
                deltas_per_pct[pct].append(base_val - float(row.loc[pct]))
        return [float(np.mean(deltas_per_pct[p])) for p in PCTS]

    # Reuse the tables built above rather than re-reading every results CSV.
    deltas_lin = deltas_by_family(df_linear)
    deltas_tree = deltas_by_family(df_tree)

    # Spearman uses the six augmented points; AvgDiv sequence excludes the 0% cell
    div_series = [div_abs[p] for p in PCTS]
    rho_lin, p_lin = spearmanr(div_series, deltas_lin)
    rho_tree, p_tree = spearmanr(div_series, deltas_tree)

    # 6) Monotonicity check for AvgDiv across [0,10,20,50,100,200,400]
    div_all = [div_abs[c] for c in COLS]
    ok, first_bad = monotone_non_decreasing(div_all)
    # Column (percent) at which the first decrease occurs, if any.
    bad_at = COLS[first_bad] if first_bad is not None else None

    # 7) Save outputs
    out_linear = TABLES_DIR / "linear_rrmse_plus_diversity.csv"
    out_tree   = TABLES_DIR / "tree_rrmse_plus_diversity.csv"
    out_divcsv = TABLES_DIR / "diversity_values.csv"

    df_linear.to_csv(out_linear, index=False)
    df_tree.to_csv(out_tree, index=False)
    div_df = pd.DataFrame({
        "pct": COLS,
        "K": [0] + K_LIST,
        "avg_div": div_all,
        "rel_change_pct": [div_rel[c] for c in COLS],
    })
    div_df.to_csv(out_divcsv, index=False)

    stats_txt = STATS_DIR / "spearman_and_monotonicity.txt"
    with open(stats_txt, "w", encoding="utf-8") as f:
        f.write("Spearman ρ between AvgDiv and ΔRRMSE (baseline − full)\n")
        f.write(f"  • Linear (chain_lr + local_lasso):    rho = {rho_lin:.3f}, p = {p_lin:.3g}\n")
        f.write(f"  • Tree-based (local_rf + global_rf + chain_rf): rho = {rho_tree:.3f}, p = {p_tree:.3g}\n\n")
        f.write("Monotonicity check for AvgDiv (non-decreasing): ")
        if ok:
            f.write("PASS across [0,10,20,50,100,200,400].\n")
        else:
            f.write(f"FAIL (first decrease at column % = {bad_at}).\n")

    # Print full report to console as well
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 140)

    def _df_console_str(df: pd.DataFrame) -> str:
        """Render a report table with RRMSE cells formatted to three decimals."""
        df_print = df.copy()
        # format RRMSE cells (all rows except the last diversity row)
        for i in range(len(df_print) - 1):
            for c in COLS:
                df_print.at[i, c] = f"{float(df_print.at[i, c]):.3f}"
        return df_print.to_string(index=False)

    print("\nLinear models (chain_lr, local_lasso)")
    print(_df_console_str(df_linear))
    print(f"Spearman \u03c1 (AvgDiv vs \u0394RRMSE), linear: rho = {rho_lin:.3f}, p = {p_lin:.3g}\n")

    print("Tree-based models (local_rf, global_rf, chain_rf)")
    print(_df_console_str(df_tree))
    print(f"Spearman \u03c1 (AvgDiv vs \u0394RRMSE), tree-based: rho = {rho_tree:.3f}, p = {p_tree:.3g}\n")

    # Monotonicity check printed last
    if ok:
        print("Monotonicity check for AvgDiv: PASS across [0, 10, 20, 50, 100, 200, 400].")
    else:
        print(f"Monotonicity check for AvgDiv: FAIL (first decrease at %= {bad_at}).")

    log.info("=== f_5_average_diversity completed ===")


# Entry point: run the full report when this cell/module is executed as the
# main program.
if __name__ == "__main__":
    main()




  from tqdm.autonotebook import tqdm, trange


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
2025-10-16 00:40:02 INFO: === f_5_average_diversity started ===

Linear models (chain_lr, local_lasso)
                      Model                0               10               20               50              100              200              400
                   chain_lr            0.689            0.674            0.668            0.674            0.685            0.685            0.685
                local_lasso            0.705            0.690            0.688            0.678            0.679            0.685            0.698
Avg. diversity (1−cos) [E5] 0.164278 (+0.0%) 0.164011 (-0.2%) 0.164453 (+0.1%) 0.165774 (+0.9%) 0.167564 (+2.0%) 0.169500 (+3.2%) 0.171336 (+4.3%)
Spearman ρ (AvgDiv vs ΔRRMSE), linear: rho = -0.600, p = 0.208

Tree-based models (local_rf, global_rf, chain_rf)
                      Model                0               10               20               50              100              200        

  warn("The installed version of bitsandbytes was compiled without GPU support. "
