In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Part E — FCA Significance (Null Model) + Bridge Case Studies
Assumes Parts A, B, C are done.

Outputs under: e_significance_<timestamp>/
  - fca_significance.csv           (observed vs null; p-values + FDR)
  - significant_pairs_topK.csv     (top significant FCA pairs)
  - cases/ per-pair CSVs with top bridge authors + counts + CPR
  - plots/ (directory is created, but this script does not currently write any figures)
  - README.txt
"""

import os, json, math, itertools, time
from datetime import datetime
from collections import Counter, defaultdict

import numpy as np
import pandas as pd

# ---------------- Config ----------------
YEARS = list(range(2018, 2026))           # 2018–2025
ARTICLES_DIR_TEMPLATE = "articles_{year}_new"
ARTICLES_FILE = "all_articles_enhanced.jsonl"

N_NULL = 300                               # number of null-model iterations (200–500 is workable)
TOP_CASES = 20                             # how many authors to list per field pair in the case files
BRIDGE_QUANTILE = 0.90                     # BES_delta quantile that defines bridge authors (as in Parts C/D)
MIN_YEARS_PER_PAIR = 3                     # minimum years of coverage per pair (as in Part A)
LIMIT_PAIRS_TO_FCA = True                  # True: test significance only for pairs that received an FCA in Part A (more efficient)
RANDOM_SEED = 7
# ----------------------------------------

# Module-wide random generator used by the null-model field sampling; seeded with RANDOM_SEED.
rng = np.random.default_rng(RANDOM_SEED)

def find_latest_dir(prefix: str):
    """Return the lexicographically greatest directory in the CWD named ``prefix*``.

    Only entries of the current working directory that are directories and
    whose name starts with *prefix* are considered. Returns ``None`` when
    nothing matches.
    """
    matching_dirs = (
        entry
        for entry in os.listdir(".")
        if entry.startswith(prefix) and os.path.isdir(entry)
    )
    # Names are compared as plain strings; the greatest one wins.
    return max(matching_dirs, default=None)

def ensure_outdir():
    """Create ``e_significance_<timestamp>/`` with ``cases/`` and ``plots/`` inside.

    The timestamp is the current local time formatted as ``YYYYmmdd_HHMMSS``.
    Existing directories are reused. Returns the output directory name
    (relative to the current working directory).
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = f"e_significance_{stamp}"
    # makedirs also creates the parent, so the two sub-folders are enough.
    for sub_folder in ("cases", "plots"):
        os.makedirs(os.path.join(out_dir, sub_folder), exist_ok=True)
    return out_dir

def safe_pair(a, b):
    """Return the two values as strings in a canonical (ascending) 2-tuple.

    Both orderings of the same two fields therefore map to the same key.
    """
    first, second = str(a), str(b)
    if second < first:
        first, second = second, first
    return (first, second)

# ---------- Load Part A/B/C ----------
def load_partA():
    """Load the FCA summary from the latest ``field_convergence_A_*`` folder.

    Returns:
      (a_dir, df_fca, per_year_w_path) where ``df_fca`` is ``fca_summary.csv``
      with an added canonical ``pair`` column, restricted to rows whose
      ``years_covered`` is at least MIN_YEARS_PER_PAIR, and
      ``per_year_w_path`` is the path of ``per_year_field_weights.csv``.

    Raises:
      FileNotFoundError: if the folder or either CSV file is missing.
    """
    a_dir = find_latest_dir("field_convergence_A_")
    if not a_dir:
        raise FileNotFoundError("Missing field_convergence_A_*")

    fca_path = os.path.join(a_dir, "fca_summary.csv")
    per_year_w_path = os.path.join(a_dir, "per_year_field_weights.csv")
    if not all(os.path.exists(p) for p in (fca_path, per_year_w_path)):
        raise FileNotFoundError("Part A outputs missing")

    summary = pd.read_csv(fca_path)
    summary["pair"] = [
        safe_pair(f_i, f_j)
        for f_i, f_j in zip(summary["field_i"], summary["field_j"])
    ]
    enough_years = summary["years_covered"] >= MIN_YEARS_PER_PAIR
    df_fca = summary[enough_years].copy()
    return a_dir, df_fca, per_year_w_path

def load_partB():
    """Load the BES summary from the latest ``bridge_emergence_B_*`` folder.

    Returns:
      (b_dir, df_bes, bridge_authors) where ``df_bes`` is ``bes_summary.csv``
      with ``author_id`` cast to ``str`` and ``bridge_authors`` is the set of
      author ids whose ``BES_delta`` is at or above the BRIDGE_QUANTILE
      quantile of that column.

    Raises:
      FileNotFoundError: if the folder or ``bes_summary.csv`` is missing.
    """
    b_dir = find_latest_dir("bridge_emergence_B_")
    if not b_dir:
        raise FileNotFoundError("Missing bridge_emergence_B_*")

    bes_path = os.path.join(b_dir, "bes_summary.csv")
    if not os.path.exists(bes_path):
        raise FileNotFoundError("Missing bes_summary.csv")

    df_bes = pd.read_csv(bes_path)
    df_bes["author_id"] = df_bes["author_id"].astype(str)

    cutoff = df_bes["BES_delta"].quantile(BRIDGE_QUANTILE)
    above_cutoff = df_bes["BES_delta"] >= cutoff
    bridge_authors = set(df_bes.loc[above_cutoff, "author_id"])
    return b_dir, df_bes, bridge_authors

# ---------- Read articles ----------
def stream_articles(year):
    """Yield one article record (a dict) per valid line of the year's JSONL file.

    The file is ``ARTICLES_DIR_TEMPLATE.format(year=year)/ARTICLES_FILE``.
    If it does not exist the generator yields nothing. Blank lines, lines
    that are not valid JSON, and JSON values that are not objects are
    skipped, so callers can rely on ``.get`` being available on each record.
    """
    folder = ARTICLES_DIR_TEMPLATE.format(year=year)
    path = os.path.join(folder, ARTICLES_FILE)
    if not os.path.exists(path):
        return
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError: best-effort skip of
                # malformed lines without hiding unrelated errors.
                continue
            if isinstance(record, dict):
                yield record

def collect_paper_fields_for_year(year):
    """Read one year of articles and extract fields, authors and CPR per paper.

    Papers without any non-empty field label are ignored.

    Returns:
      papers: list of (paper_id, sorted unique field labels)
      field_counts: Counter of field -> number of papers carrying it
      authors_per_paper: dict paper_id -> de-duplicated author ids (as str),
        in first-seen order
      cpr_per_paper: dict paper_id -> CPR (citations per reference); taken
        from ``citation_per_reference_ratio`` when present, otherwise
        ``cited_by_count / referenced_works_count`` with a missing or zero
        reference count treated as 1
    """
    papers = []
    field_counts = Counter()
    authors_per_paper = {}
    cpr_per_paper = {}

    for record in stream_articles(year):
        labels = {
            str(raw).strip()
            for raw in (record.get("fields") or [])
            if raw and str(raw).strip()
        }
        if not labels:
            continue
        fields = sorted(labels)

        pid = record.get("id")
        papers.append((pid, fields))
        field_counts.update(fields)

        # Authors: keep ids that are present, as strings, without duplicates.
        author_ids = []
        for author in record.get("authors") or []:
            author_id = author.get("author_id")
            if author_id:
                author_ids.append(str(author_id))
        authors_per_paper[pid] = list(dict.fromkeys(author_ids))

        # CPR: prefer the precomputed ratio, otherwise derive it.
        ratio = record.get("citation_per_reference_ratio")
        if ratio is None:
            n_refs = record.get("referenced_works_count") or 1
            n_cited = record.get("cited_by_count") or 0
            ratio = float(n_cited) / float(n_refs)
        cpr_per_paper[pid] = float(ratio)

    return papers, field_counts, authors_per_paper, cpr_per_paper

def pair_counts_from_papers(papers):
    """Count per-field and per-field-pair paper frequencies.

    Args:
      papers: iterable of (paper_id, fields).

    Returns:
      field_count: Counter field -> number of papers listing it
      pair_count: Counter canonical (f_i, f_j) -> number of papers listing both
    """
    field_count = Counter()
    pair_count = Counter()
    for _pid, fields in papers:
        field_count.update(fields)
        # combinations() yields nothing for fewer than two fields.
        pair_count.update(
            safe_pair(left, right)
            for left, right in itertools.combinations(fields, 2)
        )
    return field_count, pair_count

def cosine_weight(co_ij, c_i, c_j, eps=1e-12):
    denom = math.sqrt(max(c_i,0)*max(c_j,0)) + eps
    return co_ij/denom

# ---------- Null model ----------
def sample_fields_for_paper(L, field_universe, probs):
    """Draw L distinct fields from *field_universe*, weighted by *probs*.

    Sampling is without replacement and uses the module-level ``rng``.
    Edge cases:
      - L <= 0 returns an empty list;
      - L >= len(field_universe) returns a copy of the whole universe
        (no random draw is made).
    """
    n_wanted = int(L)
    if n_wanted <= 0:
        return []

    universe_size = len(field_universe)
    if n_wanted >= universe_size:
        # Cannot draw that many distinct fields: hand back everything.
        return field_universe[:]

    chosen = rng.choice(universe_size, size=n_wanted, replace=False, p=probs)
    return [field_universe[pos] for pos in chosen]

def build_null_year_papers(real_papers, field_counts):
    """Build one randomized ("null") copy of a year's papers.

    Each paper keeps its id and its number of fields, but the fields
    themselves are redrawn via ``sample_fields_for_paper`` with probabilities
    proportional to *field_counts* (uniform if all counts sum to zero).

    Returns:
      list of (paper_id, sorted unique sampled fields), in the input order.
    """
    fields = list(field_counts.keys())
    weights = np.array([field_counts[name] for name in fields], dtype=float)
    total = weights.sum()
    if total > 0:
        probs = weights / total
    else:
        probs = np.ones_like(weights) / len(weights)

    return [
        (pid, sorted(set(sample_fields_for_paper(len(real_fields), fields, probs))))
        for pid, real_fields in real_papers
    ]

# ---------- FCA slope ----------
def slope(xs, ys):
    """Return the least-squares linear slope of *ys* against *xs*.

    Returns NaN when fewer than two points are given.
    """
    x = np.asarray(xs, float)
    y = np.asarray(ys, float)
    if len(x) < 2:
        return np.nan
    # polyfit returns coefficients highest degree first: [slope, intercept].
    coefficients = np.polyfit(x, y, 1)
    return float(coefficients[0])

def compute_year_weights_for_pairs(papers_by_year, target_pairs):
    """Compute the yearly cosine co-occurrence weight of each target pair.

    Args:
      papers_by_year: dict year -> list of (paper_id, fields).
      target_pairs: iterable of canonical (field_i, field_j) tuples.

    Returns:
      defaultdict pair -> list of (year, weight). Every target pair gets one
      entry per year in *papers_by_year*; a pair that never co-occurs in a
      year gets weight 0 for that year.
    """
    weights_by_pair = defaultdict(list)
    for year, papers in papers_by_year.items():
        field_count, pair_count = pair_counts_from_papers(papers)
        for field_i, field_j in target_pairs:
            key = (field_i, field_j)
            weight = cosine_weight(
                pair_count.get(key, 0),
                field_count.get(field_i, 0),
                field_count.get(field_j, 0),
            )
            weights_by_pair[key].append((year, weight))
    return weights_by_pair

def _bh_fdr(pvals):
    """Return Benjamini-Hochberg q-values for *pvals*, in the input order."""
    p = np.asarray(pvals, float)
    n = p.size
    if n == 0:
        return p
    order = np.argsort(p)
    scaled = p[order] * n / np.arange(1, n + 1)
    # Running minimum from the largest p-value downwards keeps q monotone;
    # q-values are capped at 1.
    q_sorted = np.minimum(np.minimum.accumulate(scaled[::-1])[::-1], 1.0)
    q = np.empty(n, float)
    q[order] = q_sorted
    return q


def _fit_pair_slopes(weights_by_pair):
    """Yield (pair, sorted_years, slope) for pairs with enough yearly points."""
    for pair, series in weights_by_pair.items():
        if len(series) < MIN_YEARS_PER_PAIR:
            continue
        series = sorted(series, key=lambda year_weight: year_weight[0])
        years = [year for year, _ in series]
        values = [weight for _, weight in series]
        yield pair, years, slope(years, values)


def _significance_rows(obs_rows, null_slopes):
    """Attach null mean/std, z-score and one-sided p-value to each observed row."""
    rows = []
    for obs in obs_rows:
        obs_slope = obs["obs_slope"]
        null_arr = np.asarray(null_slopes.get(obs["pair"], []), float)
        # NaN slopes compare False against everything and would otherwise
        # make the p-value look artificially small.
        null_arr = null_arr[np.isfinite(null_arr)]
        if null_arr.size == 0 or not np.isfinite(obs_slope):
            rows.append({**obs, "null_mean": np.nan, "null_std": np.nan,
                         "z": np.nan, "p_one_sided": np.nan})
            continue
        mu = float(np.mean(null_arr))
        sd = float(np.std(null_arr, ddof=1)) if null_arr.size > 1 else np.nan
        # One-sided p for positive acceleration (obs larger than null),
        # with the +1 correction so p is never exactly zero.
        p = float((np.sum(null_arr >= obs_slope) + 1) / (null_arr.size + 1))
        z = (obs_slope - mu) / sd if sd > 0 else np.nan
        rows.append({**obs, "null_mean": mu, "null_std": sd, "z": z, "p_one_sided": p})
    return rows


def _paper_cpr(obj):
    """Return a paper's CPR, deriving it from raw counts when not precomputed."""
    cpr = obj.get("citation_per_reference_ratio")
    if cpr is None:
        ref = obj.get("referenced_works_count") or 1
        cited = obj.get("cited_by_count") or 0
        cpr = float(cited) / float(ref)
    return float(cpr)


def _paper_author_ids(obj):
    """Return a paper's de-duplicated author ids (as str), in first-seen order."""
    aids = [str(a.get("author_id")) for a in obj.get("authors") or [] if a.get("author_id")]
    return list(dict.fromkeys(aids))


def _collect_case_rows(pairs_set, bridge_authors):
    """Return one row per (year, pair, author) for papers that hit a selected pair."""
    rows = []
    if not pairs_set:
        return rows  # nothing selected: avoid re-reading every article file
    for y in YEARS:
        for obj in stream_articles(y):
            fields = sorted({str(x).strip() for x in (obj.get("fields") or [])
                             if x and str(x).strip()})
            if len(fields) < 2:
                continue
            hits = {safe_pair(a, b) for a, b in itertools.combinations(fields, 2)} & pairs_set
            if not hits:
                continue
            cpr = _paper_cpr(obj)
            aids = _paper_author_ids(obj)
            for p in hits:
                for aid in aids:
                    rows.append({
                        "year": y, "pair": p, "field_i": p[0], "field_j": p[1],
                        "author_id": aid,
                        "is_bridge": (aid in bridge_authors),
                        "CPR": cpr
                    })
    return rows


def _write_case_files(case_rows, out_dir):
    """Aggregate case rows per (pair, author) and write one CSV per pair."""
    if not case_rows:
        return
    g = pd.DataFrame(case_rows).groupby(["pair", "author_id"], as_index=False).agg(
        n_papers=("year", "count"),
        mean_CPR=("CPR", "mean"),
        is_bridge=("is_bridge", "max")
    )
    for pair, sub in g.groupby("pair"):
        # Bridge authors first, then most active, then highest mean CPR.
        sub = sub.sort_values(["is_bridge", "n_papers", "mean_CPR"],
                              ascending=[False, False, False])
        sub.head(TOP_CASES).to_csv(
            os.path.join(out_dir, "cases",
                         f"case_{pair[0].replace('/','-')}__{pair[1].replace('/','-')}.csv"),
            index=False, encoding="utf-8"
        )


def _readme_text():
    """Return the README contents describing this run's outputs."""
    # round(), not int(): (1 - 0.90) * 100 is 9.999... in floating point and
    # int() would report "top 9%".
    bridge_pct = round((1 - BRIDGE_QUANTILE) * 100)
    return f"""Part E — FCA Significance + Bridge Case Studies (2018–2025)
==================================================================

What this does:
---------------
1) FCA Significance:
   - Builds a year-wise null model by reassigning fields to papers
     (preserving the number of fields per paper and the global field frequencies per year).
   - Recomputes pairwise cosine weights and FCA slopes across years for N={N_NULL} iterations.
   - Reports one-sided p-values (obs slope > null) and BH FDR (q-values).

2) Case Studies:
   - For FCA-significant field pairs (q<=0.05), lists the most active authors in those pairs,
     highlighting which are "bridge authors" (top {bridge_pct}% BES).
   - Includes per-author paper counts and mean CPR for those pair-specific papers.

Key outputs:
------------
- fca_significance.csv        (obs_slope, null_mean/std, z, p_one_sided, q_fdr)
- significant_pairs_topK.csv  (filtered set, default q<=0.05 or top-50 fallback)
- cases/*.csv                 (per-pair author lists: n_papers, mean_CPR, is_bridge)

Notes:
------
- Years used: 2018–2025.
- Null model is approximate; tighten N_NULL for more stable p-values.
- To speed up, LIMIT_PAIRS_TO_FCA=True limits to pairs that already had FCA in Part A.
"""


def main():
    """Run Part E: FCA slope significance against a null model, then case studies.

    Steps:
      1. Load Part A (FCA pairs) and Part B (bridge authors).
      2. Read the articles of every year in YEARS.
      3. Fit the observed slope of the yearly cosine weight for each target pair.
      4. Repeat step 3 on N_NULL randomized copies of the data.
      5. Derive one-sided p-values, z-scores and BH q-values; write the tables.
      6. For the selected pairs, write per-pair author case files and a README.

    The output directory is created once, right before the first file is
    written, so a run does not leave an extra empty directory behind.

    Raises:
      FileNotFoundError: propagated from the Part A / Part B loaders.
    """
    # Load A/B (only the FCA table and the bridge-author set are needed here).
    _, df_fca, _ = load_partA()
    _, _, bridge_authors = load_partB()

    # -------- Collect real data per year --------
    real_papers_by_year = {}
    field_counts_by_year = {}
    for y in YEARS:
        papers, field_counts, _, _ = collect_paper_fields_for_year(y)
        if papers:
            real_papers_by_year[y] = papers
            field_counts_by_year[y] = field_counts

    # -------- Target pairs for significance --------
    if LIMIT_PAIRS_TO_FCA:
        # Only the pairs that received an FCA in Part A.
        target_pairs = set(df_fca["pair"].tolist())
    else:
        # Every pair that co-occurs on at least one paper in the loaded years.
        target_pairs = set()
        for papers in real_papers_by_year.values():
            _, pair_count = pair_counts_from_papers(papers)
            target_pairs.update(pair_count)

    # -------- Observed slopes for target pairs --------
    obs_weights = compute_year_weights_for_pairs(real_papers_by_year, target_pairs)
    obs_rows = [
        {"pair": pair, "field_i": pair[0], "field_j": pair[1], "obs_slope": m,
         "years_covered": len(years), "year_first": min(years), "year_last": max(years)}
        for pair, years, m in _fit_pair_slopes(obs_weights)
    ]

    # -------- Null slopes (fields re-sampled within each year) --------
    null_slopes = defaultdict(list)  # pair -> [m1, m2, ...]
    report_every = max(1, N_NULL // 10)
    for it in range(N_NULL):
        t0 = time.time()
        # Years are visited in insertion (YEARS) order so the random stream
        # is consumed deterministically for a given RANDOM_SEED.
        null_papers_by_year = {
            y: build_null_year_papers(papers, field_counts_by_year[y])
            for y, papers in real_papers_by_year.items()
        }
        null_weights = compute_year_weights_for_pairs(null_papers_by_year, target_pairs)
        for pair, _, m in _fit_pair_slopes(null_weights):
            null_slopes[pair].append(m)

        if (it + 1) % report_every == 0:
            print(f"[Null] iteration {it+1}/{N_NULL} done in {time.time()-t0:.1f}s")

    # -------- Significance table --------
    sig_columns = ["pair", "field_i", "field_j", "obs_slope", "years_covered",
                   "year_first", "year_last", "null_mean", "null_std", "z", "p_one_sided"]
    # Explicit columns keep the sort below valid even when no pair qualified.
    df_sig = pd.DataFrame(_significance_rows(obs_rows, null_slopes), columns=sig_columns)
    df_sig = df_sig.sort_values(["p_one_sided", "obs_slope"],
                                ascending=[True, False]).reset_index(drop=True)
    # Pairs without a p-value are counted as p=1 in the FDR correction.
    df_sig["q_fdr"] = _bh_fdr(df_sig["p_one_sided"].fillna(1.0).values)

    out_dir = ensure_outdir()
    sig_path = os.path.join(out_dir, "fca_significance.csv")
    df_sig.to_csv(sig_path, index=False, encoding="utf-8")

    # Save top significant pairs (q<=0.05 by default)
    top_sig = df_sig[df_sig["q_fdr"] <= 0.05].copy()
    if top_sig.empty:
        top_sig = df_sig.head(50).copy()  # fallback
    top_sig_path = os.path.join(out_dir, "significant_pairs_topK.csv")
    top_sig.to_csv(top_sig_path, index=False, encoding="utf-8")

    # -------- Case studies: who are the bridge authors on these pairs? --------
    pairs_set = set(map(tuple, top_sig["pair"].values))
    _write_case_files(_collect_case_rows(pairs_set, bridge_authors), out_dir)

    # README
    with open(os.path.join(out_dir, "README.txt"), "w", encoding="utf-8") as f:
        f.write(_readme_text())

    print("\n✅ Part E complete!")
    print(f"- Significance table: {sig_path}")
    print(f"- Significant pairs:  {top_sig_path}")
    print(f"- Cases folder:       {os.path.join(out_dir, 'cases')}")
    print(f"- Output folder:      {out_dir}")

# Run the Part E pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()


[Null] iteration 30/300 done in 9.9s
[Null] iteration 60/300 done in 9.9s
[Null] iteration 90/300 done in 9.3s
[Null] iteration 120/300 done in 9.5s
[Null] iteration 150/300 done in 10.2s
[Null] iteration 180/300 done in 10.1s
[Null] iteration 210/300 done in 10.0s
[Null] iteration 240/300 done in 10.0s
[Null] iteration 270/300 done in 9.8s
[Null] iteration 300/300 done in 9.9s

✅ Part E complete!
- Significance table: e_significance_20250816_134030/fca_significance.csv
- Significant pairs:  e_significance_20250816_134030/significant_pairs_topK.csv
- Cases folder:       e_significance_20250816_134030/cases
- Output folder:      e_significance_20250816_134030
