In [5]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Part C — Novelty mining from Parts A (FCA) and B (BES)

Outputs (saved under novelty_AB_<timestamp>/):
  - link_fca_bes_per_pair.csv
  - impact_top_vs_bottom.csv
  - plots/ (scatter & box/violin)
  - README.txt
"""

import os
import re
import json
import math
import glob
import itertools
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ------------------------- Config ---------------------------------
YEARS = list(range(2018, 2026))             # 2018–2025 inclusive
ARTICLES_DIR_TEMPLATE = "articles_{year}_new"
ARTICLES_FILE = "all_articles_enhanced.jsonl"

# How many pairs to analyze on each side of FCA (top and bottom)
TOP_N_PAIRS = 50

# Quantile to define "bridge authors" by BES
BRIDGE_QUANTILE = 0.90

# Minimum #years a field pair must appear (should match Part A)
MIN_YEARS_PER_PAIR = 3

# ------------------------------------------------------------------

def find_latest_dir(prefix: str):
    """Find latest output dir by prefix (e.g., 'field_convergence_A_')."""
    candidates = [d for d in os.listdir(".") if d.startswith(prefix) and os.path.isdir(d)]
    if not candidates:
        return None
    # sort by timestamp suffix if exists
    candidates.sort(reverse=True)
    return candidates[0]

def safe_pair(a, b):
    """Canonical ordered pair key."""
    return tuple(sorted((a, b)))

def ensure_outdir():
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = f"novelty_AB_{ts}"
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(os.path.join(out_dir, "plots"), exist_ok=True)
    return out_dir

def load_partA_latest():
    a_dir = find_latest_dir("field_convergence_A_")
    if a_dir is None:
        raise FileNotFoundError("Could not find 'field_convergence_A_*' output dir.")
    fca_path = os.path.join(a_dir, "fca_summary.csv")
    per_year_weights_path = os.path.join(a_dir, "per_year_field_weights.csv")
    if not os.path.exists(fca_path):
        raise FileNotFoundError(f"Missing {fca_path}")
    df_fca = pd.read_csv(fca_path)
    # canonical pair
    df_fca["pair"] = list(map(lambda xy: safe_pair(xy[0], xy[1]), zip(df_fca["field_i"], df_fca["field_j"])))
    # keep only pairs with enough years
    df_fca = df_fca[df_fca["years_covered"] >= MIN_YEARS_PER_PAIR].copy()
    return a_dir, df_fca, per_year_weights_path

def load_partB_latest():
    b_dir = find_latest_dir("bridge_emergence_B_")
    if b_dir is None:
        raise FileNotFoundError("Could not find 'bridge_emergence_B_*' output dir.")
    bes_path = os.path.join(b_dir, "bes_summary.csv")
    per_year_part_path = os.path.join(b_dir, "per_year_author_participation.csv")
    if not os.path.exists(bes_path):
        raise FileNotFoundError(f"Missing {bes_path}")
    df_bes = pd.read_csv(bes_path)
    return b_dir, df_bes, per_year_part_path

def pick_target_pairs(df_fca: pd.DataFrame, top_n=50):
    """Pick top-N and bottom-N FCA pairs."""
    df_sorted = df_fca.sort_values("FCA_slope", ascending=False)
    top = df_sorted.head(top_n).copy()
    bottom = df_sorted.tail(top_n).copy()
    top["set"] = "top"
    bottom["set"] = "bottom"
    both = pd.concat([top, bottom], axis=0, ignore_index=True)
    pairs = set(both["pair"].tolist())
    return both, pairs

def read_article_items_by_year(year):
    folder = ARTICLES_DIR_TEMPLATE.format(year=year)
    path = os.path.join(folder, ARTICLES_FILE)
    if not os.path.exists(path):
        return []
    return path  # we return path and stream line-by-line later

def iter_author_pair_hits(year, target_pairs):
    """
    Yields tuples for authors whose paper contains any of the target field pairs:
      (year, paper_id, pair(tuple), author_id, citation_per_reference_ratio)
    """
    path = read_article_items_by_year(year)
    if not path:
        return
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue

            fields = obj.get("fields") or []
            fields = [str(x).strip() for x in fields if x and str(x).strip()]
            fields = sorted(set(fields))
            if len(fields) < 2:
                continue

            # generate all field pairs present in this paper
            paper_pairs = set(safe_pair(a, b) for a, b in itertools.combinations(fields, 2))
            matched_pairs = paper_pairs & target_pairs
            if not matched_pairs:
                continue

            paper_id = obj.get("id")
            cpr = obj.get("citation_per_reference_ratio", None)
            # fall back
            if cpr is None:
                ref = obj.get("referenced_works_count") or 1
                cited = obj.get("cited_by_count") or 0
                cpr = float(cited) / float(ref)

            authors = obj.get("authors") or []
            author_ids = []
            for a in authors:
                aid = a.get("author_id")
                if aid:
                    author_ids.append(aid)
            author_ids = list(dict.fromkeys(author_ids))
            if not author_ids:
                continue

            for p in matched_pairs:
                for aid in author_ids:
                    yield (year, paper_id, p, aid, float(cpr))

def bootstrap_diff_mean(a, b, iters=2000, random_state=42):
    """
    Returns (diff_mean, ci_low, ci_high) using percentile bootstrap (95% CI).
    """
    rng = np.random.default_rng(random_state)
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    na, nb = len(a), len(b)
    if na == 0 or nb == 0:
        return np.nan, np.nan, np.nan
    diffs = []
    for _ in range(iters):
        sa = rng.choice(a, size=na, replace=True)
        sb = rng.choice(b, size=nb, replace=True)
        diffs.append(sa.mean() - sb.mean())
    diffs = np.array(diffs)
    return float((a.mean() - b.mean())), float(np.percentile(diffs, 2.5)), float(np.percentile(diffs, 97.5))

def main():
    out_dir = ensure_outdir()
    plots_dir = os.path.join(out_dir, "plots")

    # ---- Load A & B ----
    a_dir, df_fca, _ = load_partA_latest()
    b_dir, df_bes, _ = load_partB_latest()

    # define bridge authors (top quantile by BES_delta)
    thr = df_bes["BES_delta"].quantile(BRIDGE_QUANTILE)
    bridge_authors = set(df_bes.loc[df_bes["BES_delta"] >= thr, "author_id"].astype(str))

    # pick target pairs from FCA
    df_targets, target_pairs = pick_target_pairs(df_fca, top_n=TOP_N_PAIRS)

    # ---- Scan years and accumulate stats ----
    pair_author_total = Counter()       # (#author-labeled hits per pair)
    pair_author_bridge = Counter()      # (#bridge-author hits per pair)
    pair_papers = defaultdict(set)      # paper ids per pair (to aggregate impact once per paper)
    pair_paper_cpr = defaultdict(list)  # list of CPR per pair (unique papers)

    for year in YEARS:
        path = read_article_items_by_year(year)
        if not path:
            continue
        # we will track per (pair, paper) whether added cpr
        seen_pair_paper = set()
        for (yr, pid, pair, aid, cpr) in iter_author_pair_hits(year, target_pairs):
            pair_author_total[pair] += 1
            if str(aid) in bridge_authors:
                pair_author_bridge[pair] += 1

            # impact per paper per pair
            key_pp = (pair, pid)
            if key_pp not in seen_pair_paper:
                seen_pair_paper.add(key_pp)
                pair_papers[pair].add(pid)
                pair_paper_cpr[pair].append(cpr)

    # ---- Build pair-level DataFrame ----
    rows = []
    for pair in target_pairs:
        total = pair_author_total.get(pair, 0)
        bridges = pair_author_bridge.get(pair, 0)
        share = (bridges / total) if total > 0 else np.nan
        n_papers = len(pair_papers.get(pair, set()))
        cprs = pair_paper_cpr.get(pair, [])
        rows.append({
            "field_i": pair[0],
            "field_j": pair[1],
            "pair": pair,
            "authors_total": int(total),
            "authors_bridge": int(bridges),
            "share_bridge_authors": float(share) if share == share else np.nan,
            "papers_count": int(n_papers),
            "cpr_mean": float(np.mean(cprs)) if cprs else np.nan,
            "cpr_median": float(np.median(cprs)) if cprs else np.nan
        })
    df_link = pd.DataFrame(rows)

    # merge FCA slope
    df_link = df_link.merge(
        df_fca[["pair", "FCA_slope", "weight_mean", "year_first", "year_last"]],
        on="pair", how="left"
    )

    # save CSV
    link_path = os.path.join(out_dir, "link_fca_bes_per_pair.csv")
    df_link.sort_values("FCA_slope", ascending=False).to_csv(link_path, index=False, encoding="utf-8")

    # ---- Correlation FCA ↔ share of bridge authors (novelty 1) ----
    corr = df_link[["FCA_slope", "share_bridge_authors"]].corr(method="spearman").iloc[0,1]
    # simple print
    print(f"Spearman(FCA_slope, share_bridge_authors) = {corr:.3f}")

    # scatter plot
    plt.figure(figsize=(7,5))
    plt.scatter(df_link["FCA_slope"], df_link["share_bridge_authors"])
    plt.xlabel("FCA_slope (field convergence acceleration)")
    plt.ylabel("Share of bridge authors (BES top-quantile)")
    plt.title("FCA vs. Bridge-Author Share (per field pair)")
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, "scatter_fca_vs_bridge_share.png"), dpi=140)
    plt.close()

    # ---- Impact: CPR Top-FCA vs Bottom-FCA (novelty 2) ----
    df_top = df_link.sort_values("FCA_slope", ascending=False).head(TOP_N_PAIRS)
    df_bot = df_link.sort_values("FCA_slope", ascending=True).head(TOP_N_PAIRS)

    cpr_top = []
    for p in df_top["pair"]:
        cpr_top.extend(pair_paper_cpr.get(p, []))
    cpr_bot = []
    for p in df_bot["pair"]:
        cpr_bot.extend(pair_paper_cpr.get(p, []))

    diff, ci_lo, ci_hi = bootstrap_diff_mean(cpr_top, cpr_bot, iters=3000, random_state=7)

    impact_rows = [{
        "top_pairs": TOP_N_PAIRS,
        "bottom_pairs": TOP_N_PAIRS,
        "n_papers_top": len(cpr_top),
        "n_papers_bottom": len(cpr_bot),
        "cpr_mean_top": float(np.mean(cpr_top)) if cpr_top else np.nan,
        "cpr_mean_bottom": float(np.mean(cpr_bot)) if cpr_bot else np.nan,
        "mean_diff_top_minus_bottom": diff,
        "bootstrap_95ci_low": ci_lo,
        "bootstrap_95ci_high": ci_hi
    }]
    df_impact = pd.DataFrame(impact_rows)
    impact_path = os.path.join(out_dir, "impact_top_vs_bottom.csv")
    df_impact.to_csv(impact_path, index=False, encoding="utf-8")

    # box/violin
    plt.figure(figsize=(7,5))
    data = [cpr_top, cpr_bot]
    plt.boxplot(data, labels=["Top-FCA pairs", "Bottom-FCA pairs"], showmeans=True)
    plt.ylabel("Citations per Reference (CPR)")
    plt.title("Impact: Top vs Bottom FCA pairs")
    plt.grid(True, axis="y", alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, "box_impact_top_vs_bottom.png"), dpi=140)
    plt.close()

    # optional violin (if you prefer)
    try:
        plt.figure(figsize=(7,5))
        plt.violinplot(data, showmeans=True, showextrema=True)
        plt.xticks([1,2], ["Top-FCA pairs", "Bottom-FCA pairs"])
        plt.ylabel("Citations per Reference (CPR)")
        plt.title("Impact: Top vs Bottom FCA pairs (Violin)")
        plt.grid(True, axis="y", alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(plots_dir, "violin_impact_top_vs_bottom.png"), dpi=140)
        plt.close()
    except Exception:
        pass

    # ---- README with summary ----
    readme = f"""Novelty Mining from Parts A & B
====================================

This folder aggregates cross-evidence between:
  - Part A (FCA: field convergence acceleration), and
  - Part B (BES: bridge emergence for authors).

Key outputs:
------------
1) link_fca_bes_per_pair.csv
   Per field pair:
     * FCA_slope
     * authors_total, authors_bridge, share_bridge_authors
     * papers_count
     * CPR statistics (mean/median)
     * coverage years

2) impact_top_vs_bottom.csv
   CPR comparison between Top-{TOP_N_PAIRS} FCA pairs and Bottom-{TOP_N_PAIRS} FCA pairs,
   including bootstrap 95% CI for the difference in means.

3) plots/
   - scatter_fca_vs_bridge_share.png
   - box_impact_top_vs_bottom.png
   - violin_impact_top_vs_bottom.png

Headline stats:
---------------
- Spearman(FCA_slope, share_bridge_authors) = {corr:.3f}
- CPR mean difference (Top - Bottom) = {diff:.4f}
  95% CI: [{ci_lo:.4f}, {ci_hi:.4f}]

Interpretation guide:
---------------------
- A positive Spearman correlation indicates that field pairs with faster convergence
  tend to involve a larger share of bridge authors (BES top-quantile).
- A positive CPR difference (Top - Bottom) suggests that papers on fast-converging pairs
  are associated with higher impact (citations per reference).
"""
    with open(os.path.join(out_dir, "README.txt"), "w", encoding="utf-8") as f:
        f.write(readme)

    print("\n✅ Done Part C!")
    print(f"- Saved: {link_path}")
    print(f"- Saved: {impact_path}")
    print(f"- Plots: {plots_dir}")
    print(f"- Output folder: {out_dir}")

if __name__ == "__main__":
    main()


Spearman(FCA_slope, share_bridge_authors) = -0.083


  plt.boxplot(data, labels=["Top-FCA pairs", "Bottom-FCA pairs"], showmeans=True)



✅ Done Part C!
- Saved: novelty_AB_20250816_124944/link_fca_bes_per_pair.csv
- Saved: novelty_AB_20250816_124944/impact_top_vs_bottom.csv
- Plots: novelty_AB_20250816_124944/plots
- Output folder: novelty_AB_20250816_124944
