# In [None]:
import os
import json
import itertools
from datetime import datetime
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Per-year article directory name; "{year}" is substituted by stream_articles().
ART_DIR_TMPL = "articles_{year}_new"
# JSONL file (one JSON record per line) read from inside each per-year directory.
ART_FILE = "all_articles_enhanced.jsonl"
# Years scanned by rebuild_cpr_lists_from_articles().
YEARS = list(range(2018, 2026))  # 2018–2025 inclusive


def find_latest_dir(prefix: str) -> str | None:
    """
    Return the lexicographically latest directory that starts with `prefix`.

    Parameters
    ----------
    prefix : str
        Directory name prefix, e.g., "field_convergence_A_".

    Returns
    -------
    str | None
        Latest matching directory, or None if none exists.
    """
    cands = [d for d in os.listdir(".") if d.startswith(prefix) and os.path.isdir(d)]
    if not cands:
        return None
    cands.sort(reverse=True)
    return cands[0]


def ensure_outdir() -> str:
    """
    Make (if needed) the figure directory for this run and return its path.

    The directory is created relative to the current working directory and is
    named after the current local time with one-second resolution.

    Returns
    -------
    str
        Directory path, e.g., "paper_figures_20250815_162920".
    """
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    target = "paper_figures_" + stamp
    # exist_ok: two calls within the same second share one directory.
    os.makedirs(target, exist_ok=True)
    return target


def safe_pair(a, b) -> tuple[str, str]:
    """
    Build an order-independent key for a pair of field names.

    Both values are converted with `str()` and returned smallest-first, so
    (a, b) and (b, a) map to the same tuple.

    Parameters
    ----------
    a, b : Any
        Field names (or objects convertible to str).

    Returns
    -------
    tuple[str, str]
        The two strings in ascending string order.
    """
    first, second = str(a), str(b)
    if second < first:
        first, second = second, first
    return first, second



def plot_fca_heatmap(out_dir: str) -> str | None:
    """
    Build and save a heatmap of FCA slopes across field pairs.

    Expects:
      field_convergence_A_*/fca_summary.csv
      with columns 'field_i', 'field_j' and 'FCA_slope'.

    Saves:
      out_dir/fca_heatmap_pairs.png

    Rows whose slope is missing, non-numeric or infinite are ignored, and
    repeated (field_i, field_j) rows are averaged, so a partially filled or
    duplicated summary file still yields a figure instead of an exception.

    Parameters
    ----------
    out_dir : str
        Output directory.

    Returns
    -------
    str | None
        Output image path, or None if inputs are missing or contain no
        usable slope.
    """
    a_dir = find_latest_dir("field_convergence_A_")
    if not a_dir:
        return None

    fpath = os.path.join(a_dir, "fca_summary.csv")
    if not os.path.exists(fpath):
        return None

    df = pd.read_csv(fpath)
    if {"field_i", "field_j", "FCA_slope"} - set(df.columns):
        return None

    # Keep only rows with a finite numeric slope; anything else would poison
    # the symmetric colour scale computed below.
    df = df[["field_i", "field_j", "FCA_slope"]].copy()
    slopes = pd.to_numeric(df["FCA_slope"], errors="coerce")
    df["FCA_slope"] = slopes.where(np.isfinite(slopes))
    df = df.dropna(subset=["field_i", "field_j", "FCA_slope"])
    if df.empty:
        # np.nanmax raises on a zero-size matrix, so stop before plotting.
        return None

    # Matrix [field_i × field_j] of slopes. pivot_table (unlike pivot) tolerates
    # duplicate pairs by averaging them. Mirroring fills the other triangle
    # when the file lists each unordered pair only once.
    pivot = df.pivot_table(
        index="field_i",
        columns="field_j",
        values="FCA_slope",
        aggfunc="mean",
    )
    pivot = pivot.combine_first(pivot.T)

    # Colour scale centred at 0
    vmax = float(np.nanmax(np.abs(pivot.values)))

    plt.figure(figsize=(14, 12))
    try:
        im = plt.imshow(
            pivot.values,
            aspect="auto",
            cmap="coolwarm",
            vmin=-vmax,
            vmax=vmax,
        )
        plt.colorbar(im, label="FCA slope")
        plt.xticks(range(len(pivot.columns)), pivot.columns, rotation=90, fontsize=8)
        plt.yticks(range(len(pivot.index)), pivot.index, fontsize=8)
        plt.title("Field Convergence Acceleration (FCA) across field pairs", fontsize=14)
        plt.xlabel("Field j")
        plt.ylabel("Field i")
        plt.tight_layout()

        out = os.path.join(out_dir, "fca_heatmap_pairs.png")
        plt.savefig(out, dpi=300)
    finally:
        # Always release the figure, even if rendering or saving fails.
        plt.close()
    return out



def plot_top_pair_trends(out_dir: str, top_k: int = 6) -> list[str]:
    """
    Draw one yearly cosine-weight line chart per leading FCA pair.

    Expects:
      field_convergence_A_*/fca_summary.csv
      field_convergence_A_*/per_year_field_weights.csv

    Saves:
      out_dir/pair_trend_<i>__<j>.png  (up to `top_k` pairs)

    Parameters
    ----------
    out_dir : str
        Output directory for images.
    top_k : int, default=6
        How many of the highest-slope pairs to plot.

    Returns
    -------
    list[str]
        Paths of the images written; empty when inputs are missing or lack
        the required columns.
    """
    latest = find_latest_dir("field_convergence_A_")
    if not latest:
        return []

    summary_csv = os.path.join(latest, "fca_summary.csv")
    weights_csv = os.path.join(latest, "per_year_field_weights.csv")
    if not os.path.exists(summary_csv) or not os.path.exists(weights_csv):
        return []

    summary = pd.read_csv(summary_csv)
    weights = pd.read_csv(weights_csv)

    summary_cols = {"field_i", "field_j", "FCA_slope"}
    weight_cols = {"year", "field_i", "field_j", "weight_cosine"}
    if not summary_cols <= set(summary.columns):
        return []
    if not weight_cols <= set(weights.columns):
        return []

    leaders = summary.sort_values("FCA_slope", ascending=False).head(top_k).copy()

    # Canonical (sorted) key so (i, j) and (j, i) rows are pooled together.
    weights["pair"] = [
        safe_pair(fi, fj) for fi, fj in zip(weights["field_i"], weights["field_j"])
    ]

    saved = []
    for _, row in leaders.iterrows():
        pair = safe_pair(row["field_i"], row["field_j"])

        matching = weights[weights["pair"] == pair]
        yearly = matching.groupby("year", as_index=False)["weight_cosine"].mean()
        yearly = yearly.sort_values("year")
        if yearly.empty:
            # No per-year weights recorded for this pair.
            continue

        plt.figure(figsize=(6.4, 4.2))
        plt.plot(yearly["year"], yearly["weight_cosine"], marker="o")
        plt.grid(alpha=.3)
        plt.xlabel("Year")
        plt.ylabel("Cosine weight")
        plt.title(f"Trend: {pair[0]} × {pair[1]}  (FCA={row['FCA_slope']:.4f})")
        plt.tight_layout()

        # "/" inside a field name would otherwise be read as a path separator.
        left = pair[0].replace("/", "-")
        right = pair[1].replace("/", "-")
        target = os.path.join(out_dir, f"pair_trend_{left}__{right}.png")
        plt.savefig(target, dpi=220)
        plt.close()
        saved.append(target)

    return saved



def plot_bes_distribution(out_dir: str) -> str | None:
    """
    Save a 50-bin histogram of the 'BES_delta' column (BESΔ per author).

    Expects:
      bridge_emergence_B_*/bes_summary.csv

    Saves:
      out_dir/bes_distribution.png

    Parameters
    ----------
    out_dir : str
        Output directory.

    Returns
    -------
    str | None
        Path of the saved image, or None when the directory, file, column or
        any numeric value is missing.
    """
    latest = find_latest_dir("bridge_emergence_B_")
    summary_csv = os.path.join(latest, "bes_summary.csv") if latest else None
    if summary_csv is None or not os.path.exists(summary_csv):
        return None

    table = pd.read_csv(summary_csv)
    if "BES_delta" not in table.columns:
        return None

    # Unparseable entries become NaN and are discarded.
    scores = pd.to_numeric(table["BES_delta"], errors="coerce")
    scores = scores.dropna().values
    if not scores.size:
        return None

    plt.figure(figsize=(7, 5))
    plt.hist(scores, bins=50)
    # Vertical reference line at zero separates negative from positive deltas.
    plt.axvline(0, color="k", linewidth=1)
    plt.title("Distribution of Bridge Emergence Score (BESΔ)")
    plt.xlabel("BES (P_last − P_first)")
    plt.ylabel("Authors")
    plt.tight_layout()

    target = os.path.join(out_dir, "bes_distribution.png")
    plt.savefig(target, dpi=220)
    plt.close()
    print(f"✅ saved {target}")
    return target

def stream_articles(year: int):
    """
    Yield enriched article JSON objects for a given year.

    Reads ART_DIR_TMPL.format(year=year)/ART_FILE line by line. Reading is
    best-effort: blank lines, lines that are not valid JSON, and valid JSON
    values that are not objects (e.g. `null`, `[]`, `3`) are skipped, so
    consumers can rely on every yielded record supporting `.get(...)`.

    Parameters
    ----------
    year : int
        Year to stream.

    Yields
    ------
    dict
        Parsed JSON records from the year's JSONL file. Nothing is yielded
        if the file does not exist.
    """
    path = os.path.join(ART_DIR_TMPL.format(year=year), ART_FILE)
    if not os.path.exists(path):
        return
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError; RecursionError covers
                # pathologically nested input. Skip the bad line, keep going.
                continue
            # The yield sits outside the try so only parsing errors are
            # swallowed, never exceptions from other code.
            if isinstance(record, dict):
                yield record


def rebuild_cpr_lists_from_articles(pairs: dict[str, list[tuple[str, str]]]) -> dict[str, list[float]]:
    """
    Rebuild CPR lists for Top/Bottom FCA pairs by scanning yearly article JSONLs.

    For every article in YEARS whose de-duplicated `fields` list contains at
    least one requested pair, one CPR value is appended to that group's list.
    CPR is the record's `citation_per_reference_ratio` when present, otherwise
    `cited_by_count / referenced_works_count` (a missing or zero reference
    count is treated as 1, a missing citation count as 0).

    Records are skipped, rather than aborting the scan, when they are not
    JSON objects, when `fields` is not a list, or when the CPR cannot be
    converted to a finite float (a single NaN would blank the boxplot
    statistics of the whole group).

    Parameters
    ----------
    pairs : dict
        pairs["top"]   -> list of (field_i, field_j) tuples
        pairs["bottom"]-> list of (field_i, field_j) tuples
        Pairs may be given in either order; they are canonicalised here.

    Returns
    -------
    dict
        {"top": [cpr...], "bottom": [cpr...]}
    """
    labels = ("top", "bottom")
    # Canonicalise the requested pairs so (j, i) input still matches the
    # sorted pairs generated per article below.
    pair_sets = {
        label: {safe_pair(a, b) for a, b in (pairs.get(label) or [])}
        for label in labels
    }
    out = {label: [] for label in labels}
    if not any(pair_sets.values()):
        # Nothing can match: avoid reading every article file for no result.
        return out

    for y in YEARS:
        for obj in stream_articles(y):
            if not isinstance(obj, dict):
                continue

            raw_fields = obj.get("fields") or []
            if not isinstance(raw_fields, (list, tuple)):
                continue
            names = (str(x).strip() for x in raw_fields)
            fields = sorted({name for name in names if name})
            if len(fields) < 2:
                continue

            # `fields` is sorted and unique, so every combination is already
            # in the canonical (smaller, larger) order used by safe_pair.
            paper_pairs = set(itertools.combinations(fields, 2))
            hits = [label for label in labels if paper_pairs & pair_sets[label]]
            if not hits:
                continue

            cpr = obj.get("citation_per_reference_ratio")
            try:
                if cpr is None:
                    ref = obj.get("referenced_works_count") or 1
                    cited = obj.get("cited_by_count") or 0
                    cpr = float(cited) / float(ref)
                else:
                    cpr = float(cpr)
            except (TypeError, ValueError, ZeroDivisionError):
                continue
            if not np.isfinite(cpr):
                continue

            for label in hits:
                out[label].append(cpr)
    return out


def _copy_partc_plot(src: str | None, dst: str) -> bool:
    """Copy a pre-rendered Part C image to `dst`; return True only if it was copied."""
    if not src or not os.path.exists(src):
        return False
    import shutil  # function-scope import; shutil is only needed on this path

    shutil.copy2(src, dst)
    return True


def _split_top_bottom(df: pd.DataFrame, n: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return the `n` highest- and `n` lowest-FCA rows of `df`, never sharing a row."""
    ranked = df.sort_values("FCA_slope", ascending=False)
    # Cap at half the table so a pair cannot be in both groups.
    k = min(max(int(n), 0), len(ranked) // 2)
    return ranked.head(k), ranked.tail(k)


def _save_top_bottom_boxplot(groups, ylabel: str, title: str, out_path: str) -> None:
    """Save a two-box (Top vs Bottom FCA) boxplot with mean markers to `out_path`."""
    plt.figure(figsize=(6.6, 4.8))
    try:
        plt.boxplot(list(groups), showmeans=True)
        # Tick labels are set with xticks instead of boxplot(labels=...), which
        # Matplotlib 3.9 deprecated in favour of tick_labels=. Boxes are
        # drawn at positions 1..N by default.
        plt.xticks([1, 2], ["Top-FCA pairs", "Bottom-FCA pairs"])
        plt.ylabel(ylabel)
        plt.title(title)
        plt.grid(axis="y", alpha=.3)
        plt.tight_layout()
        plt.savefig(out_path, dpi=200)
    finally:
        plt.close()


def plots_from_partC(out_dir: str, top_n_pairs: int = 50) -> list[str]:
    """
    Generate Part C figures (scatter + boxplots). Copies from Part C if available,
    otherwise computes from CSVs / articles as needed.

    Expects (all optional; a figure is skipped when its inputs are missing):
      novelty_AB_*/plots/scatter_fca_vs_bridge_share.png
      novelty_AB_*/plots/box_impact_top_vs_bottom.png
      novelty_AB_*/link_fca_bes_per_pair.csv
      field_convergence_A_*/fca_summary.csv

    Saves:
      out_dir/scatter_fca_vs_bridge_share.png
      out_dir/box_bridge_share_top_vs_bottom.png
      out_dir/box_cpr_top_vs_bottom.png

    Top and bottom groups are taken from rows with a non-missing FCA slope and
    are capped at half of the available rows, so the two groups never overlap
    when fewer than 2 * `top_n_pairs` pairs exist. A boxplot is skipped when
    fewer than two usable rows are available.

    Parameters
    ----------
    out_dir : str
        Output directory.
    top_n_pairs : int, default=50
        Number of top & bottom FCA pairs to compare.

    Returns
    -------
    list[str]
        Paths of images that were created/copied.
    """
    created = []
    c_dir = find_latest_dir("novelty_AB_")
    a_dir = find_latest_dir("field_convergence_A_")

    # The per-pair link table feeds both figure 4 (fallback) and figure 5,
    # so it is loaded and validated once.
    link_df = None
    if c_dir:
        link_path = os.path.join(c_dir, "link_fca_bes_per_pair.csv")
        if os.path.exists(link_path):
            candidate = pd.read_csv(link_path)
            if {"FCA_slope", "share_bridge_authors"} <= set(candidate.columns):
                link_df = candidate

    # (4) Scatter: FCA vs bridge share
    out = os.path.join(out_dir, "scatter_fca_vs_bridge_share.png")
    scatter_src = os.path.join(c_dir, "plots", "scatter_fca_vs_bridge_share.png") if c_dir else None
    if _copy_partc_plot(scatter_src, out):
        created.append(out)
    elif link_df is not None:
        plt.figure(figsize=(7, 5))
        try:
            plt.scatter(link_df["FCA_slope"], link_df["share_bridge_authors"])
            plt.grid(alpha=.3)
            plt.xlabel("FCA_slope")
            plt.ylabel("Bridge-author share")
            plt.title("FCA vs Bridge-author share (per pair)")
            plt.tight_layout()
            plt.savefig(out, dpi=200)
        finally:
            plt.close()
        created.append(out)
        print(f"✅ saved {out}")

    # (5) Boxplot: Bridge share (Top vs Bottom FCA)
    if link_df is not None:
        usable = link_df.dropna(subset=["FCA_slope", "share_bridge_authors"])
        top, bot = _split_top_bottom(usable, top_n_pairs)
        if len(top):
            out = os.path.join(out_dir, "box_bridge_share_top_vs_bottom.png")
            _save_top_bottom_boxplot(
                [top["share_bridge_authors"].values, bot["share_bridge_authors"].values],
                "Bridge-author share",
                "Bridge share: Top vs Bottom FCA",
                out,
            )
            created.append(out)

    # (6) Boxplot: CPR (Top vs Bottom FCA)
    # Prefer copying from Part C if available
    out = os.path.join(out_dir, "box_cpr_top_vs_bottom.png")
    box_src = os.path.join(c_dir, "plots", "box_impact_top_vs_bottom.png") if c_dir else None
    if _copy_partc_plot(box_src, out):
        created.append(out)
    elif a_dir:
        # Rebuild CPR lists from Part A pairs + article JSONLs
        fca_path = os.path.join(a_dir, "fca_summary.csv")
        if os.path.exists(fca_path):
            df_fca = pd.read_csv(fca_path)
            # Validate columns before sorting; sorting on a missing column
            # would raise KeyError.
            if {"field_i", "field_j", "FCA_slope"} <= set(df_fca.columns):
                # NaN slopes sort last and would otherwise be mistaken for
                # the lowest-FCA pairs.
                df_fca = df_fca.dropna(subset=["field_i", "field_j", "FCA_slope"])
                top, bot = _split_top_bottom(df_fca, top_n_pairs)
                if len(top):
                    top_pairs = [safe_pair(i, j) for i, j in zip(top["field_i"], top["field_j"])]
                    bot_pairs = [safe_pair(i, j) for i, j in zip(bot["field_i"], bot["field_j"])]
                    cpr_lists = rebuild_cpr_lists_from_articles({"top": top_pairs, "bottom": bot_pairs})

                    _save_top_bottom_boxplot(
                        [cpr_lists.get("top", []), cpr_lists.get("bottom", [])],
                        "Citations per Reference (CPR)",
                        "Impact: Top vs Bottom FCA",
                        out,
                    )
                    created.append(out)
                    print(f"✅ saved {out}")

    return created



def plot_pvals_histogram(out_dir: str) -> str | None:
    """
    Plot histograms of p-values and FDR q-values from the null-model test (Part E).

    Expects:
      e_significance_*/fca_significance.csv
      with columns 'p_one_sided' and 'q_fdr'.

    Saves:
      out_dir/pvals_histogram_E.png

    Parameters
    ----------
    out_dir : str
        Output directory.

    Returns
    -------
    str | None
        Output file path, or None if the directory, file or either column is
        missing, or if neither column holds any numeric value.
    """
    e_dir = find_latest_dir("e_significance_")
    if not e_dir:
        return None

    sig_path = os.path.join(e_dir, "fca_significance.csv")
    if not os.path.exists(sig_path):
        return None

    df = pd.read_csv(sig_path)
    # Check columns explicitly: df.get() on a missing column returns None,
    # which pd.to_numeric turns into a scalar NaN that has no .dropna().
    if {"p_one_sided", "q_fdr"} - set(df.columns):
        return None

    p = pd.to_numeric(df["p_one_sided"], errors="coerce").dropna()
    q = pd.to_numeric(df["q_fdr"], errors="coerce").dropna()
    if p.empty and q.empty:
        return None

    plt.figure(figsize=(12, 4.6))
    try:
        plt.subplot(1, 2, 1)
        plt.hist(p, bins=40)
        plt.title("Null-model p-values (one-sided)")
        plt.xlabel("p")
        plt.ylabel("#pairs")

        plt.subplot(1, 2, 2)
        plt.hist(q, bins=40)
        plt.title("FDR q-values (BH)")
        plt.xlabel("q")
        plt.ylabel("#pairs")

        plt.tight_layout()
        out = os.path.join(out_dir, "pvals_histogram_E.png")
        plt.savefig(out, dpi=200)
    finally:
        # Release the figure even if rendering or saving fails.
        plt.close()
    return out



def main():
    """
    Produce every paper figure into a fresh timestamped directory.

    Steps, in order (the return value of each is ignored):
      - Fig 1: FCA heatmap (A)
      - Fig 2: Top FCA pair trends (A)
      - Fig 3: BES distribution (B)
      - Figs 4–6: Part C plots (copy or fallback)
      - Fig 7: p/q histograms (E)
    """
    out_dir = ensure_outdir()
    print(f"\n📁 Output: {out_dir}\n")

    # (figure builder, extra keyword arguments) in figure order.
    figure_steps = (
        (plot_fca_heatmap, {}),                    # Fig 1
        (plot_top_pair_trends, {"top_k": 6}),      # Fig 2
        (plot_bes_distribution, {}),               # Fig 3
        (plots_from_partC, {"top_n_pairs": 50}),   # Figs 4–6
        (plot_pvals_histogram, {}),                # Fig 7
    )
    for build_figure, options in figure_steps:
        build_figure(out_dir, **options)



# Run the figure pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
