# In [None]:

import os
import json
from collections import defaultdict, Counter
from datetime import datetime

import numpy as np
import pandas as pd
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

# Publication years to analyse: 2018 through 2025 inclusive.
YEARS = list(range(2018, 2026))
# Name of the per-year input folder; "{year}" is substituted with the year.
INPUT_FOLDER_TEMPLATE = "articles_{year}_new"     
# JSONL file expected inside each per-year input folder.
INPUT_FILE_NAME = "all_articles_enhanced.jsonl"  
# Prefix of the timestamped output directory name.
OUTPUT_ROOT = "bridge_emergence_B"

# Minimum unweighted degree an author needs in a given year for that year
# to count as a valid year when computing the author's BES.
MIN_DEGREE_PER_YEAR = 1

# Minimum number of papers an author needs in a given year to be kept in
# that year's co-authorship graph.
MIN_PAPERS_PER_AUTHOR_PER_YEAR = 1  

def read_year_items(year):
    """
    Load the JSONL file for a given year.

    The file is looked up at
    ``INPUT_FOLDER_TEMPLATE.format(year=year) / INPUT_FILE_NAME``.
    Blank lines are ignored. Lines that cannot be parsed as JSON, or that
    decode to something other than a JSON object, are skipped; the number
    of skipped lines is reported once per file instead of being dropped
    silently.

    Parameters
    ----------
    year : int
        Publication year.

    Returns
    -------
    list of dict
        Article records for that year, in file order. Empty when the file
        does not exist (a warning is printed in that case).
    """
    folder = INPUT_FOLDER_TEMPLATE.format(year=year)
    path = os.path.join(folder, INPUT_FILE_NAME)
    if not os.path.exists(path):
        print(f"⚠ File not found for year {year}: {path}")
        return []

    items = []
    skipped = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError; RecursionError covers
                # pathologically nested input. Loading stays best-effort.
                skipped += 1
                continue
            if not isinstance(obj, dict):
                # Downstream code calls .get() on every record, so a bare
                # number/string/array line cannot be used as an article.
                skipped += 1
                continue
            items.append(obj)

    if skipped:
        print(f"⚠ Skipped {skipped} malformed line(s) for year {year}: {path}")
    return items


def extract_authors_from_item(item):
    """
    Extract a list of author IDs from an article record.

    Only entries of the 'authors' list that are dicts with a truthy
    'author_id' contribute. A missing or null 'authors' field, a field that
    is not a list, and entries that are not dicts (e.g. ``None`` or a bare
    name string) are ignored rather than raising.

    Parameters
    ----------
    item : dict
        Article object; its 'authors' field, when present, is expected to
        be a list of dicts carrying an 'author_id' key.

    Returns
    -------
    list
        Author IDs for this article, de-duplicated, in order of first
        appearance.
    """
    raw_authors = item.get("authors") or []
    if not isinstance(raw_authors, (list, tuple)):
        return []

    author_ids = []
    for entry in raw_authors:
        if not isinstance(entry, dict):
            continue
        aid = entry.get("author_id")
        if aid:
            author_ids.append(aid)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(author_ids))


def build_coauthor_graph(year_items):
    """
    Assemble the co-authorship graph for one year's worth of articles.

    Each author becomes a node. Two authors are linked when they appear on
    the same article, and the edge's 'weight' attribute counts how many
    articles they share.

    Parameters
    ----------
    year_items : list of dict
        Article records; authors are taken from each record via
        extract_authors_from_item.

    Returns
    -------
    G : networkx.Graph
        Undirected graph of authors with integer 'weight' on every edge.
    author_paper_count : collections.Counter
        Number of articles each author appears on.
    """
    G = nx.Graph()
    author_paper_count = Counter()

    for record in year_items:
        team = extract_authors_from_item(record)

        # Every listed author is counted and registered as a node, so the
        # author of a single-author paper is not lost from the graph.
        author_paper_count.update(team)
        G.add_nodes_from(team)

        # Visit each unordered pair once; an article with fewer than two
        # authors simply produces no pairs.
        for pos, u in enumerate(team):
            for v in team[pos + 1:]:
                if not G.has_edge(u, v):
                    G.add_edge(u, v, weight=1)
                else:
                    G[u][v]['weight'] += 1

    return G, author_paper_count


def participation_coefficient(G, communities):
    """
    Compute every node's participation coefficient.

    For a node with unweighted degree k, the value is
    ``1 - sum_c (k_c / k) ** 2`` where ``k_c`` is the number of its
    neighbours assigned to community ``c``. Neighbours that belong to no
    listed community are grouped under the label -1. Nodes of degree zero
    get NaN.

    Parameters
    ----------
    G : networkx.Graph
        Co-authorship graph for a given year.
    communities : list of set
        Node sets, one per detected community; the position in the list is
        used as the community ID.

    Returns
    -------
    P : dict
        Mapping from node -> participation coefficient (NaN for isolates).
    comm_id_map : dict
        Mapping from node -> community ID.
    """
    comm_id_map = {
        member: label
        for label, group in enumerate(communities)
        for member in group
    }

    P = {}
    for author in G.nodes():
        degree = G.degree(author, weight=None)
        if degree == 0:
            # No neighbours: the coefficient is undefined.
            P[author] = np.nan
            continue

        links_per_comm = Counter(
            comm_id_map.get(neighbour, -1) for neighbour in G.neighbors(author)
        )

        concentration = 0.0
        for links in links_per_comm.values():
            share = links / degree
            concentration += share * share
        P[author] = 1.0 - concentration

    return P, comm_id_map


def ensure_output_dir():
    """
    Create a timestamped output directory and return its path.

    The directory is named ``<OUTPUT_ROOT>_<YYYYmmdd_HHMMSS>`` using the
    local time at the moment of the call. An already existing directory of
    that name is reused without error.

    Returns
    -------
    str
        Path of the output directory.
    """
    stamp = format(datetime.now(), "%Y%m%d_%H%M%S")
    out_dir = f"{OUTPUT_ROOT}_{stamp}"
    os.makedirs(out_dir, exist_ok=True)
    return out_dir


def _analyze_year(year):
    """
    Build one result row per retained author for a single year.

    Parameters
    ----------
    year : int
        Publication year to process.

    Returns
    -------
    list of dict
        One row per author kept in that year's graph (degree, paper count,
        participation coefficient, community ID and graph-level counts).
        Empty when the year has no records or no author reaches
        MIN_PAPERS_PER_AUTHOR_PER_YEAR.
    """
    items = read_year_items(year)
    if not items:
        return []

    G, papers_cnt = build_coauthor_graph(items)

    keep_nodes = {
        a for a, c in papers_cnt.items()
        if c >= MIN_PAPERS_PER_AUTHOR_PER_YEAR
    }
    # Filter unconditionally: when nobody reaches the threshold the year is
    # skipped instead of falling back to the unfiltered graph.
    if not keep_nodes:
        return []
    G = G.subgraph(keep_nodes).copy()
    if G.number_of_nodes() == 0:
        return []

    if G.number_of_edges() == 0:
        # Modularity is undefined without edges, so treat every author as
        # their own community rather than calling the optimiser.
        # NOTE(review): some networkx releases may raise on edgeless input —
        # confirm against the installed version.
        communities = [{n} for n in G.nodes()]
    else:
        # NOTE(review): no `weight` argument is passed, so community
        # detection presumably ignores edge weights, matching the
        # unweighted degree used below — confirm this is intended.
        communities = list(greedy_modularity_communities(G))

    P_map, comm_id_map = participation_coefficient(G, communities)
    deg_map = dict(G.degree(weight=None))

    # Graph-level figures are identical for every row of the year.
    num_communities = len(communities)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()

    return [
        {
            "year": year,
            "author_id": a,
            "degree": deg_map.get(a, 0),
            "papers": papers_cnt.get(a, 0),
            "participation": P_map.get(a, np.nan),
            "community_id": comm_id_map.get(a, -1),
            "num_communities_year": num_communities,
            "num_nodes_year": num_nodes,
            "num_edges_year": num_edges,
        }
        for a in G.nodes()
    ]


def _bes_rows(yearly_rows):
    """
    Derive one BES summary row per author from the per-year rows.

    A year is valid for an author when their degree in that year is at
    least MIN_DEGREE_PER_YEAR and their participation coefficient is not
    missing. Authors with fewer than two valid years are left out.

    Parameters
    ----------
    yearly_rows : list of dict
        Rows as produced by _analyze_year.

    Returns
    -------
    list of dict
        Summary rows, in order of each author's first appearance in
        yearly_rows.
    """
    history = defaultdict(dict)  # author_id -> {year: per-year row}
    for row in yearly_rows:
        history[row["author_id"]][row["year"]] = row

    bes_rows = []
    for a, by_year in history.items():
        ys = sorted(
            y for y, row in by_year.items()
            if row["degree"] >= MIN_DEGREE_PER_YEAR
            and not pd.isna(row["participation"])
        )
        if len(ys) < 2:
            continue

        first_y, last_y = ys[0], ys[-1]
        P_first = by_year[first_y]["participation"]
        P_last = by_year[last_y]["participation"]

        bes_rows.append({
            "author_id": a,
            "year_first": int(first_y),
            "year_last": int(last_y),
            "P_first": float(P_first),
            "P_last": float(P_last),
            "BES_delta": float(P_last - P_first),
            "years_active": len(ys),
            "mean_degree_active_years": float(np.mean([by_year[y]["degree"] for y in ys])),
            "mean_papers_active_years": float(np.mean([by_year[y]["papers"] for y in ys])),
        })
    return bes_rows


def main(top_n=1000):
    """
    Main pipeline for computing Bridge Emergence Score (BES).

    Steps:
    ------
    1. For each year in YEARS:
       - Build the co-author graph.
       - Keep authors with at least MIN_PAPERS_PER_AUTHOR_PER_YEAR papers.
       - Detect communities (Greedy Modularity).
       - Compute the participation coefficient for each author.
       - Store per-year results.

    2. For each author across years:
       - Identify valid active years (degree >= MIN_DEGREE_PER_YEAR and a
         defined participation coefficient).
       - Compute BES = P_last - P_first over the first and last valid year.

    3. Save results into the directory returned by ensure_output_dir():
       - per_year_author_participation.csv
       - bes_summary.csv (sorted by BES_delta, descending)
       - top_emerging_bridges.csv (first `top_n` rows of the summary)

    The per-year table is written as soon as it exists, so it is kept even
    when no author has two valid years and the BES files cannot be built.

    Parameters
    ----------
    top_n : int, optional
        Number of highest-BES authors written to top_emerging_bridges.csv.
        Defaults to 1000.

    Returns
    -------
    None
    """
    out_dir = ensure_output_dir()

    yearly_rows = []
    for year in YEARS:
        yearly_rows.extend(_analyze_year(year))

    if not yearly_rows:
        print("No yearly rows produced. Check input folders/files.")
        return

    df_yearly = pd.DataFrame(yearly_rows)
    df_yearly = df_yearly.sort_values(["author_id", "year"]).reset_index(drop=True)

    # Save the per-year table first so it survives the early return below.
    per_year_path = os.path.join(out_dir, "per_year_author_participation.csv")
    df_yearly.to_csv(per_year_path, index=False, encoding="utf-8")

    bes_rows = _bes_rows(yearly_rows)
    if not bes_rows:
        print("No authors with ≥2 valid years. Consider lowering thresholds.")
        return

    df_bes = pd.DataFrame(bes_rows)
    df_bes = df_bes.sort_values("BES_delta", ascending=False).reset_index(drop=True)

    # Save the BES outputs.
    bes_path = os.path.join(out_dir, "bes_summary.csv")
    top_path = os.path.join(out_dir, "top_emerging_bridges.csv")

    df_bes.to_csv(bes_path, index=False, encoding="utf-8")
    # Top emerging bridges (ranked by BES_delta).
    df_bes.head(top_n).to_csv(top_path, index=False, encoding="utf-8")


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
