# In [None]:

import os
import json
import math
import itertools
from collections import Counter, defaultdict
from datetime import datetime

import numpy as np
import pandas as pd

# Publication years to process: 2018 through 2025 inclusive (the range end is exclusive).
YEARS = list(range(2018, 2026))  
# Name of the per-year input folder; "{year}" is filled in with the publication year.
INPUT_FOLDER_TEMPLATE = "articles_{year}_new"  
# Name of the JSONL file that is read from inside each yearly folder.
INPUT_FILE_NAME = "all_articles_enhanced.jsonl"

# Number of most frequent fields kept per year before pair weights are computed.
TOP_K_FIELDS_PER_YEAR = 200 

# Prefix of the output directory name; a run timestamp is appended to it.
OUTPUT_ROOT = "field_convergence_A"

def read_year_data(year):
    """
    Read the yearly JSONL file for a given year.

    The file is looked up at
    ``INPUT_FOLDER_TEMPLATE.format(year=year) / INPUT_FILE_NAME``.
    Reading is best-effort: blank lines are ignored, and lines that are not
    valid JSON or do not decode to a JSON object are skipped. When any line
    is skipped, a one-line summary is printed so data loss is visible.

    Parameters
    ----------
    year : int
        The publication year to load.

    Returns
    -------
    list of dict
        One dict per successfully parsed line, in file order.
        Empty list when the file does not exist (a warning is printed).
    """
    folder = INPUT_FOLDER_TEMPLATE.format(year=year)
    path = os.path.join(folder, INPUT_FILE_NAME)
    if not os.path.exists(path):
        print(f"⚠ קובץ לא נמצא לשנה {year}: {path}")
        return []

    items = []
    skipped = 0
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise make the first record fail to parse.
    with open(path, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError):
                # Malformed (or pathologically nested) line: skip it, but count it.
                skipped += 1
                continue
            if not isinstance(obj, dict):
                # Valid JSON that is not an object cannot be an article record.
                skipped += 1
                continue
            items.append(obj)

    if skipped:
        print(f"⚠ {year}: skipped {skipped} malformed or non-object line(s) in {path}")
    return items


def extract_paper_fields(item):
    """
    Extract the list of unique research fields from an article record.

    The value under the ``"fields"`` key is read. Each entry is converted to
    a string and stripped of surrounding whitespace; falsy entries and
    entries that are empty after stripping are dropped.

    Parameters
    ----------
    item : dict
        Article record; only its ``"fields"`` key is used.

    Returns
    -------
    list of str
        Sorted list of unique field names. Empty when the key is missing
        or its value is falsy.
    """
    raw = item.get("fields") or []
    # A bare string would otherwise be iterated character by character and
    # yield one-letter "fields"; treat it as a single field name instead.
    if isinstance(raw, str):
        raw = [raw]

    names = set()
    for value in raw:
        if not value:
            continue
        name = str(value).strip()
        if name:
            names.add(name)
    return sorted(names)


def cosine_weight(co_ij, c_i, c_j, eps=1e-12):
    """
    Compute cosine-normalized co-occurrence weight for a field pair.

    The weight is ``co_ij / (sqrt(c_i * c_j) + eps)``.

    Parameters
    ----------
    co_ij : int
        Number of co-occurrences of fields i and j in the same articles.
    c_i : int
        Number of articles containing field i.
    c_j : int
        Number of articles containing field j.
    eps : float, optional
        Small constant added to the denominator (default=1e-12).

    Returns
    -------
    float
        Cosine-normalized co-occurrence weight, or 0.0 when either field
        count is zero or negative.
    """
    # With a non-positive count the denominator would collapse to `eps`
    # alone and the "weight" would blow up to about co_ij / eps; a field
    # that never occurs has no meaningful association, so report 0.0.
    if c_i <= 0 or c_j <= 0:
        return 0.0
    return co_ij / (math.sqrt(c_i * c_j) + eps)


def linear_slope(xs, ys):
    """
    Fit a straight line to (xs, ys) by least squares.

    Parameters
    ----------
    xs : array-like
        Independent variable values (e.g., years).
    ys : array-like
        Dependent variable values (e.g., pairwise weights).

    Returns
    -------
    slope : float
        Slope of the fitted line, or NaN when fewer than two x values
        are given.
    intercept : float
        Intercept of the fitted line, or NaN when fewer than two x values
        are given.
    """
    n_points = len(xs)
    if n_points >= 2:
        # Degree-1 polyfit returns coefficients highest power first.
        coeffs = np.polyfit(xs, ys, 1)
        return float(coeffs[0]), float(coeffs[1])

    # Not enough points to define a line.
    return float("nan"), float("nan")


def ensure_output_dir():
    """
    Create the output directory for the current run.

    The directory name is OUTPUT_ROOT followed by an underscore and the
    current local time formatted as ``YYYYMMDD_HHMMSS``. An already
    existing directory with that name is reused.

    Returns
    -------
    str
        Path of the output directory.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target = "{}_{}".format(OUTPUT_ROOT, stamp)
    os.makedirs(target, exist_ok=True)
    return target


def build_year_field_counts(year_items):
    """
    Tally fields and field pairs over one year's articles.

    Parameters
    ----------
    year_items : list of dict
        Article records for a single year.

    Returns
    -------
    field_count : collections.Counter
        For each field, the number of articles that list it.
    pair_count : collections.Counter
        For each alphabetically ordered pair ``(a, b)`` of fields, the
        number of articles that list both.
    """
    field_count, pair_count = Counter(), Counter()

    for record in year_items:
        names = extract_paper_fields(record)
        if not names:
            continue

        field_count.update(names)
        # Sorting makes (a, b) a canonical key; a single-field article
        # produces no combinations and so adds nothing here.
        pair_count.update(itertools.combinations(sorted(names), 2))

    return field_count, pair_count


def limit_to_top_k(field_count, pair_count, top_k):
    """
    Restrict field and pair counts to the `top_k` most frequent fields.

    Parameters
    ----------
    field_count : collections.Counter
        Field frequencies for the year.
    pair_count : collections.Counter
        Field-pair co-occurrence frequencies for the year.
    top_k : int or None
        How many of the most frequent fields to keep. With None the two
        input counters are returned unchanged.

    Returns
    -------
    field_count_limited : collections.Counter
        Counts of the retained fields only.
    pair_count_limited : collections.Counter
        Counts of the pairs whose two fields were both retained.
    """
    if top_k is None:
        return field_count, pair_count

    kept = {name for name, _ in field_count.most_common(top_k)}

    fields_out = Counter()
    for name, n in field_count.items():
        if name in kept:
            fields_out[name] = n

    pairs_out = Counter()
    for pair, n in pair_count.items():
        left, right = pair
        if left in kept and right in kept:
            pairs_out[pair] = n

    return fields_out, pairs_out


def _collect_yearly_weights():
    """
    Build the per-year table of field-pair weights.

    Returns
    -------
    list of dict
        One row per (year, field pair) holding the two field counts, the
        co-occurrence count and the cosine-normalized weight. Years for
        which `read_year_data` returns no records contribute no rows.
    """
    rows = []
    for year in YEARS:
        items = read_year_data(year)
        if not items:
            continue

        field_count, pair_count = build_year_field_counts(items)
        field_count, pair_count = limit_to_top_k(
            field_count, pair_count, TOP_K_FIELDS_PER_YEAR
        )

        for (a, b), co_ij in pair_count.items():
            c_i = field_count.get(a, 0)
            c_j = field_count.get(b, 0)
            rows.append({
                "year": year,
                "field_i": a,
                "field_j": b,
                "count_i": c_i,
                "count_j": c_j,
                "coocc": co_ij,
                "weight_cosine": cosine_weight(co_ij, c_i, c_j),
            })
    return rows


def _fit_pair_trends(df_weights, min_years=3):
    """
    Fit a linear trend of the cosine weight over years for each field pair.

    Only years in which a pair has a row take part in its fit; years
    without a row are absent from the fit rather than counted as zero.

    Parameters
    ----------
    df_weights : pandas.DataFrame
        Table with columns "year", "field_i", "field_j", "weight_cosine".
    min_years : int, optional
        Minimum number of yearly rows a pair needs to be fitted (default=3).

    Returns
    -------
    list of dict
        One summary row per fitted pair (coverage, first/last/mean weight,
        slope and intercept).
    """
    rows = []
    for (a, b), sub in df_weights.groupby(["field_i", "field_j"]):
        if len(sub) < min_years:
            continue

        sub = sub.sort_values("year")
        xs = sub["year"].values.astype(float)
        ys = sub["weight_cosine"].values.astype(float)
        slope, intercept = linear_slope(xs, ys)

        rows.append({
            "field_i": a,
            "field_j": b,
            "years_covered": len(sub),
            "year_first": int(sub["year"].min()),
            "year_last": int(sub["year"].max()),
            "weight_first": float(sub.iloc[0]["weight_cosine"]),
            "weight_last": float(sub.iloc[-1]["weight_cosine"]),
            "weight_mean": float(sub["weight_cosine"].mean()),
            "FCA_slope": slope,
            "FCA_intercept": intercept,
        })
    return rows


def main():
    """
    Main execution pipeline for Field Convergence Acceleration (FCA).

    Steps:
    ------
    1. For each year in YEARS:
       - Read the article data.
       - Count field and field-pair occurrences (Top-K fields only).
       - Compute cosine-normalized weights.

    2. Across years:
       - Fit linear slopes for each field pair with ≥3 years of data.
       - Record the slope (FCA) and related statistics.

    3. Save results:
       - `per_year_field_weights.csv` (written whenever any weight exists)
       - `fca_summary.csv`
       - `top_converging_pairs.csv` (the summary sorted by slope, descending)

    Notes:
    ------
    - Output is stored in a directory named OUTPUT_ROOT plus a timestamp
      suffix; it is created only once there is something to write.
    - When no pair has enough years for a fit, only the per-year weights
      file is written and a message is printed.
    """
    rows_weights = _collect_yearly_weights()
    if not rows_weights:
        print("לא נמצאו נתונים לחישוב משקלים. ודאי שהקבצים קיימים במבנה התקיות הנדרש.")
        return

    df_weights = pd.DataFrame(rows_weights)

    # Create the directory only now, so a run without data leaves no empty folder.
    out_dir = ensure_output_dir()

    # Save the per-year weights first: they are valid output even when no
    # pair has enough years for a trend fit.
    path_weights = os.path.join(out_dir, "per_year_field_weights.csv")
    df_weights.to_csv(path_weights, index=False, encoding="utf-8")

    fca_rows = _fit_pair_trends(df_weights, min_years=3)
    if not fca_rows:
        print(
            "No field pair has at least 3 years of data; "
            f"only the per-year weights were saved to {path_weights}"
        )
        return

    df_fca = pd.DataFrame(fca_rows)
    df_top = df_fca.sort_values("FCA_slope", ascending=False).reset_index(drop=True)

    path_fca = os.path.join(out_dir, "fca_summary.csv")
    path_top = os.path.join(out_dir, "top_converging_pairs.csv")
    df_fca.to_csv(path_fca, index=False, encoding="utf-8")
    df_top.to_csv(path_top, index=False, encoding="utf-8")

if __name__ == "__main__":
    main()
