In [None]:
import json
from pathlib import Path
from typing import Dict, List, Any, Iterable, Tuple
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt

### Summary annotations of papers

In [None]:
# link_pdfs_to_categories.py
import json, gzip, csv, re
from pathlib import Path

# --- EDIT THESE ---
# Root folder that is searched recursively for *.pdf files.
PDF_ROOT   = Path(r"C:\Users\Adam\Documents\idp\arxiv_pdf")  # e.g., "/Users/you/Downloads/arxiv/pdf"
# Metadata file, parsed as one JSON object per line; a name ending in ".gz" is read through gzip.
META_PATH  = Path(r"C:\Users\Adam\Documents\idp\arxiv\arxiv-metadata-oai-snapshot.json")  # .json or .json.gz
# Output CSV that links each local PDF to its arXiv ID and categories.
MANIFEST   = Path(r"C:\Users\Adam\Documents\idp\arxiv\manifest.csv")
# -------------------

def open_lines(p: Path):
    """Open *p* as a UTF-8 text stream, reading ``.gz`` files through gzip.

    Compression is detected from the file name only (case-insensitive
    ``.gz`` suffix). Bytes that cannot be decoded as UTF-8 are dropped
    rather than raising. The caller is responsible for closing the
    returned file object (e.g. by using it in a ``with`` statement).
    """
    path = Path(p)
    # Both openers accept the same mode/encoding arguments, so pick one
    # and make a single call.
    opener = gzip.open if str(path).lower().endswith(".gz") else open
    return opener(path, "rt", encoding="utf-8", errors="ignore")

def primary_category(categories: str) -> str:
    """Pick one representative category from a whitespace-separated list.

    The first token containing a dot (a "modern" category such as
    ``cs.CV``) wins; if no token has a dot, the first token is returned.
    An empty or ``None`` input yields an empty string.
    """
    tokens = (categories or "").split()
    if not tokens:
        return ""
    # Prefer a dotted category; otherwise fall back to the first token.
    return next((tok for tok in tokens if "." in tok), tokens[0])

def load_id_to_cats(meta_path: Path):
    """Build a dict mapping arXiv ID -> raw ``categories`` value.

    The metadata file is read line by line, with each non-blank line
    parsed as a JSON object. The ID is taken from the ``id`` key, or
    from ``paper_id`` when ``id`` is missing or empty; records without
    an ID are skipped. A record lacking ``categories`` maps to ``""``.
    When an ID appears more than once, the last record wins.
    """
    categories_by_id = {}
    with open_lines(meta_path) as stream:
        for raw in stream:
            if raw.strip():
                record = json.loads(raw)
                paper_id = (record.get("id") or record.get("paper_id") or "").strip()
                if paper_id:
                    categories_by_id[paper_id] = record.get("categories", "")
    return categories_by_id

def arxiv_id_from_path(pdf_path: Path, pdf_root: Path) -> str:
    """
    Reconstruct arXiv ID from the local PDF path under pdf_root.

    A trailing version suffix on the file name ('v1', 'v2', ...) is
    dropped in every layout, so the result is the version-less ID.

    Handles:
      - modern layout: pdf/2401/2401.01234.pdf -> '2401.01234'
      - legacy layout: pdf/cond-mat/9609001.pdf -> 'cond-mat/9609001'
      - legacy w/ deeper trees (rare): join subdirs (except extension)
      - single file directly under pdf_root: the file stem

    Raises:
        ValueError: if pdf_path is not located under pdf_root
            (propagated from Path.relative_to).
    """
    parts = pdf_path.relative_to(pdf_root).parts
    # filename without extension and without a trailing version suffix
    stem = re.sub(r'v\d+$', '', pdf_path.stem)

    if len(parts) >= 2:
        first_dir = parts[0]
        # modern IDs: folder like '2401/2401.01234.pdf'
        # (first part is 4-digit year+month) -> the stem is the whole ID
        if len(first_dir) == 4 and first_dir.isdigit():
            return stem  # '2401.01234'

        # legacy IDs: 'archive/number.pdf' -> 'archive/number'
        # (or deeper legacy variants). Build from the cleaned stem so the
        # version suffix is removed here too, matching the modern branch.
        return "/".join(parts[:-1] + (stem,))

    # fallback: single file under root -> assume modern id equals stem
    return stem

def build_manifest(pdf_root: Path, id2cats: dict, out_csv: Path):
    """Write a CSV linking every PDF under *pdf_root* to its metadata.

    Each ``*.pdf`` found recursively under *pdf_root* becomes one row
    with the columns ``pdf_path``, ``arxiv_id``, ``primary_category``,
    ``categories`` and ``matched_metadata`` ("yes"/"no"). A PDF counts
    as matched when *id2cats* holds a non-empty categories value for
    its reconstructed ID. The parent directory of *out_csv* is created
    if needed and an existing file is overwritten.

    Returns:
        A ``(found, missing)`` tuple of matched and unmatched PDF counts.
    """
    columns = [
        "pdf_path", "arxiv_id", "primary_category", "categories", "matched_metadata",
    ]
    tally = {"yes": 0, "no": 0}

    out_csv.parent.mkdir(parents=True, exist_ok=True)
    with open(out_csv, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()

        for pdf_file in pdf_root.rglob("*.pdf"):
            paper_id = arxiv_id_from_path(pdf_file, pdf_root)
            categories = id2cats.get(paper_id, "")
            status = "yes" if categories else "no"
            tally[status] += 1
            values = (
                str(pdf_file),
                paper_id,
                primary_category(categories),
                categories,
                status,
            )
            writer.writerow(dict(zip(columns, values)))

    return tally["yes"], tally["no"]

def main():
    """Load the metadata, build the manifest CSV, and print progress.

    Reads the module-level PDF_ROOT, META_PATH and MANIFEST settings.
    """
    print("Loading metadata…")
    categories_by_id = load_id_to_cats(META_PATH)
    print(f"Metadata IDs loaded: {len(categories_by_id):,}")

    print("Linking local PDFs to metadata…")
    matched_count, unmatched_count = build_manifest(
        PDF_ROOT, categories_by_id, MANIFEST
    )
    print(f"Done. matched={matched_count:,}  missing={unmatched_count:,}")
    print(f"Manifest → {MANIFEST}")

if __name__ == "__main__":
    # Quick sanity checks on the configured input paths. These raise
    # explicitly instead of using `assert`, because assert statements are
    # removed when Python runs with -O and the checks would be skipped.
    if not PDF_ROOT.exists():
        raise FileNotFoundError(f"PDF_ROOT not found: {PDF_ROOT}")
    if not META_PATH.exists():
        raise FileNotFoundError(f"META_PATH not found: {META_PATH}")
    main()


### Randomly merge documents to test separation model