In [None]:
import json
from pathlib import Path
from typing import Dict, List, Any, Iterable, Tuple
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt

### Summary annotations of papers

In [None]:
# download_arxiv_from_jsonl.py
import gzip, time, re, csv, sys
from urllib.request import Request, urlopen, URLError, HTTPError

# ---- CONFIG (edit these) ----
# Input metadata file, parsed one JSON object per line; a ".gz" suffix is
# opened through gzip instead of as plain text.
JSONL_PATH = Path(r"C:\Users\Adam\Documents\idp\arxiv\arxiv-metadata-oai-snapshot.json")  # or .json.gz
# Directory that receives the PDFs, one "<sanitized arxiv id>.pdf" per paper.
OUT_DIR  = Path("C:\\Users\\Adam\\Documents\\idp\\arxiv\\")
# CSV manifest; opened in append mode, so rows accumulate across runs.
MANIFEST   = Path(r"C:\Users\Adam\Documents\idp\arxiv\manifest.csv")

# Stop once this many PDFs have been downloaded in the current run
# (already-present files do not count toward the limit).
MAX_PAPERS     = 1000     # set None for all (huge)
# Compared against each record's "categories" string; a falsy value disables filtering.
SUBJECT_FILTER = None     # e.g., "cs.CL", or "cs." to capture a family, or None
# Pause after every attempted paper, successful or not.
RATE_SECONDS   = 0.6      # be polite to arXiv
# Maximum number of download attempts for a single PDF.
RETRIES        = 3        # per file
# Passed to urlopen() as its timeout.
TIMEOUT        = 60       # seconds

# -----------------------------
# Create the output directory (and any missing parents) up front; no-op if it exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

def safe_filename(s: str) -> str:
    """Return *s* with every run of disallowed characters collapsed to one ``_``.

    Allowed characters are ASCII letters, digits, ``.``, ``_`` and ``-``;
    anything else (for example the ``/`` in an old-style arXiv id) is replaced.
    """
    disallowed_run = re.compile(r'[^a-zA-Z0-9._-]+')
    return disallowed_run.sub('_', s)

def pdf_url_from_id(arxiv_id: str) -> str:
    """Build the arXiv PDF URL for *arxiv_id*.

    Surrounding whitespace is stripped; the id is otherwise inserted verbatim,
    so both old-style ids (e.g. "supr-con/9609004") and modern ones
    (e.g. "2401.01234") go through the same template.
    """
    trimmed_id = arxiv_id.strip()
    return "https://arxiv.org/pdf/{}.pdf".format(trimmed_id)

def primary_category(categories: str) -> str:
    """Return the first whitespace-separated token of *categories*.

    The first token is treated as the paper's "primary" category. An empty,
    ``None`` or whitespace-only value yields ``""`` instead of raising.
    """
    # Split first, then test the token list: a whitespace-only string is
    # truthy but splits to [], so indexing it directly would raise IndexError.
    tokens = (categories or "").split()
    return tokens[0] if tokens else ""

def open_lines(path: Path):
    """Open *path* for text reading and return the file object.

    A ``.gz`` suffix (case-insensitive) selects ``gzip.open``; anything else
    uses the built-in ``open``. Either way the stream is decoded as UTF-8 with
    undecodable bytes dropped (``errors="ignore"``). The caller closes it.
    """
    is_gzipped = path.suffix.lower() == ".gz"
    opener = gzip.open if is_gzipped else open
    return opener(path, "rt", encoding="utf-8", errors="ignore")

# HTTPException covers truncated/malformed responses (e.g. IncompleteRead),
# which are not OSError subclasses and would otherwise abort the whole run.
from http.client import HTTPException

# Run counters, reported in the summary line at the end.
downloaded = 0
skipped = 0
errors = 0


def _manifest_row(out_file: Path, arx_id: str, url: str, cats: str, status: str) -> dict:
    """Build one manifest record for a paper with the given outcome *status*."""
    return {
        "filename": out_file.name,
        "arxiv_id": arx_id,
        "pdf_url": url,
        "primary_category": primary_category(cats),
        "categories": cats,
        "status": status,
    }


def _matches_subject(cats: str, subject_filter: str) -> bool:
    """Return True when any category token starts with *subject_filter*.

    Matching is per token and by prefix, so "cs." selects the cs family and
    "cs.CL" a single category. A plain substring test on the whole string
    would let "cs." match "physics.*" as well.
    """
    return any(token.startswith(subject_filter) for token in cats.split())


def _download_pdf(url: str, out_file: Path, arx_id: str) -> bool:
    """Fetch *url* into *out_file* with up to RETRIES attempts; return success.

    The payload is written to a sibling ".part" file and renamed only once
    complete, so an interrupted run never leaves a truncated PDF that a later
    run would report as "exists".
    """
    tmp_file = out_file.with_name(out_file.name + ".part")
    for attempt in range(1, RETRIES + 1):
        try:
            req = Request(url, headers={"User-Agent": "arxiv-downloader/1.0"})
            with urlopen(req, timeout=TIMEOUT) as resp, open(tmp_file, "wb") as wout:
                wout.write(resp.read())
            tmp_file.replace(out_file)
            return True
        # OSError is the base of HTTPError, URLError, TimeoutError and the
        # socket/connection errors that can surface from resp.read().
        except (OSError, HTTPException) as e:
            if tmp_file.exists():
                tmp_file.unlink()
            if attempt == RETRIES:
                sys.stderr.write(f"ERROR {arx_id}: {e}\n")
            else:
                time.sleep(1.0 * attempt)  # backoff, only between attempts
    return False


# Write the header for a new manifest, or one left empty by an aborted run.
write_header = not MANIFEST.exists() or MANIFEST.stat().st_size == 0

# The context manager guarantees the manifest is flushed and closed even if
# the loop is interrupted or raises.
with open(MANIFEST, "a", newline="", encoding="utf-8") as mf:
    w = csv.DictWriter(mf, fieldnames=[
        "filename", "arxiv_id", "pdf_url", "primary_category", "categories", "status"
    ])
    if write_header:
        w.writeheader()

    with open_lines(JSONL_PATH) as f:
        for line in f:
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                errors += 1
                continue
            if not isinstance(row, dict):
                # Valid JSON but not a record (e.g. a bare list or number).
                errors += 1
                continue

            arx_id = (row.get("id") or row.get("paper_id") or "").strip()
            # "or ''" also covers an explicit JSON null; strip() keeps a
            # whitespace-only value from being treated as a real category list.
            cats = (row.get("categories") or "").strip()
            if not arx_id:
                skipped += 1
                continue

            if SUBJECT_FILTER and not _matches_subject(cats, SUBJECT_FILTER):
                skipped += 1
                continue

            url = pdf_url_from_id(arx_id)
            out_file = OUT_DIR / f"{safe_filename(arx_id)}.pdf"
            if out_file.exists() and out_file.stat().st_size > 0:
                # Already on disk from an earlier run: record it, no network call.
                w.writerow(_manifest_row(out_file, arx_id, url, cats, "exists"))
                skipped += 1
                continue

            if _download_pdf(url, out_file, arx_id):
                w.writerow(_manifest_row(out_file, arx_id, url, cats, "downloaded"))
                downloaded += 1
            else:
                w.writerow(_manifest_row(out_file, arx_id, url, cats, "error"))
                errors += 1

            # The limit counts successful downloads only.
            if MAX_PAPERS and downloaded >= MAX_PAPERS:
                break

            time.sleep(RATE_SECONDS)

print(f"done: downloaded={downloaded} skipped={skipped} errors={errors} → {OUT_DIR}")
print(f"manifest: {MANIFEST}")


### Randomly merge documents to test separation model