In [None]:
import calendar, time, requests, feedparser, pandas as pd
from pathlib import Path

# Destination CSV for the harvested records (relative path).
OUT = Path("../data/raw/physics_condmat_2005_2025.csv")

def fetch_arxiv(query, start=0, max_results=1000, sortBy="submittedDate", sortOrder="ascending",
                *, retries=4, timeout=30):
    """Fetch one page of arXiv API results and return the parsed Atom feed.

    Args:
        query: Value sent as the ``search_query`` parameter.
        start: Zero-based offset of the first result to return.
        max_results: Number of results requested for this page.
        sortBy: Value sent as the ``sortBy`` parameter.
        sortOrder: Value sent as the ``sortOrder`` parameter.
        retries: Total number of HTTP attempts before giving up (at least 1).
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The object produced by ``feedparser.parse`` for the response body.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        requests.RequestException: If the final attempt fails (connection
            error, timeout, or non-2xx status).
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    url = "https://export.arxiv.org/api/query"
    params = {
        "search_query": query,
        "start": start,
        "max_results": max_results,
        "sortBy": sortBy,
        "sortOrder": sortOrder,
    }
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, params=params, timeout=timeout)
            r.raise_for_status()
        except requests.RequestException:
            # Only transport/HTTP failures are worth retrying; anything else
            # is a programming error and should surface immediately.
            if attempt == retries:
                raise
            time.sleep(2 * attempt)  # linear backoff: 2 s, 4 s, 6 s, ...
        else:
            return feedparser.parse(r.text)

def month_range(year, month):
    """Return the first and last minute of a month as ``YYYYMMDDHHMM`` strings.

    The pair is ``(<year><month>010000, <year><month><last day>2359)``; the
    number of days in the month comes from ``calendar.monthrange``.
    """
    _, days_in_month = calendar.monthrange(year, month)
    prefix = f"{year}{month:02d}"
    return prefix + "010000", f"{prefix}{days_in_month:02d}2359"

# --- harvest configuration ---------------------------------------------------
PAGE_SIZE = 1000        # entries requested per API call
REQUEST_DELAY_S = 3     # pause after every call; arXiv asks for about 3 s between requests
MAX_EMPTY_RETRIES = 3   # re-requests of a page that came back empty although more results were reported
COLUMNS = ["arxiv_id", "title", "abstract", "published", "updated", "categories"]


def _arxiv_id(entry_url):
    """Return the identifier that follows ``/abs/`` in an entry URL.

    Old-style identifiers carry the archive as a prefix, e.g.
    ``cond-mat/0501001v1``. Keeping only the last path segment would make
    ``cond-mat/0501001`` and ``physics/0501001`` indistinguishable, and the
    final de-duplication would then silently drop one of the two papers.
    """
    _, sep, tail = entry_url.partition("/abs/")
    return tail if sep else entry_url.rsplit("/", 1)[-1]


def _reported_total(feed):
    """Return the feed's ``opensearch:totalResults`` as an int, or 0 if absent/invalid."""
    try:
        return int(feed.feed.get("opensearch_totalresults"))
    except (TypeError, ValueError):
        return 0


def _entry_to_row(e):
    """Flatten one feed entry into a dict with the keys listed in ``COLUMNS``."""
    # Skip tags without a usable term so that the join below cannot fail.
    terms = [t.get("term") for t in e.get("tags", [])]
    return {
        "arxiv_id": _arxiv_id(e.id),
        "title": e.title.strip().replace("\n", " ") if "title" in e else "",
        "abstract": e.summary.strip().replace("\n", " ") if "summary" in e else "",
        "published": getattr(e, "published", None),
        "updated": getattr(e, "updated", None),
        "categories": " ".join(t for t in terms if t),
    }


rows = []
for year in range(2005, 2026):
    print(f"Year {year}")
    # Fetch month by month: year-sized queries sometimes failed.
    for month in range(1, 13):
        start_dt, end_dt = month_range(year, month)
        # Adjust the categories here to harvest a different field.
        base_query = f"(cat:physics.* OR cat:cond-mat.*) AND submittedDate:[{start_dt} TO {end_dt}]"
        start = 0
        total_m = 0
        expected = 0      # largest totalResults reported so far for this month (0 = unknown)
        empty_tries = 0
        while True:
            feed = fetch_arxiv(query=base_query, start=start, max_results=PAGE_SIZE)
            time.sleep(REQUEST_DELAY_S)  # polite pacing
            expected = max(expected, _reported_total(feed))

            if not feed.entries:
                # An empty page while results are still outstanding is treated
                # as a transient API hiccup rather than the end of the month;
                # stopping here would truncate the month at a page boundary.
                if start < expected and empty_tries < MAX_EMPTY_RETRIES:
                    empty_tries += 1
                    time.sleep(REQUEST_DELAY_S * empty_tries)
                    continue
                break

            empty_tries = 0
            rows.extend(_entry_to_row(e) for e in feed.entries)
            got = len(feed.entries)
            total_m += got
            # Advance by what was actually received, even if it is less than PAGE_SIZE.
            start += got
            if expected and start >= expected:
                break  # everything reported has been collected; skip the extra empty request

        if total_m:
            print(f"  {year}-{month:02d}: {total_m} records")
        if total_m < expected:
            print(f"  WARNING {year}-{month:02d}: got {total_m} of {expected} reported records")

# Passing the column list keeps "arxiv_id" present even when nothing was fetched.
df = pd.DataFrame(rows, columns=COLUMNS).drop_duplicates("arxiv_id")
OUT.parent.mkdir(parents=True, exist_ok=True)  # do not lose a long harvest to a missing directory
df.to_csv(OUT, index=False)
print("Saved:", OUT, "with", len(df), "rows")


Year 2005
  2005-01: 1003 records
  2005-02: 899 records
  2005-03: 1121 records
  2005-04: 1115 records
  2005-05: 1067 records
  2005-06: 1137 records
  2005-07: 1062 records
  2005-08: 1070 records
  2005-09: 1000 records
  2005-10: 1207 records
  2005-11: 1000 records
  2005-12: 1099 records
Year 2006
  2006-01: 1042 records
  2006-02: 969 records
  2006-03: 1223 records
  2006-04: 1002 records
  2006-05: 1104 records
  2006-06: 1177 records
  2006-07: 1218 records
  2006-08: 1144 records
  2006-09: 1157 records
  2006-10: 1263 records
  2006-11: 1224 records
  2006-12: 1070 records
Year 2007
  2007-01: 1195 records
  2007-02: 1039 records
  2007-03: 1237 records
  2007-04: 1081 records
  2007-05: 1240 records
  2007-06: 1000 records
  2007-07: 1262 records
  2007-08: 1165 records
  2007-09: 1280 records
  2007-10: 1291 records
  2007-11: 1195 records
  2007-12: 1115 records
Year 2008
  2008-01: 1262 records
  2008-02: 1084 records
  2008-03: 1267 records
  2008-04: 1000 records
  