In [101]:
from __future__ import annotations

import datetime as dt
import os
import re
import textwrap
import time
from pathlib import Path
from typing import Dict, List

import arxiv  # arxiv API wrapper
import openai
import requests
from bs4 import BeautifulSoup  # type: ignore
from openai import OpenAI

# ── LLM client ──────────────────────────────────────────────────────────
# The API key is read from the environment so that no credential is stored in
# the notebook. Any key that was previously committed in this cell should be
# treated as leaked and revoked.
_api_key = os.environ.get("SILICONFLOW_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Environment variable SILICONFLOW_API_KEY is not set; "
        "export it before running this notebook."
    )

# SiliconFlow exposes an OpenAI-compatible endpoint, hence the custom base_url.
client = openai.OpenAI(
    api_key=_api_key,
    base_url="https://api.siliconflow.cn/v1",
)

MODEL = "Qwen/Qwen2.5-32B-Instruct"


# ── Pipeline configuration ──────────────────────────────────────────────
ARXIV_NEW_URL = "https://arxiv.org/list/astro-ph/new"
BATCH_SIZE_API = 20      # arXiv API batch size
BATCH_SIZE_SUM = 10      # #abstracts per summarisation call
MAX_RETRIES = 3          # API retries
RETRY_WAIT = 4           # seconds between retries
MAX_PAPERS = 20          # cap on scraped listing entries passed to the classifier

# Alternative, broader topic kept for reference:
# TOPIC = "Star formation / Planet formation: including star formation in the Milky Way, and the formation of protostellar/protoplanetary disks, planetary systems around stars. Both theoretical and observational papers."

TOPIC = "binary star"

# Default output location; the final cell assigns OUTPUT_DIR again before writing.
OUTPUT_DIR = './'

In [None]:
# System prompt for the title classifier: forces a bare Yes/No answer so the
# reply can be parsed by checking its first character.
classify_sysprompt = (
    "You are an expert astrophysics librarian. Your job is to determine whether a "
    "scientific paper is related to a given research topic based solely on its title.\n\n"
    "Instructions:\n"
    "- Use your domain knowledge in astrophysics to make a judgment.\n"
    "- Only respond with \"Yes\" or \"No\".\n"
    "- Be conservative: if the relation is unclear or indirect, respond \"No\".\n"
    "- Do not explain or elaborate."
)

# User prompt template for the classifier; filled with str.format(TOPIC=..., title=...).
classify_userprompt_tpl = (
    "Topic: {TOPIC}\nTitle: {title}\nDoes the title belong to the topic above?"
)

# System prompt for the summariser (English abstracts in, Chinese summaries out).
summarize_sysprompt = (
    "You are an expert assistant for academic summarization and translation, "
    "specializing in astrophysics papers. \n"
    "Your task is to read English abstracts and generate accurate, concise, and "
    "objective Chinese summaries."
)

# Header prepended to the batch of abstracts. The output format spells out the
# "关键词：" line explicitly because the markdown builder splits each summary on
# that marker to separate the keywords from the summary body.
summarize_userprompt_header = (
    "Summarize the following astrophysics paper abstracts in Chinese. For each "
    "abstract, produce a 4‑6 sentence summary, and extract 3-4 keywords, following these rules:\n"
    "1. The summary must remain strictly objective. Do not include any subjective "
    "opinions or speculative phrases such as “对……研究具有重要意义” or “为……提供了重要参考”.\n"
    "2. Avoid generic or formulaic language; use concise and precise academic Chinese.\n"
    "3. Ensure key methods, results, and research subjects are not omitted.\n\n"
    "Return your answer in the format below, one block per abstract, repeating each "
    "arXiv ID exactly as given and using \"###\" only for these headings:\n"
    "### <arXiv ID>\n<summary>\n关键词：<keyword 1>、<keyword 2>、<keyword 3>\n"
)
# ───────────────────────────────────────────────

# ╭───────────────────────── helper functions ──────────────────────────╮

def fetch_new_submission_entries(url: str = ARXIV_NEW_URL) -> List[Dict[str, str]]:
    """Scrape the *New submissions* section of an arXiv listing page.

    Args:
        url: Listing page to download (defaults to ``ARXIV_NEW_URL``).

    Returns:
        A list of ``{"id": <arXiv ID>, "title": <title text>}`` dicts in page
        order. Collection stops once ``MAX_PAPERS`` entries have been gathered.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        RuntimeError: If the "New submissions" header or its ``<dl>`` container
            cannot be found in the page.
    """
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    header = soup.find("h3", string=lambda s: s and "new submissions" in s.lower())
    if not header:
        raise RuntimeError("Cannot locate 'New submissions' header – page structure changed.")

    # The header may sit inside the <dl> or directly before it; try both.
    dl = header.find_parent("dl") or header.find_next("dl")
    if not dl:
        raise RuntimeError("Cannot find <dl> container for new submissions.")

    id_regex = re.compile(r"\d{4}\.\d{4,5}")
    # Only the leading "Title:" label is removed; a blanket str.replace would
    # also delete the text "Title:" when it appears inside the paper title itself.
    title_label = re.compile(r"^\s*Title:\s*")
    entries: List[Dict[str, str]] = []

    # Each paper is a <dt> (links, incl. /abs/<id>) followed by a <dd> (metadata).
    dt_tags = dl.find_all("dt", recursive=False)
    dd_tags = dl.find_all("dd", recursive=False)
    for dt_tag, dd_tag in zip(dt_tags, dd_tags):
        link = dt_tag.find("a", href=re.compile(r"/abs/"))
        if not (link and link["href"]):
            continue
        m = id_regex.search(link["href"])
        if not m:
            continue
        arxiv_id = m.group(0)

        title_div = dd_tag.find("div", class_=re.compile(r"list-title"))
        if not title_div:
            continue
        title_text = title_div.get_text(separator=" ", strip=True)
        title_text = title_label.sub("", title_text, count=1).strip()

        entries.append({"id": arxiv_id, "title": title_text})
        if len(entries) >= MAX_PAPERS:
            break
    return entries


def classify_title(title: str) -> bool:
    """Ask the LLM whether ``title`` belongs to ``TOPIC``.

    Sends the classifier system prompt plus a user prompt built from
    ``classify_userprompt_tpl`` and interprets the reply as Yes/No.

    Args:
        title: Paper title to classify.

    Returns:
        True when the reply starts with "y" (case-insensitive) after leading
        whitespace, quotes and markdown emphasis are stripped; False otherwise,
        including when the model returns no text at all.
    """
    user_prompt = classify_userprompt_tpl.format(TOPIC=TOPIC, title=title)
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": classify_sysprompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,  # deterministic verdicts
    )
    # `content` can be None (e.g. empty completion); treat that as "No"
    # instead of crashing on None.strip().
    answer = resp.choices[0].message.content or ""
    # Tolerate replies such as '"Yes"' or '**Yes**'.
    answer = answer.strip().lstrip("\"'`*_ ")
    return answer.lower().startswith("y")


def classify_titles(entries: List[Dict[str, str]]) -> List[str]:
    """Return the IDs of all entries whose title the classifier accepts.

    Entries are evaluated one by one, in order, via ``classify_title``; the
    result preserves the input order.
    """
    return [item["id"] for item in entries if classify_title(item["title"])]


def get_metadata_for_ids(ids: List[str]) -> List[arxiv.Result]:
    """Look up arXiv metadata for ``ids`` in chunks of ``BATCH_SIZE_API``.

    Each chunk is attempted up to ``MAX_RETRIES`` times; the pause between
    attempts grows linearly (``RETRY_WAIT`` × number of failures so far). A
    chunk that never succeeds is reported with a warning and skipped, so the
    returned list may be shorter than ``ids``.
    """
    if not ids:
        return []

    api = arxiv.Client()
    collected: List[arxiv.Result] = []
    for offset in range(0, len(ids), BATCH_SIZE_API):
        chunk = ids[offset : offset + BATCH_SIZE_API]
        attempts_left = MAX_RETRIES
        while attempts_left:
            try:
                query = arxiv.Search(id_list=chunk, max_results=len(chunk))
                fetched = list(api.results(query))
            except Exception as err:
                attempts_left -= 1
                if attempts_left:
                    pause = RETRY_WAIT * (MAX_RETRIES - attempts_left)
                    print(f"[INFO] Retry in {pause}s … ({attempts_left} left)")
                    time.sleep(pause)
                else:
                    print(f"[WARN] Failed batch {chunk}: {err}")
            else:
                # Only reached when the whole chunk was fetched successfully.
                collected.extend(fetched)
                break
    return collected


def _normalise_arxiv_id(raw: str) -> str:
    """Reduce an ID-like string to a bare arXiv ID (no version suffix or decoration)."""
    match = re.search(r"\d{4}\.\d{4,5}", raw)
    if match:
        return match.group(0)
    # Fallback for identifiers that are not in NNNN.NNNNN form: drop wrapping
    # punctuation and a trailing version tag.
    return re.sub(r"v\d+$", "", raw.strip().strip("<>[]()`*: "))


def _split_summary_blocks(text: str) -> List[tuple]:
    """Split an LLM reply into (header, body) pairs, one per line-leading '###' heading."""
    # Element 0 is whatever precedes the first heading (preamble) and is dropped.
    chunks = re.split(r"^[ \t]*###(?!#)[ \t]*", text, flags=re.MULTILINE)[1:]
    blocks = []
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            continue
        header, sep, body = chunk.partition("\n")
        # A heading with nothing below it: keep the heading text as the body.
        blocks.append((header.strip(), body.strip() if sep else header.strip()))
    return blocks


def batch_summarise(papers: List[arxiv.Result]) -> List[str]:
    """Summarise paper abstracts with the LLM, ``BATCH_SIZE_SUM`` per request.

    The reply is expected to consist of ``### <arXiv ID>`` headings, each
    followed by the summary. Blocks are matched back to papers by arXiv ID so
    that a preamble, a skipped paper or reordered output cannot shift summaries
    onto the wrong paper. If the IDs cannot be matched but the number of blocks
    equals the batch size, the blocks are assigned positionally.

    Args:
        papers: Papers whose ``summary`` (abstract) should be summarised.

    Returns:
        A list with exactly ``len(papers)`` entries, aligned with ``papers``.
        A paper for which no summary could be recovered gets an empty string
        (and a warning is printed).
    """
    summaries: List[str] = []
    for start in range(0, len(papers), BATCH_SIZE_SUM):
        batch = papers[start : start + BATCH_SIZE_SUM]
        # Build one user prompt containing every abstract in the batch.
        parts = [f"### {p.get_short_id()}\n{p.summary.strip()}" for p in batch]
        user_prompt = summarize_userprompt_header + "\n\n" + "\n\n".join(parts)

        resp = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": summarize_sysprompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.3,
        )
        # `content` may be None for an empty completion.
        text = (resp.choices[0].message.content or "").strip()

        blocks = _split_summary_blocks(text)
        by_id = {}
        for header, body in blocks:
            # First block wins if the model repeats an ID.
            by_id.setdefault(_normalise_arxiv_id(header), body)
        positional_ok = len(blocks) == len(batch)

        for idx, paper in enumerate(batch):
            key = _normalise_arxiv_id(paper.get_short_id())
            if key in by_id:
                summaries.append(by_id[key])
            elif positional_ok:
                summaries.append(blocks[idx][1])
            elif not blocks and len(batch) == 1 and text:
                # Single paper and the model skipped the heading: use the whole reply.
                summaries.append(text)
            else:
                print(f"[WARN] No summary returned for {paper.get_short_id()}")
                summaries.append("")
    return summaries


def build_markdown(papers: List[arxiv.Result], summaries: List[str], date: dt.date) -> str:
    """Render the daily digest as a markdown document.

    Args:
        papers: Papers to list; each must expose ``title``, ``entry_id`` and
            ``authors`` (objects with a ``name`` attribute).
        summaries: Summary text per paper, aligned with ``papers``. A trailing
            "关键词：" / "关键词:" section, if present, is rendered separately
            as the keyword line. Papers without a corresponding summary are
            still listed, with an empty summary body.
        date: Date printed under the document title.

    Returns:
        The markdown text. When ``papers`` is empty, the document contains a
        "no matching papers" note instead of entries.
    """
    # Accept both the full-width and the ASCII colon after the keyword label.
    keyword_marker = re.compile(r"关键词\s*[：:]")

    lines = [f"# Daily Planet 行星日报 \n{date}\n"]
    if not papers:
        lines.append("*(No matching papers today.)*\n")
    for idx, paper in enumerate(papers):
        # Do not drop a paper just because its summary is missing.
        summ = (summaries[idx] if idx < len(summaries) else "") or ""
        authors = ", ".join(a.name for a in paper.authors)
        # Collapse line breaks / runs of spaces so the title stays on the
        # single heading line and the markdown link is not broken.
        title = " ".join(paper.title.split())

        # Split the summary body from the keyword list at the LAST marker.
        markers = list(keyword_marker.finditer(summ))
        if markers:
            last = markers[-1]
            summary_body = summ[: last.start()].strip()
            keywords = summ[last.end():].strip()
        else:
            summary_body = summ.strip()
            keywords = ""

        lines.append(
            f"### [{title}]({paper.entry_id})\n"
            f"**Authors**: {authors}\n\n"
            f"**摘要**:\n{summary_body}\n"
        )
        if keywords:
            lines.append(f"**关键词**: {keywords}\n")
    return "\n".join(lines)

In [102]:
# Scrape the arXiv listing at ARXIV_NEW_URL; the helper stops after MAX_PAPERS entries.
entries = fetch_new_submission_entries()


In [103]:
# Classify each scraped title against TOPIC (one LLM call per title) and keep the matching arXiv IDs.
matched_ids = classify_titles(entries)


In [104]:
# Fetch arXiv metadata for the matched IDs (batched API calls with retries).
papers = get_metadata_for_ids(matched_ids)


In [116]:
# Summarise the matched papers' abstracts, BATCH_SIZE_SUM abstracts per LLM call.
summaries = batch_summarise(papers)

In [129]:
# Render today's digest and write it to <OUTPUT_DIR>/astro_ph_<date>.md.
# The module is imported as `dt`, so the bare name `datetime` is undefined on a
# fresh kernel; dt.date.today() yields the same local date.
target_date = dt.date.today()
OUTPUT_DIR = './astro_ph_daily_picks'  # overrides the default set in the config cell
md = build_markdown(papers, summaries, target_date)

out_dir = Path(OUTPUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)  # create the folder on first run
outfile = out_dir / f"astro_ph_{target_date}.md"
outfile.write_text(md, encoding="utf-8")
print(f"[INFO] Wrote {outfile} with {len(summaries)} summaries → {outfile}")

[INFO] Wrote astro_ph_daily_picks/astro_ph_2025-05-27.md with 3 summaries → astro_ph_daily_picks/astro_ph_2025-05-27.md
