In [11]:
# -*- coding: utf-8 -*-
"""
点云 + 深度学习 论文自动检索（终极稳定版）
支持：
  • Google Scholar  → SerpApi（防封、稳定）
  • arXiv          → 官方 API
输出：CSV + BibTeX（高被引优先）
"""

import os
import time
import random
import pandas as pd
from datetime import datetime

# ---------- API ----------
import requests
import feedparser

# ---------- 解析 ----------
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase

# ==================== 配置区 ====================

# SerpApi Key
SERPAPI_KEY = "63eff930ec8b25bae49e14f4d4fd88bd19ab60d118cbf644e63082814710aede"

QUERIES = {
    "scholar": [
        '"point cloud" "deep learning" (classification OR segmentation OR registration)',
        '"point cloud" "PointNet" OR "PointNet++" OR "RandLA-Net" OR "Point Transformer"'
    ],
    "arxiv": [
        "point cloud deep learning",
        "PointNet OR PointNet++ OR RandLA-Net OR Point Transformer"
    ]
}

MIN_YEAR = 2016
MIN_CITATIONS_SCHOLAR = 50
MAX_PAGES = {"scholar": 3, "arxiv": 3}
OUTPUT_DIR = "retrieved_papers"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ===============================================

def random_sleep(min_sec=2, max_sec=5):
    time.sleep(random.uniform(min_sec, max_sec))


# ------------------- Google Scholar (SerpApi) -------------------
def crawl_google_scholar_api(query, max_pages=3, api_key=None):
    if not api_key or not api_key.strip():
        print("SerpApi Key 无效或未提供，跳过 Google Scholar")
        return []

    papers = []
    base_url = "https://serpapi.com/search"

    for page in range(max_pages):
        params = {
            "engine": "google_scholar",
            "q": query,
            "hl": "en",
            "start": page * 10,
            "num": 10,
            "as_ylo": MIN_YEAR,
            "api_key": api_key
        }

        try:
            resp = requests.get(base_url, params=params, timeout=30)
            data = resp.json()

            if "organic_results" not in data:
                print(f"  第 {page+1} 页无结果或 API 限流")
                break

            page_count = 0
            for item in data["organic_results"]:
                try:
                    title = item.get("title", "Unknown Title")
                    link = item.get("link", "")
                    snippet = item.get("snippet", "")

                    # ---------- 引用数 ----------
                    cited = 0
                    inline = item.get("inline_links", {})
                    if inline.get("cited_by"):
                        txt = inline["cited_by"].get("total", "0")
                        if txt:
                            cited = int("".join(filter(str.isdigit, str(txt))))

                    # ---------- 年份（彻底防御性） ----------
                    year = None
                    pub_info = item.get("publication_info", {}).get("summary", "")

                    # 防御：pub_info 可能是 int、None、dict 等
                    if pub_info is None:
                        pub_info = ""
                    elif isinstance(pub_info, (int, float)):
                        pub_info = str(int(pub_info))
                    elif isinstance(pub_info, dict):
                        pub_info = pub_info.get("summary", "") or ""
                    elif not isinstance(pub_info, str):
                        pub_info = str(pub_info)

                    # 提取年份
                    for token in pub_info.replace(",", " ").split():
                        token = token.strip()
                        if token.isdigit() and 1900 < int(token) < 2100:
                            year = int(token)
                            break

                    # ---------- 过滤条件 ----------
                    if year and year >= MIN_YEAR and cited >= MIN_CITATIONS_SCHOLAR:
                        # 作者与期刊分离
                        parts = str(pub_info).split(" - ", 1)
                        authors = parts[0].strip() if len(parts) > 0 else "Unknown"
                        venue = parts[1].strip() if len(parts) > 1 else ""

                        papers.append({
                            "title": title,
                            "authors": authors,
                            "year": year,
                            "venue": venue,
                            "citations": cited,
                            "abstract": snippet[:500],
                            "pdf_link": link,
                            "source": "Google Scholar"
                        })
                        page_count += 1

                except Exception as e:
                    print(f"  解析单篇失败: {e}")
                    continue

            print(f"  第 {page+1} 页抓取 {len(data['organic_results'])} 篇，成功添加 {page_count} 篇，累计 {len(papers)} 篇")
            time.sleep(2)

        except Exception as e:
            print(f"  SerpApi 请求异常: {e}")
            break

    return papers


# ------------------- arXiv API -------------------
def crawl_arxiv_api(query, max_pages=3, max_results_per_page=50):
    papers = []
    base_url = "http://export.arxiv.org/api/query"
    start = 0

    for page in range(max_pages):
        params = {
            "search_query": f"all:{query}",
            "start": start,
            "max_results": max_results_per_page,
            "sortBy": "submittedDate",
            "sortOrder": "descending"
        }

        try:
            resp = requests.get(base_url, params=params, timeout=15)
            if resp.status_code != 200:
                print(f"  arXiv API 错误: {resp.status_code}")
                break

            feed = feedparser.parse(resp.content)
            if not feed.entries:
                print(f"  第 {page+1} 页无结果")
                break

            page_count = 0
            for entry in feed.entries:
                try:
                    title = entry.get("title", "Unknown").strip()
                    authors = ", ".join([a.get("name", "") for a in entry.get("authors", []) if a.get("name")]) or "Unknown"
                    abstract = (entry.get("summary", "") or "").replace("\n", " ")[:500]
                    abs_link = entry.get("link", "")
                    pdf_link = abs_link.replace("/abs/", "/pdf/") + ".pdf" if "/abs/" in abs_link else ""

                    # 年份
                    year = None
                    for field in ("published", "updated"):
                        d = entry.get(field, "")
                        if d and len(d) >= 4:
                            try:
                                year = int(d[:4])
                                break
                            except ValueError:
                                continue
                    if not year or year < 1900:
                        year = 2025

                    if year >= MIN_YEAR:
                        papers.append({
                            "title": title,
                            "authors": authors,
                            "year": year,
                            "venue": "arXiv",
                            "citations": 0,
                            "abstract": abstract,
                            "pdf_link": pdf_link,
                            "source": "arXiv"
                        })
                        page_count += 1

                except Exception as e:
                    print(f"  解析 arXiv 单篇失败: {e}")
                    continue

            print(f"  第 {page+1} 页抓取 {len(feed.entries)} 条，成功添加 {page_count} 篇，累计 {len(papers)} 篇")
            start += max_results_per_page
            time.sleep(3)

        except Exception as e:
            print(f"  arXiv 请求异常: {e}")
            break

    return papers


# ------------------- 主函数 -------------------
def main():
    all_papers = []

    # ---------- Google Scholar ----------
    print("开始检索 Google Scholar...")
    for q in QUERIES["scholar"]:
        print(f"  查询: {q}")
        all_papers.extend(crawl_google_scholar_api(q, MAX_PAGES["scholar"], SERPAPI_KEY))

    # ---------- arXiv ----------
    print("\n开始检索 arXiv...")
    for q in QUERIES["arxiv"]:
        print(f"  查询: {q}")
        all_papers.extend(crawl_arxiv_api(q, MAX_PAGES["arxiv"]))

    # ---------- 字段统一 ----------
    clean = []
    for p in all_papers:
        try:
            clean.append({
                "title": str(p.get("title", "Unknown Title")),
                "authors": str(p.get("authors", "Unknown")),
                "year": int(p.get("year") or 0),
                "venue": str(p.get("venue", "Unknown")),
                "citations": int(p.get("citations") or 0),
                "abstract": str(p.get("abstract", ""))[:500],
                "pdf_link": str(p.get("pdf_link", "")),
                "source": str(p.get("source", "Unknown"))
            })
        except Exception as e:
            print(f"  字段清洗失败: {e}")
            continue

    if not clean:
        print("警告：未抓取到任何有效论文！")
        return

    # ---------- 数据处理 ----------
    df = pd.DataFrame(clean)
    df.drop_duplicates(subset=["title"], inplace=True)
    df = df[df["year"] >= MIN_YEAR]
    df.sort_values(by=["citations", "year"], ascending=[False, False], inplace=True)

    # ---------- 保存 ----------
    ts = datetime.now().strftime("%Y%m%d")
    csv_path = os.path.join(OUTPUT_DIR, f"pointcloud_dl_papers_{ts}.csv")
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"\nCSV 保存: {csv_path}，共 {len(df)} 篇")

    # ---------- BibTeX ----------
    db = BibDatabase()
    db.entries = []
    for _, row in df.iterrows():
        author_str = row["authors"].replace(", ", " and ")
        first = "".join(c for c in row["authors"].split(",")[0].lower() if c.isalnum()) or "unknown"

        entry = {
            "ENTRYTYPE": "misc",
            "ID": f"{first}{row['year']}",
            "title": row["title"],
            "author": author_str,
            "year": str(row["year"]),
            "url": row["pdf_link"],
            "note": f"[{row['source']}] Citations: {row['citations']}"
        }
        if "arxiv" in row["source"].lower():
            entry["howpublished"] = f"\\url{{{row['pdf_link']}}}"
        else:
            entry["journal"] = row["venue"]
        db.entries.append(entry)

    bib_path = os.path.join(OUTPUT_DIR, f"pointcloud_dl_papers_{ts}.bib")
    with open(bib_path, "w", encoding="utf-8") as f:
        writer = BibTexWriter()
        writer.indent = "    "
        f.write(writer.write(db))
    print(f"BibTeX 保存: {bib_path}")


if __name__ == "__main__":
    main()

开始检索 Google Scholar...
  查询: "point cloud" "deep learning" (classification OR segmentation OR registration)
  第 1 页抓取 10 篇，成功添加 7 篇，累计 7 篇
  第 2 页抓取 10 篇，成功添加 6 篇，累计 13 篇
  第 3 页抓取 10 篇，成功添加 4 篇，累计 17 篇
  查询: "point cloud" "PointNet" OR "PointNet++" OR "RandLA-Net" OR "Point Transformer"
  第 1 页抓取 10 篇，成功添加 7 篇，累计 7 篇
  第 2 页抓取 10 篇，成功添加 4 篇，累计 11 篇
  第 3 页抓取 10 篇，成功添加 3 篇，累计 14 篇

开始检索 arXiv...
  查询: point cloud deep learning
  第 1 页抓取 10 条，成功添加 10 篇，累计 10 篇
  第 2 页无结果
  查询: PointNet OR PointNet++ OR RandLA-Net OR Point Transformer
  第 1 页抓取 50 条，成功添加 50 篇，累计 50 篇
  第 2 页抓取 50 条，成功添加 50 篇，累计 100 篇
  第 3 页抓取 50 条，成功添加 50 篇，累计 150 篇

CSV 保存: retrieved_papers\pointcloud_dl_papers_20251026.csv，共 189 篇
BibTeX 保存: retrieved_papers\pointcloud_dl_papers_20251026.bib


In [34]:
import os
import time
import random
import pandas as pd
from datetime import datetime
import requests
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
from tqdm import tqdm
import re
import zipfile
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders

# ==================== 配置区 ====================
SERPAPI_KEY = "63eff930ec8b25bae49e14f4d4fd88bd19ab60d118cbf644e63082814710aede"

# QUERIES = {
#     "scholar": [
#         '"point cloud" "deep learning" (classification OR segmentation OR registration)',
#         '"point cloud" "PointNet" OR "PointNet++" OR "RandLA-Net" OR "Point Transformer"'
#     ],
#     "arxiv": [
#         "point cloud deep learning",
#         "PointNet OR PointNet++ OR RandLA-Net OR Point Transformer"
#     ]
# }
# MIN_YEAR = 2016
# MIN_CITATIONS_SCHOLAR = 50
# MAX_PAGES = {"scholar": 3, "arxiv": 3}
# TOP_N = 50
QUERIES = {
    # ------------------- Google Scholar -------------------
    "scholar": [
        # 1. 核心关键词组合（必须出现）
        '"ICESat-2" ("point cloud" OR "photon cloud" OR lidar OR "laser altimetry")',
        '"ICESat-2" "deep learning" (classification OR segmentation OR registration OR denoising OR bathymetry)',

        # 2. 光子点云 + 深度学习
        '"photon point cloud" "deep learning" (classification OR segmentation OR denoising OR bathymetry)',

        # 3. 卫星激光点云处理
        '"satellite lidar" OR "spaceborne lidar" ("point cloud" OR photon) "deep learning"',

        # 4. 测深（bathymetry）相关
        '"ICESat-2" (bathymetry OR "water depth" OR "underwater topography") "deep learning"',

        # 5. 经典网络在 ICESat‑2 上的应用
        '"ICESat-2" ("PointNet" OR "PointNet++" OR "RandLA-Net" OR "Point Transformer" OR DGCNN OR KPConv)',

        # 6. 噪声去除 / 信号提取
        '"ICESat-2" (denoising OR "signal extraction" OR "photon classification") "deep learning"'
    ],

    # ------------------- arXiv -------------------
    "arxiv": [
        # 1. 基础关键词
        "ICESat-2 point cloud OR photon cloud OR lidar",
        "ICESat-2 deep learning classification OR segmentation OR bathymetry",

        # 2. 光子点云
        "photon point cloud deep learning",

        # 3. 卫星激光雷达
        "satellite lidar OR spaceborne lidar point cloud deep learning",

        # 4. 测深
        "ICESat-2 bathymetry OR water depth deep learning",

        # 5. 网络模型
        "ICESat-2 PointNet OR PointNet++ OR RandLA-Net OR Point Transformer",

        # 6. 去噪 / 分类
        "ICESat-2 denoising OR photon classification deep learning"
    ]
}
MIN_YEAR = 2018                  # ICESat-2 发射年份
MIN_CITATIONS_SCHOLAR = 10       # 放宽引用要求（新领域论文少）
MAX_PAGES = {"scholar": 5, "arxiv": 5}
TOP_N = 100                      # 保留更多候选

# 邮件配置（163 邮箱为例，推荐使用）
EMAIL_SENDER = "2117735297@qq.com"          # 替换成你的新 QQ 邮箱
EMAIL_PASSWORD = "ebqipkzjqifbfcfi"        # 替换成你的授权码
EMAIL_RECEIVER = "zhouyusen@nuaa.edu.cn"
# SMTP_SERVER = "smtp.163.com"
# SMTP_PORT = 465

OUTPUT_DIR = "retrieved_papers"
PDF_DIR = os.path.join(OUTPUT_DIR, "pdfs")
ZIP_PATH = os.path.join(OUTPUT_DIR, "pdfs.zip")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PDF_DIR, exist_ok=True)

# ===============================================

def random_sleep(min_sec=2, max_sec=5):
    time.sleep(random.uniform(min_sec, max_sec))

def safe_filename(name):
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name[:100]

def download_pdf(url, filepath, retries=3):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=30, stream=True)
            if resp.status_code == 200:
                with open(filepath, 'wb') as f:
                    for chunk in resp.iter_content(1024*1024):
                        f.write(chunk)
                return True
        except Exception as e:
            print(f"  [重试 {attempt+1}] {e}")
            time.sleep(2)
    return False

# ------------------- Google Scholar -------------------
def crawl_google_scholar_api(query, max_pages=3, api_key=None):
    if not api_key: return []
    papers = []
    base_url = "https://serpapi.com/search"
    for page in range(max_pages):
        params = {
            "engine": "google_scholar", "q": query, "hl": "en",
            "start": page * 10, "num": 10, "as_ylo": MIN_YEAR, "api_key": api_key
        }
        try:
            data = requests.get(base_url, params=params, timeout=30).json()
            if "organic_results" not in data: break
            for item in data["organic_results"]:
                try:
                    title = item.get("title", "Unknown")
                    link = item.get("link", "")
                    snippet = item.get("snippet", "")
                    cited = 0
                    if item.get("inline_links", {}).get("cited_by"):
                        txt = item["inline_links"]["cited_by"].get("total", "0")
                        cited = int("".join(filter(str.isdigit, str(txt)))) if txt else 0
                    year = None
                    pub_info = item.get("publication_info", {}).get("summary", "")
                    if isinstance(pub_info, (int, float)): pub_info = str(int(pub_info))
                    elif not isinstance(pub_info, str): pub_info = str(pub_info)
                    for token in pub_info.replace(",", " ").split():
                        if token.isdigit() and 1900 < int(token) < 2100:
                            year = int(token); break
                    if year and year >= MIN_YEAR and cited >= MIN_CITATIONS_SCHOLAR:
                        parts = pub_info.split(" - ", 1)
                        authors = parts[0].strip() if len(parts) > 0 else "Unknown"
                        venue = parts[1].strip() if len(parts) > 1 else ""
                        papers.append({
                            "title": title, "authors": authors, "year": year, "venue": venue,
                            "citations": cited, "abstract": snippet[:500], "pdf_link": link,
                            "source": "Google Scholar"
                        })
                except: continue
            print(f"  Scholar 第 {page+1} 页累计 {len(papers)} 篇")
            time.sleep(2)
        except: break
    return papers

# ------------------- arXiv -------------------
def crawl_arxiv_api(query, max_pages=3, api_key=None):
    if not api_key: return []
    papers = []
    base_url = "https://serpapi.com/search"
    for page in range(max_pages):
        params = {
            "engine": "google_scholar", "q": f"{query} source:arxiv", "hl": "en",
            "start": page * 10, "num": 10, "as_ylo": MIN_YEAR, "api_key": api_key
        }
        try:
            data = requests.get(base_url, params=params, timeout=30).json()
            if "organic_results" not in data: break
            for item in data["organic_results"]:
                try:
                    title = item.get("title", "Unknown")
                    link = item.get("link", "")
                    snippet = item.get("snippet", "")
                    cited = 0
                    if item.get("inline_links", {}).get("cited_by"):
                        txt = item["inline_links"]["cited_by"].get("total", "0")
                        cited = int("".join(filter(str.isdigit, str(txt)))) if txt else 0
                    year = None
                    pub_info = item.get("publication_info", {}).get("summary", "")
                    if isinstance(pub_info, (int, float)): pub_info = str(int(pub_info))
                    elif not isinstance(pub_info, str): pub_info = str(pub_info)
                    for token in pub_info.replace(",", " ").split():
                        if token.isdigit() and 1900 < int(token) < 2100:
                            year = int(token); break
                    if year and year >= MIN_YEAR and cited >= MIN_CITATIONS_SCHOLAR:
                        authors = "Unknown"
                        if pub_info:
                            parts = pub_info.split(" - ", 1)
                            authors = parts[0].strip() if len(parts) > 0 else "Unknown"
                        pdf_link = ""
                        if "arxiv.org/abs/" in link:
                            arxiv_id = link.split("/abs/")[-1].split("?")[0].split("#")[0]
                            pdf_link = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
                        elif "arxiv.org/pdf/" in link:
                            pdf_link = link
                        papers.append({
                            "title": title, "authors": authors, "year": year, "venue": "arXiv",
                            "citations": cited, "abstract": snippet[:500], "pdf_link": pdf_link or link,
                            "source": "arXiv"
                        })
                except: continue
            print(f"  arXiv 第 {page+1} 页累计 {len(papers)} 篇")
            time.sleep(2)
        except: break
    return papers

# ------------------- 邮件发送 -------------------
def send_email_with_attachments(csv_path, bib_path, zip_path):
    msg = MIMEMultipart()
    msg['From'] = EMAIL_SENDER
    msg['To'] = EMAIL_RECEIVER
    msg['Subject'] = f"【点云论文】Top {TOP_N} 高被引论文- {datetime.now().strftime('%Y%m%d')}"

    body = f"附件为 Top {TOP_N} 篇高被引论文。"
    msg.attach(MIMEText(body, 'plain', 'utf-8'))

    for file_path in [csv_path, bib_path, zip_path]:
        if not os.path.exists(file_path): continue
        with open(file_path, "rb") as f:
            part = MIMEBase('application', 'octet-stream')
            part.set_payload(f.read())
            encoders.encode_base64(part)
            part.add_header('Content-Disposition', f'attachment; filename= {os.path.basename(file_path)}')
            msg.attach(part)

    for attempt in range(3):
        try:
            print(f"正在使用 QQ 邮箱发送（第 {attempt+1} 次）...")
            server = smtplib.SMTP("smtp.qq.com", 587, timeout=20)
            server.ehlo()
            server.starttls()
            server.ehlo()
            server.login(EMAIL_SENDER, EMAIL_PASSWORD)
            server.sendmail(EMAIL_SENDER, EMAIL_RECEIVER, msg.as_string())
            server.quit()
            print(f"邮件发送成功！")
            return
        except Exception as e:
            print(f"失败: {e}")
            time.sleep(3)
    print("发送失败！请检查 QQ 邮箱授权码")

# ------------------- 主函数 -------------------
def main():
    all_papers = []
    print("开始检索 Google Scholar...")
    for q in QUERIES["scholar"]:
        print(f"  查询: {q}")
        all_papers.extend(crawl_google_scholar_api(q, MAX_PAGES["scholar"], SERPAPI_KEY))

    print("\n开始检索 arXiv...")
    for q in QUERIES["arxiv"]:
        print(f"  查询: {q}")
        all_papers.extend(crawl_arxiv_api(q, MAX_PAGES["arxiv"], SERPAPI_KEY))

    clean = []
    for p in all_papers:
        try:
            clean.append({
                "title": str(p.get("title", "Unknown")),
                "authors": str(p.get("authors", "Unknown")),
                "year": int(p.get("year") or 0),
                "venue": str(p.get("venue", "Unknown")),
                "citations": int(p.get("citations") or 0),
                "abstract": str(p.get("abstract", ""))[:500],
                "pdf_link": str(p.get("pdf_link", "")),
                "source": str(p.get("source", "Unknown"))
            })
        except: continue

    if not clean:
        print("未抓到论文！")
        return

    df = pd.DataFrame(clean).drop_duplicates(subset=["title"])
    df = df[df["year"] >= MIN_YEAR]
    df.sort_values(by=["citations", "year"], ascending=[False, False], inplace=True)
    df_top = df.head(TOP_N).copy()

    ts = datetime.now().strftime("%Y%m%d")
    csv_path = os.path.join(OUTPUT_DIR, f"pointcloud_dl_top{TOP_N}_{ts}.csv")
    bib_path = os.path.join(OUTPUT_DIR, f"pointcloud_dl_top{TOP_N}_{ts}.bib")
    df_top.to_csv(csv_path, index=False, encoding="utf-8-sig")

    # BibTeX
    db = BibDatabase()
    for _, row in df_top.iterrows():
        author_str = row["authors"].replace(", ", " and ")
        first = "".join(c for c in row["authors"].split(",")[0].lower() if c.isalnum()) or "unknown"
        entry = {
            "ENTRYTYPE": "misc", "ID": f"{first}{row['year']}", "title": row["title"],
            "author": author_str, "year": str(row["year"]), "url": row["pdf_link"],
            "note": f"[{row['source']}] Citations: {row['citations']}"
        }
        if "arxiv" in row["source"].lower():
            entry["howpublished"] = f"\\url{{{row['pdf_link']}}}"
        else:
            entry["journal"] = row["venue"]
        db.entries.append(entry)
    with open(bib_path, "w", encoding="utf-8") as f:
        f.write(BibTexWriter().write(db))

    # 下载 PDF
    print(f"\n开始下载 Top {TOP_N} 论文 PDF...")
    success = 0
    for _, row in tqdm(df_top.iterrows(), total=len(df_top), desc="下载 PDF"):
        url = row["pdf_link"]
        if not url: continue
        safe_title = safe_filename(row["title"])
        safe_auth = safe_filename(row["authors"].split(",")[0])
        filename = f"{row['year']}_{safe_auth}_{safe_title}.pdf"
        filepath = os.path.join(PDF_DIR, filename)
        if os.path.exists(filepath):
            success += 1
            continue
        if download_pdf(url, filepath):
            success += 1
        time.sleep(1)

    # # 打包 PDF
    with zipfile.ZipFile(ZIP_PATH, 'w', zipfile.ZIP_DEFLATED) as zf:
        for file in os.listdir(PDF_DIR):
            zf.write(os.path.join(PDF_DIR, file), file)

    # print(f"PDF 下载完成：{success}/{len(df_top)} 篇")

    # 发送邮件
    print(f"\n正在发送邮件到 {EMAIL_RECEIVER}...")
    send_email_with_attachments(csv_path, bib_path, ZIP_PATH)

if __name__ == "__main__":
    main()

开始检索 Google Scholar...
  查询: "ICESat-2" ("point cloud" OR "photon cloud" OR lidar OR "laser altimetry")
  Scholar 第 1 页累计 10 篇
  Scholar 第 2 页累计 19 篇
  Scholar 第 3 页累计 26 篇
  Scholar 第 4 页累计 33 篇
  Scholar 第 5 页累计 40 篇
  查询: "ICESat-2" "deep learning" (classification OR segmentation OR registration OR denoising OR bathymetry)
  Scholar 第 1 页累计 7 篇
  Scholar 第 2 页累计 12 篇
  Scholar 第 3 页累计 17 篇
  Scholar 第 4 页累计 21 篇
  Scholar 第 5 页累计 25 篇
  查询: "photon point cloud" "deep learning" (classification OR segmentation OR denoising OR bathymetry)
  Scholar 第 1 页累计 1 篇
  Scholar 第 2 页累计 1 篇
  Scholar 第 3 页累计 4 篇
  Scholar 第 4 页累计 8 篇
  Scholar 第 5 页累计 8 篇
  查询: "satellite lidar" OR "spaceborne lidar" ("point cloud" OR photon) "deep learning"
  Scholar 第 1 页累计 7 篇
  Scholar 第 2 页累计 11 篇
  Scholar 第 3 页累计 18 篇
  Scholar 第 4 页累计 19 篇
  Scholar 第 5 页累计 19 篇
  查询: "ICESat-2" (bathymetry OR "water depth" OR "underwater topography") "deep learning"
  Scholar 第 1 页累计 7 篇
  Scholar 第 2 页累计 12 篇
  Schola

下载 PDF: 100%|██████████| 100/100 [06:33<00:00,  3.93s/it]



正在发送邮件到 zhouyusen@nuaa.edu.cn...
正在使用 QQ 邮箱发送（第 1 次）...
失败: (552, b'Message too large. http://service.mail.qq.com/cgi-bin/help?subtype=1&&id=20022&&no=1000729', '2117735297@qq.com')
正在使用 QQ 邮箱发送（第 2 次）...
失败: (552, b'Message too large. http://service.mail.qq.com/cgi-bin/help?subtype=1&&id=20022&&no=1000729', '2117735297@qq.com')
正在使用 QQ 邮箱发送（第 3 次）...
失败: (552, b'Message too large. http://service.mail.qq.com/cgi-bin/help?subtype=1&&id=20022&&no=1000729', '2117735297@qq.com')
发送失败！请检查 QQ 邮箱授权码


In [31]:
# -*- coding: utf-8 -*-
import os
import time
import pandas as pd
from datetime import datetime
import requests
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
from tqdm import tqdm
import re
import zipfile
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders

# ==================== 配置区 ====================
SERPAPI_KEY = "63eff930ec8b25bae49e14f4d4fd88bd19ab60d118cbf644e63082814710aede"

QUERIES = {
    "scholar": ['"point cloud" "deep learning" (classification OR segmentation OR registration)'],
    "arxiv": ["point cloud deep learning"]
}

MIN_YEAR = 2016
MIN_CITATIONS_SCHOLAR = 50
MAX_PAGES = {"scholar": 1, "arxiv": 1}  # 测试用
TOP_N = 5  # 测试用

# 新 QQ 邮箱（必须）
EMAIL_SENDER = "2117735297@qq.com"          # 替换成你的新 QQ 邮箱
EMAIL_PASSWORD = "ebqipkzjqifbfcfi"        # 替换成你的授权码
EMAIL_RECEIVER = "zhouyusen@nuaa.edu.cn"

OUTPUT_DIR = "retrieved_papers"
PDF_DIR = os.path.join(OUTPUT_DIR, "pdfs")
ZIP_PATH = os.path.join(OUTPUT_DIR, "pdfs.zip")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PDF_DIR, exist_ok=True)

# ------------------- 邮件发送 -------------------
def send_email_with_attachments(csv_path, bib_path, zip_path):
    msg = MIMEMultipart()
    msg['From'] = EMAIL_SENDER
    msg['To'] = EMAIL_RECEIVER
    msg['Subject'] = f"【点云论文】Top {TOP_N} 高被引论文- {datetime.now().strftime('%Y%m%d')}"

    body = f"附件为 Top {TOP_N} 篇高被引论文。"
    msg.attach(MIMEText(body, 'plain', 'utf-8'))

    for file_path in [csv_path, bib_path, zip_path]:
        if not os.path.exists(file_path): continue
        with open(file_path, "rb") as f:
            part = MIMEBase('application', 'octet-stream')
            part.set_payload(f.read())
            encoders.encode_base64(part)
            part.add_header('Content-Disposition', f'attachment; filename= {os.path.basename(file_path)}')
            msg.attach(part)

    for attempt in range(3):
        try:
            print(f"正在使用 QQ 邮箱发送（第 {attempt+1} 次）...")
            server = smtplib.SMTP("smtp.qq.com", 587, timeout=20)
            server.ehlo()
            server.starttls()
            server.ehlo()
            server.login(EMAIL_SENDER, EMAIL_PASSWORD)
            server.sendmail(EMAIL_SENDER, EMAIL_RECEIVER, msg.as_string())
            server.quit()
            print(f"邮件发送成功！")
            return
        except Exception as e:
            print(f"失败: {e}")
            time.sleep(3)
    print("发送失败！请检查 QQ 邮箱授权码")

# ------------------- 主函数 -------------------
def main():
    ts = datetime.now().strftime("%Y%m%d")
    csv_path = os.path.join(OUTPUT_DIR, f"test_{ts}.csv")
    bib_path = os.path.join(OUTPUT_DIR, f"test_{ts}.bib")
    pd.DataFrame([{"test": 1}]).to_csv(csv_path, index=False)

    db = BibDatabase()
    db.entries = [{"ENTRYTYPE": "misc", "ID": "test", "title": "test"}]
    with open(bib_path, "w", encoding="utf-8") as f:
        f.write(BibTexWriter().write(db))

    with zipfile.ZipFile(ZIP_PATH, 'w') as zf:
        zf.write(csv_path, os.path.basename(csv_path))

    print(f"\n正在发送测试邮件到 {EMAIL_RECEIVER}...")
    send_email_with_attachments(csv_path, bib_path, ZIP_PATH)

if __name__ == "__main__":
    main()


正在发送测试邮件到 zhouyusen@nuaa.edu.cn...
正在使用 QQ 邮箱发送（第 1 次）...
邮件发送成功！


In [None]:
# -*- coding: utf-8 -*-
"""
ICESat-2 + 点云 + 深度学习 全平台检索器（终极版）
支持：Google Scholar / arXiv / Web of Science / CNKI / Scopus
输出：高亮 CSV + BibTeX + PDF + 邮件
"""

import os
import time
import pandas as pd
from datetime import datetime
import requests
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
from tqdm import tqdm
import re
import zipfile
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders

# ==================== 配置区 ====================
SERPAPI_KEY = "63eff930ec8b25bae49e14f4d4fd88bd19ab60d118cbf644e63082814710aede"

QUERIES = {
    "scholar": [
        '"ICESat-2" ("point cloud" OR "photon cloud" OR lidar OR "laser altimetry")',
        '"ICESat-2" "deep learning" (classification OR segmentation OR denoising OR bathymetry)',
        '"photon point cloud" "deep learning"',
        '"ICESat-2" (bathymetry OR "water depth") "deep learning"',
        '"ICESat-2" ("PointNet" OR "PointNet++" OR "RandLA-Net" OR "Point Transformer")'
    ],
    "arxiv": [
        "ICESat-2 point cloud OR photon cloud OR lidar",
        "ICESat-2 deep learning classification OR segmentation OR bathymetry",
        "photon point cloud deep learning",
        "ICESat-2 PointNet OR PointNet++ OR RandLA-Net"
    ],
    "wos": [
        "TS=(\"ICESat-2\" AND (\"point cloud\" OR \"photon cloud\" OR lidar))",
        "TS=(\"ICESat-2\" AND \"deep learning\" AND (classification OR segmentation OR bathymetry))",
        "TS=(\"photon point cloud\" AND \"deep learning\")",
        "TS=(\"ICESat-2\" AND (bathymetry OR \"water depth\"))"
    ],
    "cnki": [
        "SU='ICESat-2' AND (SU='点云' OR SU='光子点云' OR SU='激光雷达')",
        "SU='ICESat-2' AND SU='深度学习' AND (SU='分类' OR SU='分割' OR SU='测深')",
        "SU='光子点云' AND SU='深度学习'",
        "SU='ICESat-2' AND (SU='测深' OR SU='水深')"
    ],
    "scopus": [
        "TITLE-ABS-KEY ( \"ICESat-2\" AND ( \"point cloud\" OR \"photon cloud\" OR lidar ) )",
        "TITLE-ABS-KEY ( \"ICESat-2\" AND \"deep learning\" AND ( classification OR segmentation OR bathymetry ) )",
        "TITLE-ABS-KEY ( \"photon point cloud\" AND \"deep learning\" )",
        "TITLE-ABS-KEY ( \"ICESat-2\" AND ( bathymetry OR \"water depth\" ) )"
    ]
}

MIN_YEAR = 2018
MIN_CITATIONS_SCHOLAR = 10
MAX_PAGES = {"scholar": 3, "arxiv": 3, "wos": 2, "cnki": 2, "scopus": 2}
TOP_N = 100

# 关键词高亮（支持中英文）
HIGHLIGHT_KEYWORDS = [
    "ICESat-2", "point cloud", "photon cloud", "lidar", "deep learning",
    "classification", "segmentation", "bathymetry", "denoising",
    "PointNet", "PointNet++", "RandLA-Net", "Point Transformer",
    "点云", "光子点云", "激光雷达", "深度学习", "分类", "分割", "测深"
]

EMAIL_SENDER = "2117735297@qq.com"
EMAIL_PASSWORD = "ebqipkzjqifbfcfi"
EMAIL_RECEIVER = "zhouyusen@nuaa.edu.cn"

OUTPUT_DIR = "retrieved_papers"
PDF_DIR = os.path.join(OUTPUT_DIR, "pdfs")
ZIP_PATH = os.path.join(OUTPUT_DIR, "pdfs.zip")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PDF_DIR, exist_ok=True)

# ===============================================

def highlight_text(text, keywords):
    if not text or pd.isna(text): return text
    txt = str(text)
    for kw in keywords:
        txt = re.sub(f"({re.escape(kw)})", r"<b>\1</b>", txt, flags=re.IGNORECASE)
    return txt

def random_sleep(min_sec=2, max_sec=5):
    time.sleep(random.uniform(min_sec, max_sec))

def safe_filename(name):
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name[:100]

def download_pdf(url, filepath, retries=3):
    headers = {"User-Agent": "Mozilla/5.0"}
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=30, stream=True)
            if resp.status_code == 200:
                with open(filepath, 'wb') as f:
                    for chunk in resp.iter_content(1024*1024):
                        f.write(chunk)
                return True
        except Exception as e:
            print(f"  [重试 {attempt+1}] {e}")
            time.sleep(2)
    return False

# ------------------- Google Scholar -------------------
def crawl_google_scholar_api(query, max_pages=3, api_key=None):
    if not api_key: return []
    papers = []
    base_url = "https://serpapi.com/search"
    for page in range(max_pages):
        params = {
            "engine": "google_scholar", "q": query, "hl": "en",
            "start": page * 10, "num": 10, "as_ylo": MIN_YEAR, "api_key": api_key
        }
        try:
            data = requests.get(base_url, params=params, timeout=30).json()
            if "organic_results" not in data: break
            for item in data["organic_results"]:
                try:
                    title = item.get("title", "Unknown")
                    link = item.get("link", "")
                    snippet = item.get("snippet", "")
                    cited = 0
                    if item.get("inline_links", {}).get("cited_by"):
                        txt = item["inline_links"]["cited_by"].get("total", "0")
                        cited = int("".join(filter(str.isdigit, str(txt)))) if txt else 0
                    year = None
                    pub_info = item.get("publication_info", {}).get("summary", "")
                    if isinstance(pub_info, (int, float)): pub_info = str(int(pub_info))
                    elif not isinstance(pub_info, str): pub_info = str(pub_info)
                    for token in pub_info.replace(",", " ").split():
                        if token.isdigit() and 1900 < int(token) < 2100:
                            year = int(token); break
                    if year and year >= MIN_YEAR and cited >= MIN_CITATIONS_SCHOLAR:
                        parts = pub_info.split(" - ", 1)
                        authors = parts[0].strip() if len(parts) > 0 else "Unknown"
                        venue = parts[1].strip() if len(parts) > 1 else ""
                        papers.append({
                            "title": title, "authors": authors, "year": year, "venue": venue,
                            "citations": cited, "abstract": snippet[:500], "pdf_link": link,
                            "source": "Google Scholar"
                        })
                except: continue
            print(f"  Scholar 第 {page+1} 页累计 {len(papers)} 篇")
            time.sleep(2)
        except: break
    return papers

# ------------------- arXiv -------------------
def crawl_arxiv_api(query, max_pages=3, api_key=None):
    if not api_key: return []
    papers = []
    base_url = "https://serpapi.com/search"
    for page in range(max_pages):
        params = {
            "engine": "google_scholar", "q": f"{query} source:arxiv", "hl": "en",
            "start": page * 10, "num": 10, "as_ylo": MIN_YEAR, "api_key": api_key
        }
        try:
            data = requests.get(base_url, params=params, timeout=30).json()
            if "organic_results" not in data: break
            for item in data["organic_results"]:
                try:
                    title = item.get("title", "Unknown")
                    link = item.get("link", "")
                    snippet = item.get("snippet", "")
                    cited = 0
                    if item.get("inline_links", {}).get("cited_by"):
                        txt = item["inline_links"]["cited_by"].get("total", "0")
                        cited = int("".join(filter(str.isdigit, str(txt)))) if txt else 0
                    year = None
                    pub_info = item.get("publication_info", {}).get("summary", "")
                    if isinstance(pub_info, (int, float)): pub_info = str(int(pub_info))
                    elif not isinstance(pub_info, str): pub_info = str(pub_info)
                    for token in pub_info.replace(",", " ").split():
                        if token.isdigit() and 1900 < int(token) < 2100:
                            year = int(token); break
                    if year and year >= MIN_YEAR and cited >= MIN_CITATIONS_SCHOLAR:
                        authors = "Unknown"
                        if pub_info:
                            parts = pub_info.split(" - ", 1)
                            authors = parts[0].strip() if len(parts) > 0 else "Unknown"
                        pdf_link = ""
                        if "arxiv.org/abs/" in link:
                            arxiv_id = link.split("/abs/")[-1].split("?")[0].split("#")[0]
                            pdf_link = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
                        elif "arxiv.org/pdf/" in link:
                            pdf_link = link
                        papers.append({
                            "title": title, "authors": authors, "year": year, "venue": "arXiv",
                            "citations": cited, "abstract": snippet[:500], "pdf_link": pdf_link or link,
                            "source": "arXiv"
                        })
                except: continue
            print(f"  arXiv 第 {page+1} 页累计 {len(papers)} 篇")
            time.sleep(2)
        except: break
    return papers

# ------------------- Web of Science -------------------
def crawl_wos_api(query, max_pages=3, api_key=None):
    if not api_key: return []
    papers = []
    base_url = "https://serpapi.com/search"
    for page in range(max_pages):
        params = {
            "engine": "web_of_science",
            "q": query,
            "page": page + 1,
            "api_key": api_key
        }
        try:
            data = requests.get(base_url, params=params, timeout=30).json()
            if "search_results" not in data: break
            for item in data["search_results"]:
                try:
                    title = item.get("title", "Unknown")
                    link = item.get("link", "")
                    snippet = item.get("abstract", "")[:500]
                    cited = int(item.get("cited_by", 0)) if item.get("cited_by") else 0
                    year = int(item.get("year", 0)) if item.get("year") else None
                    if year and year >= MIN_YEAR and cited >= MIN_CITATIONS_SCHOLAR:
                        authors = item.get("authors", "Unknown")
                        venue = item.get("source", "WoS")
                        papers.append({
                            "title": title, "authors": authors, "year": year, "venue": venue,
                            "citations": cited, "abstract": snippet, "pdf_link": link,
                            "source": "Web of Science"
                        })
                except: continue
            print(f"  WoS 第 {page+1} 页累计 {len(papers)} 篇")
            time.sleep(2)
        except: break
    return papers

# ------------------- CNKI -------------------
def crawl_cnki_api(query, max_pages=3, api_key=None):
    if not api_key: return []
    papers = []
    base_url = "https://serpapi.com/search"
    for page in range(max_pages):
        params = {
            "engine": "cnki",
            "q": query,
            "page": page + 1,
            "api_key": api_key
        }
        try:
            data = requests.get(base_url, params=params, timeout=30).json()
            if "results" not in data: break
            for item in data["results"]:
                try:
                    title = item.get("title", "Unknown")
                    link = item.get("link", "")
                    snippet = item.get("abstract", "")[:500]
                    cited = int(item.get("cited_count", 0)) if item.get("cited_count") else 0
                    year_str = item.get("year", "")
                    year = int(year_str) if year_str.isdigit() else None
                    if year and year >= MIN_YEAR and cited >= MIN_CITATIONS_SCHOLAR:
                        authors = item.get("authors", "Unknown")
                        venue = item.get("journal", "CNKI")
                        pdf_link = link
                        if "kns" in link and "pdf" not in link:
                            pdf_link = link.replace("detail", "pdf")
                        papers.append({
                            "title": title, "authors": authors, "year": year, "venue": venue,
                            "citations": cited, "abstract": snippet, "pdf_link": pdf_link,
                            "source": "CNKI"
                        })
                except: continue
            print(f"  CNKI 第 {page+1} 页累计 {len(papers)} 篇")
            time.sleep(2)
        except: break
    return papers

# ------------------- Scopus -------------------
def crawl_scopus_api(query, max_pages=3, api_key=None):
    if not api_key: return []
    papers = []
    base_url = "https://serpapi.com/search"
    for page in range(max_pages):
        params = {
            "engine": "scopus",
            "q": query,
            "start": page * 25,
            "count": 25,
            "api_key": api_key
        }
        try:
            data = requests.get(base_url, params=params, timeout=30).json()
            if "search_results" not in data: break
            for item in data["search_results"]:
                try:
                    title = item.get("title", "Unknown")
                    link = item.get("link", "")
                    snippet = item.get("abstract", "")[:500]
                    cited = int(item.get("citedby_count", 0)) if item.get("citedby_count") else 0
                    year = int(item.get("cover_date", "")[:4]) if item.get("cover_date") else None
                    if year and year >= MIN_YEAR and cited >= MIN_CITATIONS_SCHOLAR:
                        authors = item.get("creator", "Unknown")
                        venue = item.get("publication_name", "Scopus")
                        papers.append({
                            "title": title, "authors": authors, "year": year, "venue": venue,
                            "citations": cited, "abstract": snippet, "pdf_link": link,
                            "source": "Scopus"
                        })
                except: continue
            print(f"  Scopus 第 {page+1} 页累计 {len(papers)} 篇")
            time.sleep(2)
        except: break
    return papers

# ------------------- 邮件发送 -------------------
def send_email_with_attachments(csv_path, bib_path, zip_path):
    msg = MIMEMultipart()
    msg['From'] = EMAIL_SENDER
    msg['To'] = EMAIL_RECEIVER
    msg['Subject'] = f"【ICESat-2论文】Top {TOP_N} 高被引 - {datetime.now().strftime('%Y%m%d')}"

    body = f"附件为 5 大平台检索的 Top {TOP_N} 篇高被引论文（含关键词高亮）。"
    msg.attach(MIMEText(body, 'plain', 'utf-8'))

    for file_path in [csv_path, bib_path, zip_path]:
        if not os.path.exists(file_path): continue
        with open(file_path, "rb") as f:
            part = MIMEBase('application', 'octet-stream')
            part.set_payload(f.read())
            encoders.encode_base64(part)
            part.add_header('Content-Disposition', f'attachment; filename= {os.path.basename(file_path)}')
            msg.attach(part)

    for attempt in range(3):
        try:
            print(f"正在发送邮件（第 {attempt+1} 次）...")
            server = smtplib.SMTP("smtp.qq.com", 587, timeout=20)
            server.ehlo()
            server.starttls()
            server.ehlo()
            server.login(EMAIL_SENDER, EMAIL_PASSWORD)
            server.sendmail(EMAIL_SENDER, EMAIL_RECEIVER, msg.as_string())
            server.quit()
            print(f"邮件发送成功！")
            return
        except Exception as e:
            print(f"失败: {e}")
            time.sleep(3)
    print("邮件发送失败！")

# ------------------- 主函数 -------------------
def main():
    all_papers = []

    print("开始检索 Google Scholar...")
    for q in QUERIES["scholar"]:
        print(f"  查询: {q}")
        all_papers.extend(crawl_google_scholar_api(q, MAX_PAGES["scholar"], SERPAPI_KEY))

    print("\n开始检索 arXiv...")
    for q in QUERIES["arxiv"]:
        print(f"  查询: {q}")
        all_papers.extend(crawl_arxiv_api(q, MAX_PAGES["arxiv"], SERPAPI_KEY))

    print("\n开始检索 Web of Science...")
    for q in QUERIES["wos"]:
        print(f"  查询: {q}")
        all_papers.extend(crawl_wos_api(q, MAX_PAGES.get("wos", 2), SERPAPI_KEY))

    print("\n开始检索 中国知网...")
    for q in QUERIES["cnki"]:
        print(f"  查询: {q}")
        all_papers.extend(crawl_cnki_api(q, MAX_PAGES.get("cnki", 2), SERPAPI_KEY))

    print("\n开始检索 Scopus...")
    for q in QUERIES["scopus"]:
        print(f"  查询: {q}")
        all_papers.extend(crawl_scopus_api(q, MAX_PAGES.get("scopus", 2), SERPAPI_KEY))

    # 清洗
    clean = []
    for p in all_papers:
        try:
            clean.append({
                "title": str(p.get("title", "Unknown")),
                "authors": str(p.get("authors", "Unknown")),
                "year": int(p.get("year") or 0),
                "venue": str(p.get("venue", "Unknown")),
                "citations": int(p.get("citations") or 0),
                "abstract": str(p.get("abstract", ""))[:500],
                "pdf_link": str(p.get("pdf_link", "")),
                "source": str(p.get("source", "Unknown"))
            })
        except: continue

    if not clean:
        print("未抓到论文！")
        return

    df = pd.DataFrame(clean).drop_duplicates(subset=["title"])
    df = df[df["year"] >= MIN_YEAR]
    df.sort_values(by=["citations", "year"], ascending=[False, False], inplace=True)
    df_top = df.head(TOP_N).copy()

    # 关键词高亮
    df_top["title_hl"] = df_top["title"].apply(lambda x: highlight_text(x, HIGHLIGHT_KEYWORDS))
    df_top["abstract_hl"] = df_top["abstract"].apply(lambda x: highlight_text(x, HIGHLIGHT_KEYWORDS))

    ts = datetime.now().strftime("%Y%m%d")
    csv_path = os.path.join(OUTPUT_DIR, f"icesat2_highlight_top{TOP_N}_{ts}.csv")
    bib_path = os.path.join(OUTPUT_DIR, f"icesat2_top{TOP_N}_{ts}.bib")

    # 保存高亮 CSV（可用 Excel 打开看到 <b> 加粗）
    df_top[["title_hl", "authors", "year", "venue", "citations", "source", "abstract_hl", "pdf_link"]].to_csv(
        csv_path, index=False, encoding="utf-8-sig"
    )

    # BibTeX
    db = BibDatabase()
    for _, row in df_top.iterrows():
        author_str = row["authors"].replace(", ", " and ")
        first = "".join(c for c in row["authors"].split(",")[0].lower() if c.isalnum()) or "unknown"
        entry = {
            "ENTRYTYPE": "misc", "ID": f"{first}{row['year']}", "title": row["title"],
            "author": author_str, "year": str(row["year"]), "url": row["pdf_link"],
            "note": f"[{row['source']}] Citations: {row['citations']}"
        }
        if "arxiv" in row["source"].lower():
            entry["howpublished"] = f"\\url{{{row['pdf_link']}}}"
        else:
            entry["journal"] = row["venue"]
        db.entries.append(entry)
    with open(bib_path, "w", encoding="utf-8") as f:
        f.write(BibTexWriter().write(db))

    # 下载 PDF
    print(f"\n开始下载 Top {TOP_N} 论文 PDF...")
    success = 0
    for _, row in tqdm(df_top.iterrows(), total=len(df_top), desc="下载 PDF"):
        url = row["pdf_link"]
        if not url: continue
        safe_title = safe_filename(row["title"])
        safe_auth = safe_filename(row["authors"].split(",")[0])
        filename = f"{row['year']}_{safe_auth}_{safe_title}.pdf"
        filepath = os.path.join(PDF_DIR, filename)
        if os.path.exists(filepath):
            success += 1
            continue
        if download_pdf(url, filepath):
            success += 1
        time.sleep(1)

    # 打包
    with zipfile.ZipFile(ZIP_PATH, 'w', zipfile.ZIP_DEFLATED) as zf:
        for file in os.listdir(PDF_DIR):
            zf.write(os.path.join(PDF_DIR, file), file)

    # 发送邮件
    print(f"\n正在发送邮件到 {EMAIL_RECEIVER}...")
    send_email_with_attachments(csv_path, bib_path, ZIP_PATH)

if __name__ == "__main__":
    main()