In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import traceback
import time
import os
import shutil
import csv

In [1]:
def build_base_url(start_year, end_year):
    """Return the PubMed search URL for the term Glomerulus in a year range.

    The URL carries a fixed list of publication-type filters followed by a
    ``years.<start_year>-<end_year>`` filter. It contains no ``page``
    parameter; callers append ``&page=N`` themselves.

    Args:
        start_year: First publication year, interpolated into the URL as-is.
        end_year: Last publication year, interpolated into the URL as-is.

    Returns:
        The complete search URL as a string.
    """
    publication_types = (
        "casereports",
        "clinicalconference",
        "clinicalstudy",
        "clinicaltrial",
        "clinicaltrialphasei",
        "clinicaltrialphaseii",
        "clinicaltrialprotocol",
        "guideline",
        "legalcase",
        "legislation",
        "meta-analysis",
        "multicenterstudy",
        "observationalstudy",
        "randomizedcontrolledtrial",
        "systematicreview",
        "veterinaryclinicaltrial",
    )
    # Every query component after the search term is joined with "&".
    query_parts = ["https://pubmed.ncbi.nlm.nih.gov/?term=Glomerulus"]
    query_parts.extend(f"filter=pubt.{pub_type}" for pub_type in publication_types)
    query_parts.append(f"filter=years.{start_year}-{end_year}")
    return "&".join(query_parts)

def get_total_pages(driver, base_url):
    """Open the first result page and read the total number of result pages.

    Args:
        driver: Selenium WebDriver used to load the page.
        base_url: Search URL without a ``page`` parameter; ``&page=1`` is
            appended before loading.

    Returns:
        The total page count as an int, or 0 if the page-count label could
        not be located or parsed within 15 seconds (the traceback is
        printed in that case).
    """
    driver.get(base_url + "&page=1")
    try:
        # until() returns the element found by the condition, so a second
        # find_element() lookup is unnecessary.
        label = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "of-total-pages"))
        )
        # The last whitespace-separated token is the count. Remove thousands
        # separators (e.g. "1,234") that int() would otherwise reject.
        total_pages = int(label.text.split()[-1].replace(",", ""))
        print(f"总页数：{total_pages}")
        return total_pages
    except Exception:
        # Best effort: report the failure and let the caller treat 0 as
        # "no pages available".
        print("获取总页数失败")
        traceback.print_exc()
        return 0

def get_titles_from_page(driver, url):
    """Load one result page and collect the article titles shown on it.

    Waits up to 15 seconds for an ``article.full-docsum`` element to be
    present, then reads the stripped text of the ``a.docsum-title`` link
    inside every such article.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Full URL of the result page to load.

    Returns:
        A list of title strings in page order, or an empty list if anything
        raised while waiting or extracting (the traceback is printed).
    """
    driver.get(url)
    article_selector = "article.full-docsum"
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, article_selector))
        )
        page_titles = []
        for entry in driver.find_elements(By.CSS_SELECTOR, article_selector):
            title_link = entry.find_element(By.CSS_SELECTOR, "a.docsum-title")
            page_titles.append(title_link.text.strip())
        return page_titles
    except Exception:
        print("抓取标题失败")
        traceback.print_exc()
        return []

def save_to_txt(titles, file_path):
    """Write titles to a UTF-8 text file as a numbered list.

    Each title is written on its own line as ``"<n>. <title>"`` with
    numbering starting at 1. Lines are separated by ``"\\n"`` and there is
    no trailing newline. Missing parent directories are created first so a
    finished scrape is not lost to a missing output folder.

    Args:
        titles: Iterable of title strings.
        file_path: Destination file path; an existing file is overwritten.

    Returns:
        None. Failures are reported on stdout (with traceback) rather than
        raised.
    """
    try:
        parent_dir = os.path.dirname(file_path)
        # dirname is "" for a bare file name; makedirs("") would raise.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        numbered_lines = (
            f"{number}. {title}" for number, title in enumerate(titles, start=1)
        )
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write("\n".join(numbered_lines))
        print(f"文件已保存至：{file_path}")
    except Exception:
        print("文件保存失败")
        traceback.print_exc()

def main(start_year, end_year, output_path):
    """Scrape all PubMed result pages for a year range and save the titles.

    Starts a headless Chrome session, determines the number of result
    pages, visits each page in order (pausing one second between pages),
    and writes every collected title to ``output_path``. The browser is
    always shut down, even if scraping raises.

    Args:
        start_year: First publication year of the search filter.
        end_year: Last publication year of the search filter.
        output_path: Text file the numbered titles are written to.

    Returns:
        None. Returns early, without writing a file, when the page count
        comes back as 0.
    """
    chrome_options = Options()
    for flag in (
        '--headless',
        '--disable-gpu',
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    ):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

    try:
        base_url = build_base_url(start_year, end_year)
        total_pages = get_total_pages(driver, base_url)
        if total_pages == 0:
            print("没有获取到有效页数，程序退出。")
            return

        collected_titles = []
        for page_no in range(1, total_pages + 1):
            print(f"正在抓取第 {page_no}/{total_pages} 页...")
            page_titles = get_titles_from_page(driver, f"{base_url}&page={page_no}")
            if not page_titles:
                print("本页无数据或抓取失败")
            else:
                collected_titles.extend(page_titles)
                print(f"获取到 {len(page_titles)} 个标题")
            # Pause between page loads regardless of the page's outcome.
            time.sleep(1)

        save_to_txt(collected_titles, output_path)

    finally:
        driver.quit()

if __name__ == "__main__":
    # User configuration: publication-year range for the search filter and
    # the text file the scraped titles are written to.
    start_year = 2015
    end_year = 2025
    # NOTE(review): absolute, machine-specific path; adjust before running
    # on another machine.
    output_path = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/paper_list/pubmed_titles.txt"

    main(start_year, end_year, output_path)


总页数：81
正在抓取第 1/81 页...
获取到 10 个标题
正在抓取第 2/81 页...
获取到 10 个标题
正在抓取第 3/81 页...
获取到 10 个标题
正在抓取第 4/81 页...
获取到 10 个标题
正在抓取第 5/81 页...
获取到 10 个标题
正在抓取第 6/81 页...
获取到 10 个标题
正在抓取第 7/81 页...
获取到 10 个标题
正在抓取第 8/81 页...
获取到 10 个标题
正在抓取第 9/81 页...
获取到 10 个标题
正在抓取第 10/81 页...
获取到 10 个标题
正在抓取第 11/81 页...
获取到 10 个标题
正在抓取第 12/81 页...
获取到 10 个标题
正在抓取第 13/81 页...
获取到 10 个标题
正在抓取第 14/81 页...
获取到 10 个标题
正在抓取第 15/81 页...
获取到 10 个标题
正在抓取第 16/81 页...
获取到 10 个标题
正在抓取第 17/81 页...
获取到 10 个标题
正在抓取第 18/81 页...
获取到 10 个标题
正在抓取第 19/81 页...
获取到 10 个标题
正在抓取第 20/81 页...
获取到 10 个标题
正在抓取第 21/81 页...
获取到 10 个标题
正在抓取第 22/81 页...
获取到 10 个标题
正在抓取第 23/81 页...
获取到 10 个标题
正在抓取第 24/81 页...
获取到 10 个标题
正在抓取第 25/81 页...
获取到 10 个标题
正在抓取第 26/81 页...
获取到 10 个标题
正在抓取第 27/81 页...
获取到 10 个标题
正在抓取第 28/81 页...
获取到 10 个标题
正在抓取第 29/81 页...
获取到 10 个标题
正在抓取第 30/81 页...
获取到 10 个标题
正在抓取第 31/81 页...
获取到 10 个标题
正在抓取第 32/81 页...
获取到 10 个标题
正在抓取第 33/81 页...
获取到 10 个标题
正在抓取第 34/81 页...
获取到 10 个标题
正在抓取第 35/81 页...
获取到 10 个标题
正在抓取第 36/81 页...
获取到 1

In [2]:
def load_titles_from_txt(titles_path):
    """Read a title list file and return the titles as a set.

    Blank lines are ignored. A leading ``"<number>. "`` prefix is removed
    from a line; lines without such a numeric prefix are kept whole, so an
    unnumbered title that itself contains ``". "`` is not truncated.

    Args:
        titles_path: Path to a UTF-8 text file with one title per line.

    Returns:
        A set of title strings with surrounding whitespace removed.
    """
    titles = set()
    # utf-8-sig reads plain UTF-8 unchanged and drops a leading BOM, which
    # would otherwise stick to the first line's number.
    with open(titles_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            prefix, separator, remainder = line.partition(". ")
            # Only treat the text before the first ". " as numbering when it
            # is purely digits.
            if separator and prefix.isdigit():
                line = remainder.strip()
            titles.add(line)
    return titles

def filter_and_copy(source_folder, titles_set, output_folder):
    """Copy CSV files whose paper title is wanted, plus their images.

    Every ``.csv`` file directly inside ``source_folder`` is scanned row by
    row. At the first row whose ``paper_title`` column (whitespace-stripped)
    is in ``titles_set``, the CSV is copied to ``output_folder`` together
    with the file named by that row's ``image_name`` column, and scanning
    of that CSV stops. A missing image only produces a warning.

    Args:
        source_folder: Folder containing the CSV files and their images.
        titles_set: Collection of titles to match against ``paper_title``.
        output_folder: Destination folder; created if it does not exist.

    Returns:
        None. Errors for an individual CSV are printed and processing
        continues with the next file.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(source_folder):
        if not filename.endswith('.csv'):
            continue
        csv_path = os.path.join(source_folder, filename)
        try:
            matched_row = None
            # utf-8-sig drops a BOM that would otherwise corrupt the first
            # header name; newline='' is what the csv module expects.
            with open(csv_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
                for row in csv.DictReader(csvfile):
                    # DictReader fills missing trailing fields with None, so
                    # guard before calling strip().
                    if (row.get("paper_title") or "").strip() in titles_set:
                        matched_row = row
                        break  # first match decides; go on to the next CSV
            if matched_row is None:
                continue

            # The CSV is closed at this point, so it is safe to copy.
            shutil.copy(csv_path, os.path.join(output_folder, filename))

            image_name = (matched_row.get("image_name") or "").strip()
            image_path = os.path.join(source_folder, image_name)
            # isfile() rejects the empty-name case, where image_path is the
            # source folder itself and copying would raise.
            if image_name and os.path.isfile(image_path):
                shutil.copy(image_path, os.path.join(output_folder, image_name))
            else:
                print(f"[警告] 找不到对应图片: {image_path}")
        except Exception as e:
            print(f"[错误] 处理文件失败: {filename}")
            print(e)

# ==== User configuration ====
# NOTE(review): absolute, machine-specific paths; adjust before running on
# another machine.
titles_path = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/paper_list/human.txt"
source_folder = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed"
output_folder = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/human"

# ==== Run ====
# Load the wanted titles, then copy matching CSV files (and their images)
# from source_folder into output_folder. The final message is printed
# unconditionally, after filter_and_copy returns.
titles_set = load_titles_from_txt(titles_path)
filter_and_copy(source_folder, titles_set, output_folder)
print("✅ 筛选和复制完成！")


✅ 筛选和复制完成！
