In [None]:
import os
import json
import time
import pandas as pd
import requests
import csv
import traceback
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urljoin
from bs4 import BeautifulSoup

In [None]:
## step 1
## 获取每个页面的links
## Collect the detail-page links from every Open-i search-result page

def scrape_links(pages_to_scrape=128, output_path="/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/all_links.csv", wait_seconds=6):
    """
    Collect the result links shown on the Open-i "glomerulus" grid pages.

    Every grid page is requested with ``m``/``n`` bounds that cover 100 hits
    (1-100, 101-200, ...), rendered in headless Chrome, and each ``<a>`` found
    inside ``div#grid`` is converted to an absolute URL.

    :param pages_to_scrape: number of grid pages to visit.
    :param output_path: CSV file that receives the links in a ``Links`` column.
        Nothing is written when no link was collected.
    :param wait_seconds: pause after each page load before the HTML is read.
    :return: None
    """
    base_url = 'https://openi.nlm.nih.gov/'
    pages = []

    # Do not start a browser when there is no page to visit.
    if pages_to_scrape > 0:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        try:
            for page_num in range(pages_to_scrape):
                m = 1 + page_num * 100
                n = 100 + page_num * 100
                url = f'https://openi.nlm.nih.gov/gridquery?q=glomerulus&m={m}&n={n}&it=xg'

                print(f"get {page_num + 1} page: {url}")

                try:
                    driver.get(url)
                    # Presumably gives the page's scripts time to fill the grid
                    # before the DOM is snapshotted.
                    time.sleep(wait_seconds)
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                except Exception as e:
                    # One bad page must not throw away the links gathered so far.
                    print(f"failed to load page {page_num + 1}: {e}")
                    continue

                for a_tag in soup.select('div#grid a'):
                    href = a_tag.get('href')
                    if href:
                        # urljoin copes with relative, root-relative and
                        # already-absolute hrefs alike.
                        pages.append(urljoin(base_url, href))
        finally:
            # Always release the Chrome process, even on Ctrl-C / kernel interrupt.
            driver.quit()

    df = pd.DataFrame(pages, columns=['Links'])

    if df.empty:
        print("DataFrame is empty, cannot save")
    else:
        df.to_csv(output_path, index=False)
        print(f"saved to: {output_path}")

if __name__ == '__main__':
    # Step 1 entry point: crawl 128 grid pages and store the links as CSV.
    scrape_links(128, "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/all_links.csv")

In [1]:
## step 2
## 获取pubmed 链接
## Extract the PubMed link from each detail page collected in step 1

def _launch_pubmed_lookup_driver():
    """Start the headless Chrome instance used to open the detail pages."""
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # "width,height" is the form used by the other cells of this notebook.
    options.add_argument('--window-size=1920,1080')
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.90 Safari/537.36")
    options.add_argument('--disable-blink-features=AutomationControlled')
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


def _load_saved_records(output_json):
    """Return the record list stored in ``output_json``, or ``[]`` if absent or unusable."""
    if not os.path.exists(output_json):
        return []
    with open(output_json, "r", encoding="utf-8") as f:
        try:
            loaded = json.load(f)
        except json.JSONDecodeError:
            loaded = None
    if not isinstance(loaded, list):
        print("⚠️ JSON 文件损坏或为空，重新开始")
        return []
    return loaded


def _write_json_atomically(path, payload):
    """Write ``payload`` next to ``path`` and swap it in, so ``path`` is never half-written."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=4, ensure_ascii=False)
    os.replace(tmp_path, path)


def _find_pubmed_href(driver):
    """Return the PubMed href of the page loaded in ``driver``, or ``None`` if not found."""
    # First attempt: any link whose text mentions "PubMed" but not "Central".
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//a[contains(text(), "PubMed") and not(contains(text(), "Central"))]'))
        )
        href = element.get_attribute("href")
        print(f"✅ 智能匹配成功: {href}")
        return href
    except Exception:
        # ``Exception`` (not a bare except) so Ctrl-C still stops the run.
        print("⚠️ 智能匹配失败，尝试 fallback XPath...")

    # Second attempt: fixed absolute XPath into the detail-page layout.
    try:
        element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/app-root/div/app-detailedresult/div[1]/div/div[1]/div[5]/span[2]/span[2]/a'))
        )
        href = element.get_attribute("href")
        print(f"✅ fallback 匹配成功: {href}")
        return href
    except Exception as e:
        print(f"❌ fallback 匹配失败: {e}")
        return None


def extract_href_from_links(csv_file_path, output_json):
    """
    Visit every detail page listed in a CSV and record its PubMed link.

    The result list is saved after every page, so an interrupted run can be
    resumed: URLs already present in ``output_json`` are skipped. The browser
    is only started once a URL actually needs to be visited.

    :param csv_file_path: CSV file with a ``Links`` column.
    :param output_json: JSON file holding a list of
        ``{"Original_Link": ..., "Extracted_Href": ...}`` records;
        ``Extracted_Href`` is ``None`` when no PubMed link was found.
    :return: None
    """
    df = pd.read_csv(csv_file_path)
    urls = df["Links"].dropna().tolist()

    # dirname is "" for a bare file name, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_json)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    extracted_data = _load_saved_records(output_json)
    processed_links = {
        entry.get("Original_Link") for entry in extracted_data if isinstance(entry, dict)
    }

    driver = None
    try:
        for i, url in enumerate(urls, start=1):
            if url in processed_links:
                print(f"⚠️ 已提取过，跳过第 {i} 个链接: {url}")
                continue

            if driver is None:
                driver = _launch_pubmed_lookup_driver()

            extracted_href = None
            print(f"🔍 正在处理第 {i} 个链接: {url}")
            try:
                time.sleep(2)
                driver.get(url)
                extracted_href = _find_pubmed_href(driver)
            except Exception as e:
                print(f"❌ 页面加载失败: {e}")

            extracted_data.append({
                "Original_Link": url,
                "Extracted_Href": extracted_href
            })
            processed_links.add(url)

            # Checkpoint after every page so progress survives an interruption.
            _write_json_atomically(output_json, extracted_data)
            print(f"💾 已写入：{output_json}（共 {len(extracted_data)} 条）")
    finally:
        # Release Chrome even when the loop is interrupted.
        if driver is not None:
            driver.quit()

    print(f"✅ 所有链接处理完成，最终结果保存在：{output_json}")


if __name__ == '__main__':
    # Step 2 entry point: read the step-1 CSV, write the PubMed links as JSON.
    csv_file_path, output_json = (
        "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/all_links.csv",
        "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed.json",
    )

    extract_href_from_links(csv_file_path=csv_file_path, output_json=output_json)


🔍 正在处理第 1 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC5426794_pgen.1006751.g004&query=glomerulus&it=xg&req=4&npos=1
✅ 智能匹配成功: http://www.ncbi.nlm.nih.gov/pubmed/28448523
🔍 正在处理第 2 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC3412803_pone.0042814.g001&query=glomerulus&it=xg&req=4&npos=2
✅ 智能匹配成功: http://www.ncbi.nlm.nih.gov/pubmed/22880115
🔍 正在处理第 3 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC4928931_pone.0157497.g003&query=glomerulus&it=xg&req=4&npos=3
✅ 智能匹配成功: http://www.ncbi.nlm.nih.gov/pubmed/27362433
🔍 正在处理第 4 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC5133832_ijms-17-01831-g005&query=glomerulus&it=xg&req=4&npos=4
✅ 智能匹配成功: http://www.ncbi.nlm.nih.gov/pubmed/27827846
🔍 正在处理第 5 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC4398543_pone.0116700.g001&query=glomerulus&it=xg&req=4&npos=5
✅ 智能匹配成功: http://www.ncbi.nlm.nih.gov/pubmed/25875837
🔍 正在处理第 6 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC3151521_ECAM2012-235358.006&query=glomeru

In [5]:
# step 3
## 获取所有的图片和对应的文字 txt
## Download every figure image and save its caption (.txt) for each PubMed link

def download_image(image_url, save_path):
    """
    Download ``image_url`` and store its bytes at ``save_path``.

    Errors are reported on stdout instead of being raised, so one bad image
    does not stop a batch. The data is streamed into ``<save_path>.part`` and
    only renamed to ``save_path`` once the transfer is complete, so a failed
    download never leaves a truncated image behind.

    :param image_url: URL of the image to fetch.
    :param save_path: destination file path.
    :return: ``True`` if the file was written completely, ``False`` otherwise.
    """
    partial_path = f"{save_path}.part"
    try:
        # The ``with`` block closes the streamed response on every exit path.
        with requests.get(image_url, stream=True, timeout=15) as response:
            if response.status_code != 200:
                print(f"⚠️ 无法下载: {image_url} - 状态码: {response.status_code}")
                return False
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(8192):
                    file.write(chunk)
        os.replace(partial_path, save_path)
    except Exception as e:
        # Deliberately broad: network and disk errors are all "skip this image".
        print(f"❌ 下载失败: {image_url} - {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

    print(f"✅ 下载成功: {save_path}")
    return True


def _launch_figure_txt_driver():
    """Start the headless Chrome instance used to open the article pages."""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.90 Safari/537.36")
    options.add_argument('--disable-blink-features=AutomationControlled')
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


def extract_and_download_images(json_file, output_folder):
    """
    Download every figure and its caption for the links stored in a JSON file.

    For each record with a non-empty ``Extracted_Href`` the page is opened in
    headless Chrome. Each ``<figure>`` that contains an ``a.figure-link`` is
    saved as ``<record index>_<figure index>.jpg`` together with a ``.txt``
    file of the same stem holding the ``<figcaption>`` text ("无描述" when the
    figure has no caption). The browser is only started once a record with a
    link is reached.

    :param json_file: JSON list of records with an ``Extracted_Href`` key.
    :param output_folder: directory for the images and captions (created if missing).
    :return: None
    """

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    os.makedirs(output_folder, exist_ok=True)

    driver = None
    try:
        for index, item in enumerate(data, start=1):
            url = item.get("Extracted_Href")
            if not url:
                print(f"❌ 跳过 {index}，没有链接")
                continue

            if driver is None:
                driver = _launch_figure_txt_driver()

            try:
                print(f"\n🌐 正在访问链接 [{index}]: {url}")
                driver.get(url)

                # If no <figure> shows up within 10 s the wait raises, and the
                # handler below reports it.
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "figure"))
                )

                soup = BeautifulSoup(driver.page_source, 'html.parser')
                figures = soup.find_all("figure")

                print(f"🔍 找到 {len(figures)} 个 <figure> 标签")

                if not figures:
                    print(f"⚠️ 没有找到任何 figure，可能网页结构有变")
                    continue

                for fig_index, figure in enumerate(figures, start=1):
                    a_tag = figure.find("a", class_="figure-link")
                    img_url = a_tag['href'] if a_tag and a_tag.get('href') else None

                    if not img_url:
                        print(f"⚠️ 第 {fig_index} 个 figure 没有找到 <a> 中的图片链接")
                        continue

                    # The href may be relative to the article page.
                    img_url = urljoin(url, img_url)

                    figcaption_tag = figure.find("figcaption")
                    caption_text = figcaption_tag.get_text(separator=" ", strip=True) if figcaption_tag else "无描述"

                    print(f"🖼️ 第 {fig_index} 图像链接: {img_url}")
                    # Only the first 100 characters are shown as a preview.
                    print(f"📝 第 {fig_index} 图像描述: {caption_text[:100]}...")

                    image_name = f"{index}_{fig_index}.jpg"
                    image_path = os.path.join(output_folder, image_name)
                    download_image(img_url, image_path)

                    caption_name = f"{index}_{fig_index}.txt"
                    caption_path = os.path.join(output_folder, caption_name)
                    with open(caption_path, 'w', encoding='utf-8') as f:
                        f.write(caption_text)

            except Exception as e:
                print(f"❌ 处理 {url} 出错：{e}")
                traceback.print_exc()
    finally:
        # Release Chrome even when the run is interrupted (e.g. Ctrl-C).
        if driver is not None:
            driver.quit()

    print("🎉 全部处理完成！")


if __name__ == '__main__':
    # Step 3 entry point: read the step-2 JSON, save figures and captions.
    json_file, output_folder = (
        "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed.json",
        "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed",
    )

    extract_and_download_images(json_file=json_file, output_folder=output_folder)



🌐 正在访问链接 [1]: http://www.ncbi.nlm.nih.gov/pubmed/28448523
🔍 找到 5 个 <figure> 标签
🖼️ 第 1 图像链接: https://cdn.ncbi.nlm.nih.gov/pmc/blobs/8be1/5426794/9adbcc64ec35/pgen.1006751.g001.jpg
📝 第 1 图像描述: Fig 1. Dendrites of DA1 and diffuse vPNs mis-targeted to the DA3 glomerulus in the Sema-1a P1 mutant...
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed/1_1.jpg
🖼️ 第 2 图像链接: https://cdn.ncbi.nlm.nih.gov/pmc/blobs/8be1/5426794/4ca34f2157d6/pgen.1006751.g002.jpg
📝 第 2 图像描述: Fig 2. Dendrites of embryonic- and larval-born adPNs aberrantly invaded into the DA3 glomerulus in t...
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed/1_2.jpg
🖼️ 第 3 图像链接: https://cdn.ncbi.nlm.nih.gov/pmc/blobs/8be1/5426794/75a6aad08ba6/pgen.1006751.g003.jpg
📝 第 3 图像描述: Fig 3. Manipulating Sema-1a expression resulted in DA3 adPN dendrites to mis-project away from the D...
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed/1_3.jpg
🖼️ 第 4 图像链接: https://cdn.ncbi.nlm.nih.gov/pm

In [12]:
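## step 3 (alternative version)
## 获取所有的图片和对应的文字 csv
## Download every figure image and save its caption, paper title and DOI to a per-figure CSV
## NOTE: this cell redefines download_image and extract_and_download_images from the previous step-3 cell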

def download_image(image_url, save_path):
    """
    Download ``image_url`` and store its bytes at ``save_path``.

    NOTE: this redefinition shadows the function of the same name defined in
    the previous step-3 cell.

    Errors are reported on stdout instead of being raised, so one bad image
    does not stop a batch. The data is streamed into ``<save_path>.part`` and
    only renamed to ``save_path`` once the transfer is complete, so a failed
    download never leaves a truncated image behind.

    :param image_url: URL of the image to fetch.
    :param save_path: destination file path.
    :return: ``True`` if the file was written completely, ``False`` otherwise.
    """
    partial_path = f"{save_path}.part"
    try:
        # The ``with`` block closes the streamed response on every exit path.
        with requests.get(image_url, stream=True, timeout=15) as response:
            if response.status_code != 200:
                print(f"⚠️ 无法下载: {image_url} - 状态码: {response.status_code}")
                return False
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(8192):
                    file.write(chunk)
        os.replace(partial_path, save_path)
    except Exception as e:
        # Deliberately broad: network and disk errors are all "skip this image".
        print(f"❌ 下载失败: {image_url} - {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

    print(f"✅ 下载成功: {save_path}")
    return True

def _launch_figure_csv_driver():
    """Start the headless Chrome instance used to open the article pages."""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.90 Safari/537.36")
    options.add_argument('--disable-blink-features=AutomationControlled')
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


def _clean_doi_text(raw_doi):
    """Drop the ``doi:`` label and the trailing period from the citation text."""
    return raw_doi.replace("doi:", "").strip().rstrip(".").strip()


def extract_and_download_images(json_file, output_folder):
    """
    Download every figure and write its metadata to a one-row CSV.

    NOTE: this redefinition shadows the function of the same name from the
    previous step-3 cell, which writes ``.txt`` captions instead.

    For each record with a non-empty ``Extracted_Href`` the page is opened in
    headless Chrome. The title (``h1.heading-title``) and DOI
    (``span.citation-doi``) are read once per page. Each ``<figure>`` that
    contains an ``a.figure-link`` is saved as
    ``<record index>_<figure index>.jpg`` plus a CSV of the same stem with the
    columns ``image_name``, ``caption_text``, ``paper_title`` and ``doi_link``.
    Missing values are stored as "无描述", "无标题" and "无DOI". The browser is
    only started once a record with a link is reached.

    :param json_file: JSON list of records with an ``Extracted_Href`` key.
    :param output_folder: directory for the images and CSV files (created if missing).
    :return: None
    """

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    os.makedirs(output_folder, exist_ok=True)

    fieldnames = ["image_name", "caption_text", "paper_title", "doi_link"]

    driver = None
    try:
        for index, item in enumerate(data, start=1):
            url = item.get("Extracted_Href")
            if not url:
                print(f"❌ 跳过 {index}，没有链接")
                continue

            if driver is None:
                driver = _launch_figure_csv_driver()

            try:
                print(f"\n🌐 正在访问链接 [{index}]: {url}")
                driver.get(url)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                title_tag = soup.select_one("h1.heading-title")
                paper_title = title_tag.get_text(strip=True) if title_tag else "无标题"

                doi_tag = soup.select_one("span.citation-doi")
                raw_doi = doi_tag.get_text(strip=True) if doi_tag else "无DOI"
                # The citation text ends with a period (e.g. "doi: 10.1371/x.");
                # strip it so the stored DOI is usable as-is.
                doi_link = _clean_doi_text(raw_doi)

                print(f"📄 文章标题: {paper_title}")
                print(f"🔗 DOI: {doi_link}")

                figures = soup.find_all("figure")
                print(f"🖼️ 找到 {len(figures)} 个 <figure> 标签")

                for fig_index, figure in enumerate(figures, start=1):
                    a_tag = figure.find("a", class_="figure-link")
                    # The href may be relative to the article page.
                    img_url = urljoin(url, a_tag["href"]) if a_tag and a_tag.get("href") else None
                    if not img_url:
                        print(f"⚠️ 第 {fig_index} 图没有图片链接，跳过")
                        continue

                    caption_tag = figure.find("figcaption")
                    caption_text = caption_tag.get_text(separator=" ", strip=True) if caption_tag else "无描述"

                    image_name = f"{index}_{fig_index}.jpg"
                    image_path = os.path.join(output_folder, image_name)
                    download_image(img_url, image_path)

                    csv_name = f"{index}_{fig_index}.csv"
                    csv_path = os.path.join(output_folder, csv_name)

                    with open(csv_path, "w", encoding="utf-8", newline='') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                        writer.writeheader()
                        writer.writerow({
                            "image_name": image_name,
                            "caption_text": caption_text,
                            "paper_title": paper_title,
                            "doi_link": doi_link
                        })

                    print(f"✅ CSV 已保存: {csv_path}")

            except Exception as e:
                print(f"❌ 处理链接 {url} 时出错：{e}")
                traceback.print_exc()
    finally:
        # Release Chrome even when the run is interrupted (e.g. Ctrl-C).
        if driver is not None:
            driver.quit()

    print("\n🎉 所有任务完成！")
    print("\n🎉 所有任务完成！")

if __name__ == '__main__':
    # Step 3 (CSV variant) entry point: read the step-2 JSON, save figures and CSV metadata.
    json_file, output_folder = (
        "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed.json",
        "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed",
    )

    extract_and_download_images(json_file=json_file, output_folder=output_folder)



🌐 正在访问链接 [1]: http://www.ncbi.nlm.nih.gov/pubmed/28448523
📄 文章标题: Semaphorin-1a prevents Drosophila olfactory projection neuron dendrites from mis-targeting into select antennal lobe regions
🔗 DOI: 10.1371/journal.pgen.1006751.
🖼️ 找到 5 个 <figure> 标签
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed/1_1.jpg
✅ CSV 已保存: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed/1_1.csv
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed/1_2.jpg
✅ CSV 已保存: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed/1_2.csv
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed/1_3.jpg
✅ CSV 已保存: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed/1_3.csv
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed/1_4.jpg
✅ CSV 已保存: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed/1_4.csv
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed/1_5.jpg
✅ CSV 已保