In [1]:
import os
import json
import time
import pandas as pd
import requests
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

In [2]:
## step 1
## 获取每个页面的links
## Collect the detail-page links from every Open-i search-result page

def scrape_links(
    pages_to_scrape=128,
    output_path="/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/all_links.csv",
    query="glomerulus",
    wait_seconds=6,
):
    """
    Collect the links found inside ``div#grid`` on Open-i grid-query result
    pages and save them to a one-column CSV (column name ``Links``).

    Each page requests a window of 100 results (``m``/``n`` query parameters),
    so page ``k`` (0-based) covers results ``1 + 100*k`` .. ``100 + 100*k``.

    :param pages_to_scrape: number of result pages to visit.
    :param output_path: path of the CSV file to write.
    :param query: search term placed in the ``q`` parameter (URL-encoded).
    :param wait_seconds: fixed pause after loading each page, before the
        page source is read.
    :return: None. Nothing is written when no link was collected.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus, urljoin

    base_url = 'https://openi.nlm.nih.gov/'
    encoded_query = quote_plus(query)

    pages = []
    driver = None
    try:
        for page_num in range(pages_to_scrape):
            # Start the browser only once there is a page to fetch, so a
            # zero-page run does not spawn Chrome at all.
            if driver is None:
                options = webdriver.ChromeOptions()
                options.add_argument('--headless')
                driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

            m = 1 + page_num * 100
            n = 100 + page_num * 100
            url = f'{base_url}gridquery?q={encoded_query}&m={m}&n={n}&it=xg'

            print(f"get {page_num + 1} page: {url}")

            driver.get(url)

            # Fixed pause so the grid has time to render before the DOM is read.
            time.sleep(wait_seconds)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for a_tag in soup.select('div#grid a'):
                href = a_tag.get('href')
                if href:
                    # urljoin copes with hrefs that start with "/" or are
                    # already absolute, where plain concatenation would not.
                    pages.append(urljoin(base_url, href))
    finally:
        # Always release the browser, including on errors and Ctrl-C.
        if driver is not None:
            driver.quit()

    df = pd.DataFrame(pages, columns=['Links'])

    if df.empty:
        print("DataFrame is empty, cannot save")
    else:
        df.to_csv(output_path, index=False)
        print(f"saved to: {output_path}")

if __name__ == '__main__':
    # Step 1 entry point: visit 128 result pages and store the collected links.
    scrape_links(pages_to_scrape=128, output_path="/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/all_links.csv")

get 1 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=1&n=100&it=xg
get 2 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=101&n=200&it=xg
get 3 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=201&n=300&it=xg
get 4 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=301&n=400&it=xg
get 5 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=401&n=500&it=xg
get 6 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=501&n=600&it=xg
get 7 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=601&n=700&it=xg
get 8 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=701&n=800&it=xg
get 9 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=801&n=900&it=xg
get 10 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=901&n=1000&it=xg
get 11 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=1001&n=1100&it=xg
get 12 page: https://openi.nlm.nih.gov/gridquery?q=glomerulus&m=1101&n=1200&it=xg
get 13 page: https://openi.nlm.nih.gov/gridque

KeyboardInterrupt: 

In [None]:
## step 2
## 获取pubmed central链接
## Get the PubMed Central link from each detail-page link collected in step 1


import os
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def extract_href_from_links(csv_file_path, output_json):
    """
    Read URLs from a CSV, extract the 'PubMed Central' link from each page,
    and checkpoint the results to a JSON file after every URL.

    The JSON file holds a list of ``{"Original_Link", "Extracted_Href"}``
    objects. If it already exists, its entries are loaded first and any URL
    already present as an ``Original_Link`` is skipped, so an interrupted run
    can be resumed. A URL whose extraction fails is stored with
    ``Extracted_Href`` set to ``None`` (and is therefore also skipped on
    later runs).

    :param csv_file_path: CSV file with a ``Links`` column.
    :param output_json: path of the JSON checkpoint/result file.
    :return: None.
    """

    df = pd.read_csv(csv_file_path)
    urls = df["Links"].dropna().tolist()

    # dirname is '' for a bare filename, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_json)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(output_json):
        with open(output_json, "r", encoding="utf-8") as f:
            extracted_data = json.load(f)
    else:
        extracted_data = []

    # Set membership replaces a scan over every stored entry for every URL.
    processed_links = {entry.get("Original_Link") for entry in extracted_data}

    tmp_json = output_json + ".tmp"
    driver = None
    try:
        for i, url in enumerate(urls, start=1):
            if url in processed_links:
                print(f"⚠️ 已处理过，跳过第 {i} 个链接: {url}")
                continue

            # Launch Chrome lazily: a fully processed list needs no browser.
            if driver is None:
                options = Options()
                options.add_argument('--headless')
                options.add_argument('--disable-gpu')
                # Chrome documents this switch as "width,height".
                options.add_argument('--window-size=1920,1080')
                options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.90 Safari/537.36")
                options.add_argument('--disable-blink-features=AutomationControlled')
                driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

            try:
                print(f"🌐 正在处理第 {i} 个链接: {url}")
                driver.get(url)

                # Wait up to 10 s for an anchor whose text contains "PubMed Central".
                pubmed_link_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//a[contains(text(), "PubMed Central")]'))
                )

                extracted_href = pubmed_link_element.get_attribute("href")
                print(f"✅ 提取成功: {extracted_href}")

            except Exception as e:
                # Best effort: record the failure and move on to the next URL.
                print(f"❌ 提取失败: {e}")
                extracted_href = None

            extracted_data.append({
                "Original_Link": url,
                "Extracted_Href": extracted_href
            })
            processed_links.add(url)

            # Write to a temp file and swap it in, so an interrupt mid-write
            # cannot leave a truncated JSON that breaks the next resume.
            with open(tmp_json, "w", encoding="utf-8") as f:
                json.dump(extracted_data, f, indent=4, ensure_ascii=False)
            os.replace(tmp_json, output_json)
    finally:
        # Always release the browser, including on errors and Ctrl-C.
        if driver is not None:
            driver.quit()

    print(f"🎉 所有链接处理完成，已保存至: {output_json}")


if __name__ == '__main__':
    # Step 2 entry point: input is the step-1 CSV, output is the JSON checkpoint file.
    csv_file_path = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/all_links.csv"
    output_json = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed_central.json"
    extract_href_from_links(csv_file_path=csv_file_path, output_json=output_json)


正在处理第 1 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC5426794_pgen.1006751.g004&query=glomerulus&it=xg&req=4&npos=1
✅ 第 1 个链接提取的 PubMed Central href: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC5426794
正在处理第 2 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC3412803_pone.0042814.g001&query=glomerulus&it=xg&req=4&npos=2
✅ 第 2 个链接提取的 PubMed Central href: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3412803
正在处理第 3 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC4928931_pone.0157497.g003&query=glomerulus&it=xg&req=4&npos=3
✅ 第 3 个链接提取的 PubMed Central href: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4928931
正在处理第 4 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC5133832_ijms-17-01831-g005&query=glomerulus&it=xg&req=4&npos=4
✅ 第 4 个链接提取的 PubMed Central href: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC5133832
正在处理第 5 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC4398543_pone.0116700.g001&query=glomerulus&it=xg&req=4&npos=5
✅ 第 5 个链接提取的 PubMed Central href: http:/

In [3]:
## step 3
## 获取所有的图片和对应的文字 csv
## Download every figure image and save its caption (one CSV per figure) from each PubMed Central link

def download_image(image_url, save_path, timeout=30):
    """
    Download an image over HTTP and save it to ``save_path``.

    Errors are reported with a printed message rather than raised, so one bad
    image does not stop a batch.

    :param image_url: URL of the image to fetch.
    :param save_path: destination file path (overwritten if it exists).
    :param timeout: seconds passed to ``requests.get`` as its ``timeout``;
        without it a stalled connection would block indefinitely.
    :return: True when the file was written completely, False otherwise.
    """
    writing = False  # True only while save_path may hold a partial download
    try:
        # The context manager closes the streamed response and frees its connection.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"⚠️ 无法下载: {image_url}")
                return False

            with open(save_path, 'wb') as file:
                writing = True
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            writing = False

        print(f"✅ 下载成功: {save_path}")
        return True
    except Exception as e:
        print(f"❌ 下载失败: {image_url} - {e}")
        # Do not leave a truncated image behind when the stream broke mid-way.
        if writing and os.path.exists(save_path):
            try:
                os.remove(save_path)
            except OSError:
                pass
        return False

def extract_and_download_images(json_file, output_folder):
    """
    Read links from a JSON file, visit each page, and save every ``<figure>``
    image together with a one-row CSV describing it.

    The JSON file is a list of objects; the page URL is taken from each
    object's ``Extracted_Href`` and entries without one are skipped. For the
    ``fig_index``-th figure of the ``index``-th entry the function writes
    ``{index}_{fig_index}.jpg`` and ``{index}_{fig_index}.csv`` into
    ``output_folder``. The CSV columns are ``image_name``, ``caption_text``,
    ``paper_title`` and ``doi_link``.

    Errors while processing one page are printed and the loop continues.

    :param json_file: path of the JSON file to read.
    :param output_folder: directory for images and CSV files (created if missing).
    :return: None.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    os.makedirs(output_folder, exist_ok=True)

    driver = None
    try:
        for index, item in enumerate(data, start=1):
            url = item.get("Extracted_Href")
            if not url:
                print(f"❌ 跳过 {index}，因为没有可用的链接")
                continue

            # Launch Chrome lazily: no browser is needed if no entry has a link.
            if driver is None:
                options = Options()
                options.add_argument('--headless')
                options.add_argument('--disable-gpu')
                # Chrome documents this switch as "width,height".
                options.add_argument('--window-size=1920,1080')
                options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.90 Safari/537.36")
                options.add_argument('--disable-blink-features=AutomationControlled')
                driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

            try:
                print(f"\n🔍 正在访问第 {index} 个链接: {url}")
                driver.get(url)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                h1_tag = soup.select_one(".ameta.p.font-secondary.font-xs h1")
                paper_title = h1_tag.get_text(strip=True) if h1_tag else "无标题"

                # First external link on the page is taken as the DOI link.
                doi_tag = soup.select_one(".usa-link.usa-link--external")
                doi_link = doi_tag.get("href") if doi_tag else "无链接"

                print(f"📄 文章标题: {paper_title}")
                print(f"🔗 DOI 链接: {doi_link}")

                figures = soup.find_all("figure")
                if not figures:
                    print(f"⚠️ 页面中未找到 <figure> 标签")
                    continue

                for fig_index, figure in enumerate(figures, start=1):
                    img_tag = figure.find("img")
                    caption_tag = figure.find("figcaption")

                    if not img_tag or not img_tag.get("src"):
                        print(f"⚠️ 第 {fig_index} 个 <figure> 无有效图片，跳过")
                        continue

                    # Resolve relative or protocol-relative sources against the
                    # page actually loaded (after redirects); absolute URLs
                    # pass through unchanged.
                    img_url = urljoin(driver.current_url, img_tag["src"])
                    caption_text = caption_tag.get_text(separator=" ", strip=True) if caption_tag else "无描述"

                    image_name = f"{index}_{fig_index}.jpg"
                    image_path = os.path.join(output_folder, image_name)
                    download_image(img_url, image_path)

                    csv_name = f"{index}_{fig_index}.csv"
                    csv_path = os.path.join(output_folder, csv_name)

                    with open(csv_path, "w", encoding="utf-8", newline='') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=["image_name", "caption_text", "paper_title", "doi_link"])
                        writer.writeheader()
                        writer.writerow({
                            "image_name": image_name,
                            "caption_text": caption_text,
                            "paper_title": paper_title,
                            "doi_link": doi_link
                        })

                    print(f"📄 CSV 已保存: {csv_path}")

            except Exception as e:
                # Best effort: report the failing page and continue with the next one.
                print(f"❌ 处理 {url} 时出错: {e}")
    finally:
        # Always release the browser, including on errors and Ctrl-C.
        if driver is not None:
            driver.quit()

    print("\n🎉 所有任务完成！")

if __name__ == '__main__':
    # Step 3 entry point: input is the step-2 JSON, output is the image/CSV folder.
    json_file = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed_central.json"
    output_folder = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed_central"
    extract_and_download_images(json_file=json_file, output_folder=output_folder)



🔍 正在访问第 1 个链接: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC5426794
📄 文章标题: Semaphorin-1a preventsDrosophilaolfactory projection neuron dendrites from mis-targeting into select antennal lobe regions
🔗 DOI 链接: https://doi.org/10.1371/journal.pgen.1006751
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed_central/1_1.jpg
📄 CSV 已保存: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed_central/1_1.csv
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed_central/1_2.jpg
📄 CSV 已保存: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed_central/1_2.csv
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed_central/1_3.jpg
📄 CSV 已保存: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed_central/1_3.csv
✅ 下载成功: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed_central/1_4.jpg
📄 CSV 已保存: /Users/zz/Desktop/reser/ruiming/pathology/glomerulus/test_pubmed_central/1_4.csv
✅ 下载成功: /Users/zz/Desk

KeyboardInterrupt: 