In [None]:
import os
import time
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

In [None]:
import os
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def extract_href_from_links(csv_file_path, output_json):
    """Collect the "PubMed Central" link from every page listed in a CSV file.

    Reads the ``Links`` column of ``csv_file_path``, opens each URL in a
    headless Chrome session, waits up to 10 seconds for an anchor whose text
    contains "PubMed Central" and records its ``href``. The results are
    written to ``output_json`` as a list of
    ``{"Original_Link": ..., "Extracted_Href": ...}`` objects. A page whose
    link could not be extracted is still recorded, with ``Extracted_Href``
    set to ``null``.

    Args:
        csv_file_path: Path to a CSV file that has a ``Links`` column.
        output_json: Path of the JSON file to write. A missing parent
            directory is created first.
    """
    # 1. Load the URLs, dropping empty cells.
    df = pd.read_csv(csv_file_path)
    urls = df["Links"].dropna().tolist()

    # 2. Configure headless Chrome.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # Chrome's switch takes "width,height"; the "1920x1080" form is not the
    # documented format.
    options.add_argument('--window-size=1920,1080')

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # 3. One record per input URL, in input order.
    extracted_data = []
    link_locator = (By.XPATH, '//a[contains(text(), "PubMed Central")]')

    # 4. Visit every URL. The finally clause guarantees the browser process is
    #    shut down even if the loop is interrupted or raises unexpectedly.
    try:
        for i, url in enumerate(urls, start=1):
            try:
                print(f"正在处理第 {i} 个链接: {url}")
                driver.get(url)

                # Block until the PubMed Central anchor is present in the DOM.
                pubmed_link_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(link_locator)
                )

                extracted_href = pubmed_link_element.get_attribute("href")
                print(f"✅ 第 {i} 个链接提取的 PubMed Central href: {extracted_href}")

            except Exception as e:
                # Best effort: one bad page must not abort the whole run.
                print(f"⚠️ 处理第 {i} 个链接时出错: {e}")
                extracted_href = None

            extracted_data.append({"Original_Link": url, "Extracted_Href": extracted_href})
    finally:
        # 5. Always release the browser.
        driver.quit()

    # 6. Create the output directory. dirname() is "" for a bare file name,
    #    and os.makedirs("") would raise, so only call it when needed.
    output_dir = os.path.dirname(output_json)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # 7. Write human-readable JSON, keeping non-ASCII characters as-is.
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, indent=4, ensure_ascii=False)

    print(f"✅ 处理完成，结果已保存到 {output_json}")

if __name__ == "__main__":
    # Script entry point: input CSV of Open-i result links and the JSON file
    # that receives the extracted PubMed Central links.
    csv_file_path = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/all_links.csv"
    output_json = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed_links.json"

    extract_href_from_links(
        csv_file_path=csv_file_path,
        output_json=output_json,
    )


正在处理第 1 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC5426794_pgen.1006751.g004&query=glomerulus&it=xg&req=4&npos=1
✅ 第 1 个链接提取的 PubMed Central href: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC5426794
正在处理第 2 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC3412803_pone.0042814.g001&query=glomerulus&it=xg&req=4&npos=2
✅ 第 2 个链接提取的 PubMed Central href: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3412803
正在处理第 3 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC4928931_pone.0157497.g003&query=glomerulus&it=xg&req=4&npos=3
✅ 第 3 个链接提取的 PubMed Central href: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4928931
正在处理第 4 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC5133832_ijms-17-01831-g005&query=glomerulus&it=xg&req=4&npos=4
✅ 第 4 个链接提取的 PubMed Central href: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC5133832
正在处理第 5 个链接: https://openi.nlm.nih.gov/detailedresult?img=PMC4398543_pone.0116700.g001&query=glomerulus&it=xg&req=4&npos=5
✅ 第 5 个链接提取的 PubMed Central href: http:/

In [None]:
import os
import json
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def download_image(image_url, save_path, timeout=30):
    """Download ``image_url`` and store it at ``save_path``.

    Failures are reported on stdout and never raised, so a caller looping
    over many images keeps going after a bad one.

    Args:
        image_url: Absolute URL of the image to fetch.
        save_path: Destination file path; its directory must already exist.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.
    """
    # Stream into a sibling temporary file so that an interrupted transfer
    # never leaves a truncated image under the final name.
    partial_path = f"{save_path}.part"
    try:
        # The context manager returns the streamed connection to the pool.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"⚠️ 无法下载: {image_url}")
                return
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        os.replace(partial_path, save_path)
        print(f"✅ 下载成功: {save_path}")
    except Exception as e:
        print(f"❌ 下载失败: {image_url} - {e}")
        # Clean up whatever was written before the failure; the temporary
        # file may not exist if the request itself failed.
        try:
            os.remove(partial_path)
        except OSError:
            pass

def extract_and_download_images(json_file, output_folder):
    """Download the figure images from every page listed in a JSON file.

    ``json_file`` must contain a list of objects; each object's
    ``"Extracted_Href"`` value is opened in headless Chrome. Images inside
    the page's ``.img-box`` figure containers are preferred; if none are
    found, every ``<img>`` whose ``src`` ends in ``.jpg`` is used instead.
    Files are saved in ``output_folder`` as ``{entry}_{image}.jpg``, both
    numbered from 1.

    Args:
        json_file: Path to the JSON list of link records.
        output_folder: Directory that receives the images; created if
            missing.
    """
    # Function-scope import: the file's import cells do not provide urljoin.
    from urllib.parse import urljoin

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    os.makedirs(output_folder, exist_ok=True)

    # Configure headless Chrome.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # Chrome's switch takes "width,height"; the "1920x1080" form is not the
    # documented format.
    options.add_argument('--window-size=1920,1080')

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # The finally clause guarantees the browser process is shut down even if
    # the loop is interrupted or an entry is malformed.
    try:
        for index, item in enumerate(data, start=1):
            url = item.get("Extracted_Href")
            if not url:
                print(f"❌ 跳过 {index}，因为没有可用的链接")
                continue

            try:
                print(f"🔍 访问第 {index} 个链接: {url}")
                driver.get(url)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Preferred source: images inside the figure containers.
                images = soup.select('.img-box.line-height-none.margin-x-neg-2.tablet\\:margin-x-0.text-center img')
                image_urls = [img['src'] for img in images if img.get('src')]

                # Fallback: any .jpg image anywhere on the page.
                if not image_urls:
                    print("⚠️ 未找到指定 class 的图片，在整个页面中搜索 .jpg 图片...")
                    image_urls = [img['src'] for img in soup.find_all('img') if img.get('src', '').endswith('.jpg')]

                # src attributes may be relative or protocol-relative, which
                # requests rejects; resolve them against the page actually
                # loaded (after any redirect). Absolute URLs pass unchanged.
                page_url = driver.current_url
                for img_index, img_src in enumerate(image_urls, start=1):
                    image_name = f"{index}_{img_index}.jpg"
                    image_path = os.path.join(output_folder, image_name)
                    download_image(urljoin(page_url, img_src), image_path)

            except Exception as e:
                # Best effort: one bad page must not abort the whole run.
                print(f"❌ 处理 {url} 时出错: {e}")
    finally:
        driver.quit()

    print("🎉 所有任务完成！")

if __name__ == "__main__":
    # Script entry point: JSON file of PubMed Central links and the folder
    # that receives the downloaded images.
    json_file = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/pubmed_links.json"
    output_folder = "/Users/zz/Desktop/reser/ruiming/pathology/glomerulus/images"

    extract_and_download_images(
        json_file=json_file,
        output_folder=output_folder,
    )
