# 旧版爬虫(臃肿无效)

In [1]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup, NavigableString, Tag
from urllib.parse import urljoin, urlparse
import concurrent.futures
import threading
import logging
import json
import os
from tqdm.notebook import tqdm
import time

# --- Configuration ---
START_URL = "https://webpath.med.utah.edu/ORGAN.html"
OUTPUT_FILE = "webpath_system_path_archive.json"
MAX_WORKERS = 30
RETRY_COUNT = 3
RETRY_BACKOFF = 0.5

# --- Logging configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s')

# --- Shared state (mutations are meant to happen while holding `lock`) ---
# Every URL seen so far, whether its fetch succeeded or failed; used to avoid
# queueing the same task twice.
all_known_urls = set()
# Only the page records produced by the current run.
current_run_results = []
# Single lock shared by the structures above.
lock = threading.Lock()

# --- 核心功能函数 (与之前版本相同) ---
def setup_session_with_retries():
    """Build a requests Session with automatic retries and a worker-sized pool.

    Returns:
        A ``requests.Session`` whose http:// and https:// traffic goes through
        one shared ``HTTPAdapter`` and which sends a desktop-browser
        User-Agent header.
    """
    # Retry idempotent requests on transient 5xx responses, RETRY_COUNT times
    # at most, with a back-off derived from RETRY_BACKOFF.
    retry_policy = Retry(
        total=RETRY_COUNT,
        backoff_factor=RETRY_BACKOFF,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],
    )

    # Size the connection pool to the number of worker threads so that the
    # workers sharing this session do not exhaust it.
    pooled_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=MAX_WORKERS,
        pool_maxsize=MAX_WORKERS,
    )

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, pooled_adapter)

    # Present a regular browser User-Agent.
    http.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

    return http

def html_to_json_structure(element):
    """Recursively convert a BeautifulSoup node into plain dicts and strings.

    Args:
        element: a ``NavigableString``, a ``Tag`` or anything else.

    Returns:
        - for a text node: its whitespace-stripped text, or None when empty;
        - for a tag: ``{"tag": name, "attrs": attrs}`` plus a ``"children"``
          list when at least one child converted to a non-empty value;
        - None for any other input.
    """
    if isinstance(element, NavigableString):
        stripped = element.strip()
        return stripped or None

    if not isinstance(element, Tag):
        return None

    node = {"tag": element.name, "attrs": element.attrs}
    converted = [html_to_json_structure(child) for child in element.contents]
    # Drop empty text nodes and unsupported children.
    kept = [entry for entry in converted if entry]
    if kept:
        node["children"] = kept
    return node

# --- 断点续爬的核心逻辑 ---
def load_and_prepare_state():
    """Load the previous archive and split it into finished and retryable pages.

    Reads OUTPUT_FILE, which is expected to hold a JSON list of page records
    (dicts with at least a ``url`` key). Every recorded URL is added to the
    module-level ``all_known_urls`` set. Records whose ``status_code`` is
    ``'Error'`` or whose ``error`` field is truthy are scheduled for a retry;
    all other records are kept as successful.

    A missing, unreadable, undecodable or structurally unexpected file is
    treated as "no previous state": an error is logged (except for the missing
    file case) and ``all_known_urls`` is left untouched.

    Returns:
        tuple: ``(tasks_to_retry, successful_data)`` where ``tasks_to_retry``
        is a set of URLs to fetch again and ``successful_data`` maps each
        successfully crawled URL to its stored record.
    """
    if not os.path.exists(OUTPUT_FILE):
        logging.info("No previous data file found. Starting a new crawl.")
        return set(), {}

    logging.info(f"Found existing data file '{OUTPUT_FILE}'. Loading state...")
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            previous_data = json.load(f)
        if not isinstance(previous_data, list):
            raise ValueError(
                f"expected a JSON list of page records, got {type(previous_data).__name__}"
            )
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError as
        # well as the shape check above.
        logging.error(f"Could not load or parse previous data file: {e}. Starting fresh.")
        return set(), {}

    tasks_to_retry = set()
    successful_data = {}

    # The file is fully validated before any shared state is touched, so a
    # bad file can never leave all_known_urls half-populated.
    for item in previous_data:
        if not isinstance(item, dict):
            continue  # ignore stray non-record entries
        url = item.get('url')
        if not isinstance(url, str) or not url:
            continue

        # Remember every URL that was ever recorded.
        all_known_urls.add(url)

        # Failed pages go to the retry queue, successful ones are kept.
        if item.get('status_code') == 'Error' or item.get('error'):
            tasks_to_retry.add(url)
        else:
            successful_data[url] = item

    logging.info(f"Loaded {len(all_known_urls)} known URLs. "
                 f"Found {len(successful_data)} successful pages and {len(tasks_to_retry)} pages to retry.")
    return tasks_to_retry, successful_data

def crawl_page(url, session):
    """Fetch one page, archive its structure and return its same-site links.

    The resulting record is appended to the module-level
    ``current_run_results`` list while holding ``lock``.

    Args:
        url: absolute URL of the page to fetch.
        session: a ``requests.Session``-like object providing ``get``.

    Returns:
        list: absolute URLs (fragment removed) of every ``<a href>`` on the
        page whose host equals the host of START_URL. Empty when the fetch
        failed or the response is not ``text/html``. May contain duplicates.
    """
    try:
        logging.info(f"Processing -> {url}")
        response = session.get(url, timeout=15)
        response.raise_for_status()

        page_data = {
            'url': url, 'status_code': response.status_code,
            'content_type': response.headers.get('content-type', '').lower(),
            'crawl_timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
            'is_leaf_node': True, 'html_structure': None, 'error': None
        }

        new_urls_to_crawl = []
        if 'text/html' in page_data['content_type']:
            soup = BeautifulSoup(response.text, 'html.parser')
            page_data['html_structure'] = html_to_json_structure(soup)

            base_domain = urlparse(START_URL).netloc
            for link in soup.find_all('a', href=True):
                # urljoin/urlparse raise ValueError on malformed hrefs (e.g.
                # an unterminated IPv6 literal). Left uncaught, one bad link
                # would escape this function and abort the whole crawl.
                try:
                    absolute_url = urljoin(url, link['href']).split('#')[0]
                    same_site = urlparse(absolute_url).netloc == base_domain
                except ValueError as e:
                    logging.warning(f"Skipping malformed link {link['href']!r} on {url}: {e}")
                    continue
                if same_site:
                    new_urls_to_crawl.append(absolute_url)

        # A page counts as a leaf when every link on it is already known at
        # the time of this check.
        if any(u not in all_known_urls for u in new_urls_to_crawl):
            page_data['is_leaf_node'] = False

    except requests.RequestException as e:
        logging.error(f"Failed to fetch {url} after retries: {e}")
        page_data = {
            'url': url, 'status_code': 'Error', 'error': str(e),
            'crawl_timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
        }
        new_urls_to_crawl = []

    with lock:
        current_run_results.append(page_data)

    return new_urls_to_crawl

# --- 主执行逻辑 ---
def main():
    """Run one crawl session: resume from the archive, crawl, then save.

    Steps:
      1. Load the previous archive (if any) and queue its failed URLs; queue
         START_URL as well when it has never been recorded.
      2. Crawl in batches on a thread pool. Each batch consists of the links
         discovered by the previous batch that were not known yet.
      3. Merge this session's records over the previously successful ones
         and rewrite OUTPUT_FILE.

    Returns None. Returns early, without writing anything, when no URL is
    queued.
    """
    start_time = time.time()

    # 1. Load the previous state and prepare the initial tasks.
    tasks_to_retry, successful_data = load_and_prepare_state()
    tasks_queue = set(tasks_to_retry)

    # Queue the start URL when it has never been processed.
    if START_URL not in all_known_urls:
        tasks_queue.add(START_URL)

    if not tasks_queue:
        logging.info("No new or failed URLs to process. Exiting.")
        return

    # Mark the seeds as known before any worker starts. Otherwise a page
    # linking back to START_URL re-queues it and it is fetched twice.
    all_known_urls.update(tasks_queue)

    # 2. Process the queue concurrently; the session is closed on exit.
    with setup_session_with_retries() as session:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            with tqdm(total=len(tasks_queue), desc="Processing Pages") as pbar:
                while tasks_queue:
                    future_to_url = {executor.submit(crawl_page, url, session): url for url in tasks_queue}
                    tasks_queue.clear()  # the current batch has been submitted

                    for future in concurrent.futures.as_completed(future_to_url):
                        pbar.update(1)
                        try:
                            new_links = future.result()
                        except Exception as e:
                            # result() re-raises whatever the worker raised.
                            # Record the page as failed (so that it is retried
                            # on the next run) instead of losing the session.
                            failed_url = future_to_url[future]
                            logging.exception(f"Unexpected error while processing {failed_url}")
                            with lock:
                                current_run_results.append({
                                    'url': failed_url, 'status_code': 'Error', 'error': repr(e),
                                    'crawl_timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
                                })
                            continue

                        with lock:
                            for link in new_links:
                                if link not in all_known_urls:
                                    all_known_urls.add(link)
                                    tasks_queue.add(link)
                                    pbar.total += 1  # grow the progress bar dynamically

    logging.info(f"Crawl finished for this session. Processed {len(current_run_results)} pages.")

    # 3. Merge old and new data and save.
    logging.info(f"Merging new results with {len(successful_data)} previously successful pages...")

    # Records from this run override older ones for the same URL, so a
    # successful retry replaces the earlier failure.
    for item in current_run_results:
        successful_data[item['url']] = item

    final_data = list(successful_data.values())

    logging.info(f"Total pages in archive: {len(final_data)}. Saving to {OUTPUT_FILE}...")
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_data, f, indent=2, ensure_ascii=False)
        logging.info("Successfully updated data file.")
    except IOError as e:
        logging.error(f"Failed to save file: {e}")

    end_time = time.time()
    print("-" * 50)
    print(f"Total time for this session: {end_time - start_time:.2f} seconds")
    print(f"Total pages in final archive: {len(final_data)}")
    print(f"Data saved to: {OUTPUT_FILE}")
    print("-" * 50)

# --- Run the main function ---
main()

2025-10-26 01:57:36,037 - MainThread - INFO - No previous data file found. Starting a new crawl.


Processing Pages:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-26 01:57:36,065 - ThreadPoolExecutor-0_0 - INFO - Processing -> https://webpath.med.utah.edu/ORGAN.html
2025-10-26 01:57:36,902 - ThreadPoolExecutor-0_1 - INFO - Processing -> https://webpath.med.utah.edu/HEMEHTML/HEMEIDX.html
2025-10-26 01:57:36,902 - ThreadPoolExecutor-0_0 - INFO - Processing -> https://webpath.med.utah.edu/LUNGHTML/LUNGIDX.html
2025-10-26 01:57:36,902 - ThreadPoolExecutor-0_2 - INFO - Processing -> https://webpath.med.utah.edu/CNSHTML/CNSIDX.html
2025-10-26 01:57:36,903 - ThreadPoolExecutor-0_3 - INFO - Processing -> https://webpath.med.utah.edu/GIHTML/GIIDX.html
2025-10-26 01:57:36,903 - ThreadPoolExecutor-0_4 - INFO - Processing -> https://webpath.med.utah.edu/MALEHTML/MALEIDX.html
2025-10-26 01:57:36,903 - ThreadPoolExecutor-0_5 - INFO - Processing -> https://webpath.med.utah.edu/RENAHTML/RENALIDX.html
2025-10-26 01:57:36,903 - ThreadPoolExecutor-0_6 - INFO - Processing -> https://webpath.med.utah.edu/FEMHTML/FEMIDX.html
2025-10-26 01:57:36,903 - ThreadPo

--------------------------------------------------
Total time for this session: 598.81 seconds
Total pages in final archive: 3005
Data saved to: webpath_system_path_archive.json
--------------------------------------------------


# 新版

In [1]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup, NavigableString, Tag
from urllib.parse import urljoin, urlparse
import concurrent.futures
import threading
import logging
import json
import os
from tqdm.notebook import tqdm
import time

# --- Configuration ---
START_URL = "https://webpath.med.utah.edu/ORGAN.html"
OUTPUT_FILE = "webpath_image_data_robust.json" # new output file name
MAX_WORKERS = 30
RETRY_COUNT = 3
RETRY_BACKOFF = 0.5

# --- Logging configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s')

# --- Shared state (mutations are meant to happen while holding `lock`) ---
all_known_urls = set()
current_run_results = []
lock = threading.Lock()

# --- 核心功能函数 ---

def setup_session_with_retries():
    """Build a requests Session with automatic retries and a worker-sized pool.

    Returns:
        A ``requests.Session`` whose http:// and https:// traffic goes through
        one shared ``HTTPAdapter`` and which sends a desktop-browser
        User-Agent header.
    """
    # Retry idempotent requests on transient 5xx responses.
    retry_policy = Retry(
        total=RETRY_COUNT,
        backoff_factor=RETRY_BACKOFF,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],
    )
    # Size the connection pool to the number of worker threads.
    pooled_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=MAX_WORKERS,
        pool_maxsize=MAX_WORKERS,
    )

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, pooled_adapter)
    http.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    return http

def extract_image_data_robust(html_content, base_url):
    """Extract image blocks (a picture plus its description tables) from a page.

    Only the direct children of ``<body>`` are inspected. An ``<img>`` whose
    ``src`` contains "jpeg" (case-insensitive) opens a new block; every
    following ``<table>`` that does not contain a GIF image contributes its
    text to that block until the next matching image.

    Args:
        html_content: HTML source of the page.
        base_url: URL of the page, used to resolve relative image paths.

    Returns:
        list of ``{'image_url': str, 'descriptions': [str, ...]}`` dicts in
        document order; empty when the page has no ``<body>``.
    """
    body = BeautifulSoup(html_content, 'html.parser').find('body')
    if not body:
        return []

    def has_gif_src(src):
        # Tables holding a GIF are treated as navigation bars, not captions.
        return src and '.gif' in src.lower()

    blocks = []
    open_block = None

    for node in body.children:
        # Skip text nodes such as the line breaks between tags.
        if not isinstance(node, Tag):
            continue

        tag_name = node.name

        if tag_name == 'img' and 'jpeg' in node.get('src', '').lower():
            # A new picture closes the block that was being filled.
            if open_block:
                blocks.append(open_block)
            open_block = {
                'image_url': urljoin(base_url, node['src']),
                'descriptions': []
            }
            continue

        # Only tables that follow a picture can describe it.
        if tag_name != 'table' or not open_block:
            continue
        if node.find('img', src=has_gif_src):
            continue

        caption = node.get_text(separator=' ', strip=True)
        if caption:  # never store empty descriptions
            open_block['descriptions'].append(caption)

    # Flush the block that was still open when the loop ended.
    if open_block:
        blocks.append(open_block)

    return blocks


def load_and_prepare_state():
    """Load the previous output file and split it into finished and retryable pages.

    Reads OUTPUT_FILE, which is expected to hold a JSON list of page records
    (dicts with at least a ``url`` key). Every recorded URL is added to the
    module-level ``all_known_urls`` set. Records whose ``status_code`` is
    ``'Error'`` or whose ``error`` field is truthy are scheduled for a retry;
    all other records are kept as successful.

    A missing, unreadable, undecodable or structurally unexpected file is
    treated as "no previous state": an error is logged (except for the missing
    file case) and ``all_known_urls`` is left untouched.

    Returns:
        tuple: ``(tasks_to_retry, successful_data)`` where ``tasks_to_retry``
        is a set of URLs to fetch again and ``successful_data`` maps each
        successfully crawled URL to its stored record.
    """
    if not os.path.exists(OUTPUT_FILE):
        logging.info("No previous data file found. Starting a new crawl.")
        return set(), {}

    logging.info(f"Found existing data file '{OUTPUT_FILE}'. Loading state...")
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            previous_data = json.load(f)
        if not isinstance(previous_data, list):
            raise ValueError(
                f"expected a JSON list of page records, got {type(previous_data).__name__}"
            )
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError as
        # well as the shape check above.
        logging.error(f"Could not load or parse previous data file: {e}. Starting fresh.")
        return set(), {}

    tasks_to_retry = set()
    successful_data = {}

    # The file is fully validated before any shared state is touched, so a
    # bad file can never leave all_known_urls half-populated.
    for item in previous_data:
        if not isinstance(item, dict):
            continue  # ignore stray non-record entries
        url = item.get('url')
        if not isinstance(url, str) or not url:
            continue
        all_known_urls.add(url)
        if item.get('status_code') == 'Error' or item.get('error'):
            tasks_to_retry.add(url)
        else:
            successful_data[url] = item

    logging.info(f"Loaded {len(all_known_urls)} URLs. Found {len(successful_data)} successful pages and {len(tasks_to_retry)} to retry.")
    return tasks_to_retry, successful_data


def crawl_page(url, session):
    """Fetch one page, extract its image blocks and return its same-site links.

    The resulting record is appended to the module-level
    ``current_run_results`` list while holding ``lock``.

    Args:
        url: absolute URL of the page to fetch.
        session: a ``requests.Session``-like object providing ``get``.

    Returns:
        list: absolute URLs (fragment removed) of every ``<a href>`` on the
        page whose host equals the host of START_URL. Empty when the fetch
        failed or the response is not HTML. May contain duplicates.
    """
    try:
        response = session.get(url, timeout=15)
        response.raise_for_status()

        # Same-site links can point at images or other binary files; decoding
        # and parsing those as HTML is wasted work. A missing content-type
        # header is still treated as HTML.
        content_type = response.headers.get('content-type', '').lower()
        is_html = not content_type or 'html' in content_type
        html_content = response.text if is_html else ''

        extracted_items = []
        if 'jpeg' in html_content.lower():
            # Cheap pre-check passed: run the block extractor, which parses
            # the markup on its own.
            extracted_items = extract_image_data_robust(html_content, url)

        page_data = {
            'url': url,
            'status_code': response.status_code,
            'extracted_data': extracted_items,
            'error': None
        }

        soup = BeautifulSoup(html_content, 'html.parser')
        new_urls_to_crawl = []
        base_domain = urlparse(START_URL).netloc
        for link in soup.find_all('a', href=True):
            # urljoin/urlparse raise ValueError on malformed hrefs (e.g. an
            # unterminated IPv6 literal). Left uncaught, one bad link would
            # escape this function and abort the whole crawl.
            try:
                absolute_url = urljoin(url, link['href']).split('#')[0]
                same_site = urlparse(absolute_url).netloc == base_domain
            except ValueError as e:
                logging.warning(f"Skipping malformed link {link['href']!r} on {url}: {e}")
                continue
            if same_site:
                new_urls_to_crawl.append(absolute_url)

    except requests.RequestException as e:
        logging.error(f"Failed to fetch {url} after retries: {e}")
        page_data = {'url': url, 'status_code': 'Error', 'error': str(e), 'extracted_data': []}
        new_urls_to_crawl = []

    with lock:
        current_run_results.append(page_data)

    return new_urls_to_crawl


# --- 主执行逻辑 (无变化) ---
def main():
    """Run one crawl session: resume from the output file, crawl, then save.

    Steps:
      1. Load the previous output (if any) and queue its failed URLs; queue
         START_URL as well when it has never been recorded.
      2. Crawl in batches on a thread pool. Each batch consists of the links
         discovered by the previous batch that were not known yet.
      3. Merge this session's records over the previously successful ones
         and rewrite OUTPUT_FILE, keeping only pages that carry extracted
         data or an error.

    Returns None. Returns early, without writing anything, when no URL is
    queued.
    """
    start_time = time.time()
    tasks_to_retry, successful_data = load_and_prepare_state()
    tasks_queue = set(tasks_to_retry)
    if START_URL not in all_known_urls:
        tasks_queue.add(START_URL)

    if not tasks_queue:
        logging.info("No new or failed URLs to process. The data is up-to-date.")
        return

    # Mark the seeds as known before any worker starts. Otherwise a page
    # linking back to START_URL re-queues it and it is fetched twice.
    all_known_urls.update(tasks_queue)

    # The session is closed when the block exits.
    with setup_session_with_retries() as session:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            with tqdm(total=len(tasks_queue), desc="Processing Pages") as pbar:
                while tasks_queue:
                    future_to_url = {executor.submit(crawl_page, url, session): url for url in tasks_queue}
                    tasks_queue.clear()  # the current batch has been submitted

                    for future in concurrent.futures.as_completed(future_to_url):
                        pbar.update(1)
                        try:
                            new_links = future.result()
                        except Exception as e:
                            # result() re-raises whatever the worker raised.
                            # Record the page as failed (so that it is retried
                            # on the next run) instead of losing the session.
                            failed_url = future_to_url[future]
                            logging.exception(f"Unexpected error while processing {failed_url}")
                            with lock:
                                current_run_results.append(
                                    {'url': failed_url, 'status_code': 'Error', 'error': repr(e), 'extracted_data': []}
                                )
                            continue

                        with lock:
                            for link in new_links:
                                if link not in all_known_urls:
                                    all_known_urls.add(link)
                                    tasks_queue.add(link)
                                    pbar.total += 1  # grow the progress bar dynamically

    logging.info(f"Crawl finished for this session. Processed {len(current_run_results)} pages.")

    # Merge old and new data; records from this run override older ones.
    logging.info(f"Merging new results with {len(successful_data)} previously processed pages...")
    for item in current_run_results:
        successful_data[item['url']] = item

    # NOTE(review): pages without images are dropped from the file, so
    # load_and_prepare_state() cannot mark them as known and they are fetched
    # again on every resumed run. Persisting them would fix that but changes
    # the contents of the output file.
    final_data = [item for item in successful_data.values() if item.get('extracted_data') or item.get('error')]

    logging.info(f"Found {len(final_data)} pages with relevant data or errors. Saving to {OUTPUT_FILE}...")
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_data, f, indent=2, ensure_ascii=False)
        logging.info("Successfully updated data file.")
    except IOError as e:
        logging.error(f"Failed to save file: {e}")

    end_time = time.time()
    print("-" * 50)
    print(f"Total time for this session: {end_time - start_time:.2f} seconds")
    print(f"Total pages with data/errors in final archive: {len(final_data)}")
    print(f"Data saved to: {OUTPUT_FILE}")
    print("-" * 50)

# --- Run the main function ---
main()

2025-10-26 02:18:29,639 - MainThread - INFO - No previous data file found. Starting a new crawl.


Processing Pages:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-26 02:22:31,205 - ThreadPoolExecutor-0_1 - ERROR - Failed to fetch https://webpath.med.utah.edu/HISTHTML/ANATOMY/anatquiz/quizaidx.htm after retries: 404 Client Error: Not Found for url: https://webpath.med.utah.edu/HISTHTML/ANATOMY/anatquiz/quizaidx.htm
2025-10-26 02:24:19,809 - MainThread - INFO - Crawl finished for this session. Processed 3006 pages.
2025-10-26 02:24:19,811 - MainThread - INFO - Merging new results with 0 previously processed pages...
2025-10-26 02:24:19,815 - MainThread - INFO - Found 2389 pages with relevant data or errors. Saving to webpath_image_data_robust.json...
2025-10-26 02:24:19,852 - MainThread - INFO - Successfully updated data file.


--------------------------------------------------
Total time for this session: 350.21 seconds
Total pages with data/errors in final archive: 2389
Data saved to: webpath_image_data_robust.json
--------------------------------------------------


In [2]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup, NavigableString, Tag
from urllib.parse import urljoin, urlparse
import concurrent.futures
import threading
import logging
import json
import os
from tqdm.notebook import tqdm
import time

# --- Configuration ---
START_URL = "https://webpath.med.utah.edu/ORGAN.html"
OUTPUT_FILE = "webpath_image_data_robust.json" # new output file name
MAX_WORKERS = 30
RETRY_COUNT = 3
RETRY_BACKOFF = 0.5

# --- Logging configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s')

# --- Shared state (mutations are meant to happen while holding `lock`) ---
all_known_urls = set()
current_run_results = []
lock = threading.Lock()

# --- 核心功能函数 ---

def setup_session_with_retries():
    """Build a requests Session with automatic retries and a worker-sized pool.

    Returns:
        A ``requests.Session`` whose http:// and https:// traffic goes through
        one shared ``HTTPAdapter`` and which sends a desktop-browser
        User-Agent header.
    """
    # Retry idempotent requests on transient 5xx responses.
    retry_policy = Retry(
        total=RETRY_COUNT,
        backoff_factor=RETRY_BACKOFF,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],
    )
    # Size the connection pool to the number of worker threads.
    pooled_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=MAX_WORKERS,
        pool_maxsize=MAX_WORKERS,
    )

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, pooled_adapter)
    http.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    return http

def extract_image_data_robust(html_content, base_url):
    """Extract image blocks (a picture plus its description tables) from a page.

    Only the direct children of ``<body>`` are inspected. An ``<img>`` whose
    ``src`` contains "jpeg" (case-insensitive) opens a new block; every
    following ``<table>`` that does not contain a GIF image contributes its
    text to that block until the next matching image.

    Args:
        html_content: HTML source of the page.
        base_url: URL of the page, used to resolve relative image paths.

    Returns:
        list of ``{'image_url': str, 'descriptions': [str, ...]}`` dicts in
        document order; empty when the page has no ``<body>``.
    """
    body = BeautifulSoup(html_content, 'html.parser').find('body')
    if not body:
        return []

    def has_gif_src(src):
        # Tables holding a GIF are treated as navigation bars, not captions.
        return src and '.gif' in src.lower()

    blocks = []
    open_block = None

    for node in body.children:
        # Skip text nodes such as the line breaks between tags.
        if not isinstance(node, Tag):
            continue

        tag_name = node.name

        if tag_name == 'img' and 'jpeg' in node.get('src', '').lower():
            # A new picture closes the block that was being filled.
            if open_block:
                blocks.append(open_block)
            open_block = {
                'image_url': urljoin(base_url, node['src']),
                'descriptions': []
            }
            continue

        # Only tables that follow a picture can describe it.
        if tag_name != 'table' or not open_block:
            continue
        if node.find('img', src=has_gif_src):
            continue

        caption = node.get_text(separator=' ', strip=True)
        if caption:  # never store empty descriptions
            open_block['descriptions'].append(caption)

    # Flush the block that was still open when the loop ended.
    if open_block:
        blocks.append(open_block)

    return blocks


def load_and_prepare_state():
    """Load the previous output file and split it into finished and retryable pages.

    Reads OUTPUT_FILE, which is expected to hold a JSON list of page records
    (dicts with at least a ``url`` key). Every recorded URL is added to the
    module-level ``all_known_urls`` set. Records whose ``status_code`` is
    ``'Error'`` or whose ``error`` field is truthy are scheduled for a retry;
    all other records are kept as successful.

    A missing, unreadable, undecodable or structurally unexpected file is
    treated as "no previous state": an error is logged (except for the missing
    file case) and ``all_known_urls`` is left untouched.

    Returns:
        tuple: ``(tasks_to_retry, successful_data)`` where ``tasks_to_retry``
        is a set of URLs to fetch again and ``successful_data`` maps each
        successfully crawled URL to its stored record.
    """
    if not os.path.exists(OUTPUT_FILE):
        logging.info("No previous data file found. Starting a new crawl.")
        return set(), {}

    logging.info(f"Found existing data file '{OUTPUT_FILE}'. Loading state...")
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            previous_data = json.load(f)
        if not isinstance(previous_data, list):
            raise ValueError(
                f"expected a JSON list of page records, got {type(previous_data).__name__}"
            )
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError as
        # well as the shape check above.
        logging.error(f"Could not load or parse previous data file: {e}. Starting fresh.")
        return set(), {}

    tasks_to_retry = set()
    successful_data = {}

    # The file is fully validated before any shared state is touched, so a
    # bad file can never leave all_known_urls half-populated.
    for item in previous_data:
        if not isinstance(item, dict):
            continue  # ignore stray non-record entries
        url = item.get('url')
        if not isinstance(url, str) or not url:
            continue
        all_known_urls.add(url)
        if item.get('status_code') == 'Error' or item.get('error'):
            tasks_to_retry.add(url)
        else:
            successful_data[url] = item

    logging.info(f"Loaded {len(all_known_urls)} URLs. Found {len(successful_data)} successful pages and {len(tasks_to_retry)} to retry.")
    return tasks_to_retry, successful_data


def crawl_page(url, session):
    """Fetch one page, extract its image blocks and return its same-site links.

    The resulting record is appended to the module-level
    ``current_run_results`` list while holding ``lock``.

    Args:
        url: absolute URL of the page to fetch.
        session: a ``requests.Session``-like object providing ``get``.

    Returns:
        list: absolute URLs (fragment removed) of every ``<a href>`` on the
        page whose host equals the host of START_URL. Empty when the fetch
        failed or the response is not HTML. May contain duplicates.
    """
    try:
        response = session.get(url, timeout=15)
        response.raise_for_status()

        # Same-site links can point at images or other binary files; decoding
        # and parsing those as HTML is wasted work. A missing content-type
        # header is still treated as HTML.
        content_type = response.headers.get('content-type', '').lower()
        is_html = not content_type or 'html' in content_type
        html_content = response.text if is_html else ''

        extracted_items = []
        if 'jpeg' in html_content.lower():
            # Cheap pre-check passed: run the block extractor, which parses
            # the markup on its own.
            extracted_items = extract_image_data_robust(html_content, url)

        page_data = {
            'url': url,
            'status_code': response.status_code,
            'extracted_data': extracted_items,
            'error': None
        }

        soup = BeautifulSoup(html_content, 'html.parser')
        new_urls_to_crawl = []
        base_domain = urlparse(START_URL).netloc
        for link in soup.find_all('a', href=True):
            # urljoin/urlparse raise ValueError on malformed hrefs (e.g. an
            # unterminated IPv6 literal). Left uncaught, one bad link would
            # escape this function and abort the whole crawl.
            try:
                absolute_url = urljoin(url, link['href']).split('#')[0]
                same_site = urlparse(absolute_url).netloc == base_domain
            except ValueError as e:
                logging.warning(f"Skipping malformed link {link['href']!r} on {url}: {e}")
                continue
            if same_site:
                new_urls_to_crawl.append(absolute_url)

    except requests.RequestException as e:
        logging.error(f"Failed to fetch {url} after retries: {e}")
        page_data = {'url': url, 'status_code': 'Error', 'error': str(e), 'extracted_data': []}
        new_urls_to_crawl = []

    with lock:
        current_run_results.append(page_data)

    return new_urls_to_crawl


# --- 主执行逻辑 (无变化) ---
def main():
    """Run one crawl session: resume from the output file, crawl, then save.

    Steps:
      1. Load the previous output (if any) and queue its failed URLs; queue
         START_URL as well when it has never been recorded.
      2. Crawl in batches on a thread pool. Each batch consists of the links
         discovered by the previous batch that were not known yet.
      3. Merge this session's records over the previously successful ones
         and rewrite OUTPUT_FILE, keeping only pages that carry extracted
         data or an error.

    Returns None. Returns early, without writing anything, when no URL is
    queued.
    """
    start_time = time.time()
    tasks_to_retry, successful_data = load_and_prepare_state()
    tasks_queue = set(tasks_to_retry)
    if START_URL not in all_known_urls:
        tasks_queue.add(START_URL)

    if not tasks_queue:
        logging.info("No new or failed URLs to process. The data is up-to-date.")
        return

    # Mark the seeds as known before any worker starts. Otherwise a page
    # linking back to START_URL re-queues it and it is fetched twice.
    all_known_urls.update(tasks_queue)

    # The session is closed when the block exits.
    with setup_session_with_retries() as session:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            with tqdm(total=len(tasks_queue), desc="Processing Pages") as pbar:
                while tasks_queue:
                    future_to_url = {executor.submit(crawl_page, url, session): url for url in tasks_queue}
                    tasks_queue.clear()  # the current batch has been submitted

                    for future in concurrent.futures.as_completed(future_to_url):
                        pbar.update(1)
                        try:
                            new_links = future.result()
                        except Exception as e:
                            # result() re-raises whatever the worker raised.
                            # Record the page as failed (so that it is retried
                            # on the next run) instead of losing the session.
                            failed_url = future_to_url[future]
                            logging.exception(f"Unexpected error while processing {failed_url}")
                            with lock:
                                current_run_results.append(
                                    {'url': failed_url, 'status_code': 'Error', 'error': repr(e), 'extracted_data': []}
                                )
                            continue

                        with lock:
                            for link in new_links:
                                if link not in all_known_urls:
                                    all_known_urls.add(link)
                                    tasks_queue.add(link)
                                    pbar.total += 1  # grow the progress bar dynamically

    logging.info(f"Crawl finished for this session. Processed {len(current_run_results)} pages.")

    # Merge old and new data; records from this run override older ones.
    logging.info(f"Merging new results with {len(successful_data)} previously processed pages...")
    for item in current_run_results:
        successful_data[item['url']] = item

    # NOTE(review): pages without images are dropped from the file, so
    # load_and_prepare_state() cannot mark them as known and they are fetched
    # again on every resumed run. Persisting them would fix that but changes
    # the contents of the output file.
    final_data = [item for item in successful_data.values() if item.get('extracted_data') or item.get('error')]

    logging.info(f"Found {len(final_data)} pages with relevant data or errors. Saving to {OUTPUT_FILE}...")
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_data, f, indent=2, ensure_ascii=False)
        logging.info("Successfully updated data file.")
    except IOError as e:
        logging.error(f"Failed to save file: {e}")

    end_time = time.time()
    print("-" * 50)
    print(f"Total time for this session: {end_time - start_time:.2f} seconds")
    print(f"Total pages with data/errors in final archive: {len(final_data)}")
    print(f"Data saved to: {OUTPUT_FILE}")
    print("-" * 50)

# --- Run the main function ---
main()

2025-10-26 02:28:00,344 - MainThread - INFO - Found existing data file 'webpath_image_data_robust.json'. Loading state...
2025-10-26 02:28:00,356 - MainThread - INFO - Loaded 2389 URLs. Found 2388 successful pages and 1 to retry.


Processing Pages:   0%|          | 0/2 [00:00<?, ?it/s]

2025-10-26 02:28:01,855 - ThreadPoolExecutor-1_0 - ERROR - Failed to fetch https://webpath.med.utah.edu/HISTHTML/ANATOMY/anatquiz/quizaidx.htm after retries: 404 Client Error: Not Found for url: https://webpath.med.utah.edu/HISTHTML/ANATOMY/anatquiz/quizaidx.htm
2025-10-26 02:29:58,009 - MainThread - INFO - Crawl finished for this session. Processed 579 pages.
2025-10-26 02:29:58,010 - MainThread - INFO - Merging new results with 2388 previously processed pages...
2025-10-26 02:29:58,012 - MainThread - INFO - Found 2389 pages with relevant data or errors. Saving to webpath_image_data_robust.json...
2025-10-26 02:29:58,049 - MainThread - INFO - Successfully updated data file.


--------------------------------------------------
Total time for this session: 117.71 seconds
Total pages with data/errors in final archive: 2389
Data saved to: webpath_image_data_robust.json
--------------------------------------------------
