In [2]:
# Install or upgrade the pubmed-api client; the %pip magic installs into the running kernel's environment.
%pip install --upgrade pubmed-api

Note: you may need to restart the kernel to use updated packages.


In [1]:
import time
import json
from typing import Dict, List, Optional
from tqdm import tqdm
from pubmed_api import PubMedFetcher
import os
import pandas as pd

def fetch_all_pages(
    fetcher,
    query: str,
    max_pages: int = 5,
    results_per_page: int = 10,
    sleep_time: int = 3,
    output_file: Optional[str] = None
) -> Dict:
    """
    Fetch several pages of PubMed search results and merge them into one dict.

    Args:
        fetcher: Object exposing ``search(query=..., max_results=..., start=...)``
            that returns a dict with a ``"papers"`` list and a ``"metadata"``
            dict containing ``"total_results"`` (e.g. a PubMedFetcher instance).
        query: Search query string.
        max_pages: Maximum number of pages to request.
        results_per_page: Number of results requested per page.
        sleep_time: Pause, in seconds, before each request after the first.
        output_file: Optional path; when given, the merged result is written
            there as UTF-8 JSON.

    Returns:
        ``{"papers": [...], "metadata": {...}}``. ``metadata["pages_retrieved"]``
        counts only the pages that were actually fetched successfully, so it is
        lower than the planned page count when individual page requests fail.
        If the first request (or saving the file) raises, the error is printed
        and whatever has been collected so far is returned.
    """
    all_results = {
        "papers": [],
        "metadata": {}
    }

    try:
        # The first page also tells us how many results exist in total.
        first_page = fetcher.search(
            query=query,
            max_results=results_per_page,
            start=0
        )

        total_results = first_page["metadata"]["total_results"]
        # Ceiling division, capped by the caller's page limit.
        total_pages = min(max_pages, (total_results + results_per_page - 1) // results_per_page)

        print(f"找到 {total_results} 篇文章，将获取 {total_pages} 页")

        all_results["papers"].extend(first_page["papers"])
        all_results["metadata"] = {
            "total_results": total_results,
            # Start from the first page only; later pages are added as they
            # succeed, so failed pages are never counted as retrieved.
            "pages_retrieved": 1 if total_pages >= 1 else 0,
            "results_per_page": results_per_page,
            "query": query,
            "total_papers_retrieved": len(first_page["papers"])
        }
        metadata = all_results["metadata"]

        # Fetch the remaining pages.
        if total_pages > 1:
            with tqdm(range(1, total_pages), desc="获取页面") as pbar:
                for page in pbar:
                    time.sleep(sleep_time)  # throttle between requests

                    start_index = page * results_per_page
                    try:
                        result = fetcher.search(
                            query=query,
                            max_results=results_per_page,
                            start=start_index
                        )
                    except Exception as e:
                        # Best effort: report the failed page and move on.
                        print(f"\n获取第 {page + 1} 页时出错: {str(e)}")
                        continue

                    all_results["papers"].extend(result["papers"])
                    metadata["pages_retrieved"] += 1
                    metadata["total_papers_retrieved"] = len(all_results["papers"])

                    pbar.set_postfix({
                        "已获取文章": len(all_results["papers"]),
                        "当前页文章数": len(result["papers"])
                    })

        # Persist the merged result when an output path was supplied.
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_results, f, ensure_ascii=False, indent=2)
            print(f"\n结果已保存至: {output_file}")

        return all_results

    except Exception as e:
        # Deliberate best effort: return what was collected instead of raising.
        print(f"获取过程中出错: {str(e)}")
        return all_results

def format_results(paper):
    """
    Flatten one paper record into a single-row DataFrame.

    Args:
        paper: Dict with the keys read below: 'title', 'pmid',
            'journal' (with 'title'), 'authors' (list of dicts with
            'fore_name', 'last_name', 'affiliations'), 'abstract' (with
            'structured' and either 'sections' or 'complete'), 'keywords',
            'urls' and 'metadata'.

    Returns:
        A one-row DataFrame. Columns are title, pmid, journal, affiliations
        (only when there is at least one author), authors, then either one
        column per structured-abstract section or a single 'abstract' column,
        keywords, one column per non-empty URL, metadata, is_open_access and
        fetch_time. When 'metadata' is empty or None, the three
        metadata-derived columns hold None instead of raising.
    """
    row = {
        'title': paper['title'],
        'pmid': paper['pmid'],
        'journal': paper['journal']['title'],
    }

    author_names = []
    for author in paper['authors']:
        author_names.append(f"{author['fore_name']} {author['last_name']}")
        # NOTE(review): this is overwritten for every author, so the stored
        # value is the LAST author's first affiliation. Kept as-is; confirm
        # whether the first author's affiliation was intended.
        affiliations = author['affiliations']
        row['affiliations'] = affiliations[0] if affiliations else None
    # join() avoids the dangling ", " that manual concatenation left behind.
    row['authors'] = ", ".join(author_names)

    abstract = paper['abstract']
    if abstract['structured']:
        # Structured abstracts become one column per section heading.
        for section, text in abstract['sections'].items():
            row[section] = text
    else:
        row['abstract'] = abstract['complete']

    keywords = paper['keywords']
    row['keywords'] = ", ".join(keywords) if keywords else None

    # Only URLs that are actually present get a column.
    for url_type, url in paper['urls'].items():
        if url:
            row[url_type] = url

    # Metadata may be empty/None; never index into it in that case.
    metadata = paper['metadata'] or None
    row['metadata'] = metadata
    row['is_open_access'] = metadata.get('is_open_access') if metadata else None
    row['fetch_time'] = metadata.get('fetch_time') if metadata else None

    # Build the frame once instead of growing it column by column.
    return pd.DataFrame({column: [value] for column, value in row.items()})

query = "Mendelian randomization"
# Usage example: the API key is read from the environment, never hard-coded.
fetcher = PubMedFetcher(api_key=os.getenv("PUBMED_API_KEY"))

results = fetch_all_pages(
    fetcher,
    query=query,
    max_pages=1,
    results_per_page=5,
    sleep_time=3,
    output_file="pubmed_results.json"
)

print(f"\n共获取 {len(results['papers'])} 篇文章")

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs("data", exist_ok=True)
papers_csv_path = f"data/{query}_papers_df.csv"

# Collect the one-row frames in a list and concatenate on demand; growing a
# DataFrame with concat on every iteration copies all previous rows each time.
paper_frames = []
for i, paper in enumerate(results["papers"]):
    paper_frames.append(format_results(paper))
    if i % 10 == 0:
        # Periodic checkpoint so a later failure still leaves partial output.
        pd.concat(paper_frames, ignore_index=True).to_csv(papers_csv_path, index=False)

# pd.concat raises on an empty list, so fall back to an empty frame.
papers_df = pd.concat(paper_frames, ignore_index=True) if paper_frames else pd.DataFrame()
papers_df.to_csv(papers_csv_path, index=False)

INFO:numexpr.utils:Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
2025-04-28 00:00:07,323 - pubmed_api - INFO - 找到 5 篇文章
INFO:pubmed_api:找到 5 篇文章


找到 16827 篇文章，将获取 1 页

结果已保存至: pubmed_results.json

共获取 5 篇文章


In [3]:
# Show the raw merged result dict returned by fetch_all_pages for inspection.
results

{'papers': [{'pmid': '37935836',
   'title': 'Mendelian randomization for cardiovascular diseases: principles and applications.',
   'authors': [{'last_name': 'Larsson',
     'fore_name': 'Susanna C',
     'affiliations': ['Unit of Medical Epidemiology, Department of Surgical Sciences, Uppsala University, Uppsala, Sweden.',
      'Unit of Cardiovascular and Nutritional Epidemiology, Institute of Environmental Medicine, Karolinska Institutet, Stockholm, Sweden.']},
    {'last_name': 'Butterworth',
     'fore_name': 'Adam S',
     'affiliations': ['British Heart Foundation Cardiovascular Epidemiology Unit, Department of Public Health and Primary Care, University of Cambridge, Cambridge, UK.',
      'Victor Phillip Dahdaleh Heart and Lung Research Institute, University of Cambridge, Papworth Road, Cambridge, UK.',
      "British Heart Foundation Centre of Research Excellence, School of Clinical Medicine, Addenbrooke's Hospital, University of Cambridge, Cambridge, UK.",
      'Health Data 