In [2]:
import os
import time
import requests
from xml.etree import ElementTree

def fetch_pubmed_xml(query, max_results=10, output_dir="output", batch_size=5, sleep_time=1):
    # Base URL for Entrez E-utilities
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    # Step 1: Use ESearch to get the list of PubMed IDs (PMIDs) for the query
    search_url = f"{base_url}esearch.fcgi"
    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "xml"
    }

    search_response = requests.get(search_url, params=search_params)
    search_response.raise_for_status()
    search_xml = ElementTree.fromstring(search_response.content)

    # Extract PMIDs from the search results
    pmids = [id_elem.text for id_elem in search_xml.findall(".//Id")]

    if not pmids:
        print("No results found.")
        return

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Step 2: Fetch articles in batches to avoid rate limiting
    fetch_url = f"{base_url}efetch.fcgi"
    for i in range(0, len(pmids), batch_size):
        # Split PMIDs into batches
        batch_pmids = pmids[i:i + batch_size]
        fetch_params = {
            "db": "pubmed",
            "id": ",".join(batch_pmids),
            "retmode": "xml"
        }

        try:
            fetch_response = requests.get(fetch_url, params=fetch_params)
            fetch_response.raise_for_status()

            # Split fetched XML data and save each article by PMID
            articles = ElementTree.fromstring(fetch_response.content).findall(".//PubmedArticle")
            for article in articles:
                pmid = article.find(".//PMID").text
                file_path = os.path.join(output_dir, f"{pmid}.xml")

                with open(file_path, 'wb') as file:
                    file.write(ElementTree.tostring(article, encoding='utf-8'))

                print(f"Article {pmid} saved to {file_path}")

        except requests.exceptions.HTTPError as e:
            print(f"Failed to fetch batch {i+1} due to {e}")

        # Sleep to avoid hitting the rate limit
        time.sleep(sleep_time)

# Example usage:
query = "food safety"
max_results = 3000  # Set the number of articles to download
output_directory = "food_safety"  # Set your desired output directory
batch_size = 10  # Number of articles to fetch in each batch
sleep_time = 0.1  # Time in seconds to sleep between batches
fetch_pubmed_xml(query, max_results, output_directory, batch_size, sleep_time)

Article 39381961 saved to food_safety\39381961.xml
Article 39381905 saved to food_safety\39381905.xml
Article 39381832 saved to food_safety\39381832.xml
Article 39381600 saved to food_safety\39381600.xml
Article 39381597 saved to food_safety\39381597.xml
Article 39381350 saved to food_safety\39381350.xml
Article 39381230 saved to food_safety\39381230.xml
Article 39381062 saved to food_safety\39381062.xml
Article 39381060 saved to food_safety\39381060.xml
Article 39381012 saved to food_safety\39381012.xml
Failed to fetch batch 11 due to 400 Client Error: Bad Request for url: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=39380968%2C39380681%2C39380497%2C39380384%2C39380255%2C39380120%2C39380046%2C39379900%2C39379899%2C39379818&retmode=xml
Article 39379781 saved to food_safety\39379781.xml
Article 39379669 saved to food_safety\39379669.xml
Article 39379449 saved to food_safety\39379449.xml
Article 39379312 saved to food_safety\39379312.xml
Article 39379311 saved t

KeyboardInterrupt: 