In [4]:
import os
import time
import requests
from xml.etree import ElementTree

def fetch_pubmed_xml(query, max_results=10, output_dir="output", batch_size=5, sleep_time=1):
    """Download PubMed records matching *query* and save one XML file per article.

    The function first calls the Entrez ESearch endpoint to obtain up to
    ``max_results`` PMIDs, then calls EFetch in batches of ``batch_size`` PMIDs
    and writes every ``PubmedArticle`` element of each response to
    ``<output_dir>/<PMID>.xml``.

    Args:
        query: Search term sent to ESearch as the ``term`` parameter.
        max_results: Value sent to ESearch as ``retmax``.
        output_dir: Directory that receives the XML files; created if missing.
        batch_size: Number of PMIDs requested per EFetch call (must be >= 1).
        sleep_time: Seconds to pause between consecutive EFetch calls.

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.exceptions.RequestException: If the initial ESearch request
            fails (failures of individual EFetch batches are reported and
            skipped instead).
        xml.etree.ElementTree.ParseError: If the ESearch response is not
            well-formed XML.
    """
    # Validate before touching the network: a zero step would make range()
    # fail only after the search request, and a negative step would silently
    # download nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Base URL for Entrez E-utilities
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    # requests waits forever by default; bound every call so a stalled
    # connection cannot hang the whole download.
    request_timeout = 30

    # Step 1: Use ESearch to get the list of PubMed IDs (PMIDs) for the query
    search_url = f"{base_url}esearch.fcgi"
    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "xml",
    }

    search_response = requests.get(search_url, params=search_params, timeout=request_timeout)
    search_response.raise_for_status()
    search_xml = ElementTree.fromstring(search_response.content)

    # Extract PMIDs from the search results (ignore empty <Id/> elements,
    # whose text would be None and break the ",".join below)
    pmids = [id_elem.text for id_elem in search_xml.findall(".//Id") if id_elem.text]

    if not pmids:
        print("No results found.")
        return

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Step 2: Fetch articles in batches to avoid rate limiting
    fetch_url = f"{base_url}efetch.fcgi"
    batch_starts = range(0, len(pmids), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch_pmids = pmids[start:start + batch_size]
        fetch_params = {
            "db": "pubmed",
            "id": ",".join(batch_pmids),
            "retmode": "xml",
        }

        try:
            fetch_response = requests.get(fetch_url, params=fetch_params, timeout=request_timeout)
            fetch_response.raise_for_status()
            batch_xml = ElementTree.fromstring(fetch_response.content)
        except (requests.exceptions.RequestException, ElementTree.ParseError) as e:
            # One bad batch (HTTP error, timeout, dropped connection or
            # malformed XML) must not abort the remaining batches.
            print(f"Failed to fetch batch {batch_number} due to {e}")
        else:
            # Split fetched XML data and save each article by PMID
            for article in batch_xml.findall(".//PubmedArticle"):
                pmid_elem = article.find(".//PMID")
                if pmid_elem is None or not pmid_elem.text:
                    # Without a PMID there is no file name to save under.
                    print(f"Skipping an article without a PMID in batch {batch_number}")
                    continue

                pmid = pmid_elem.text
                file_path = os.path.join(output_dir, f"{pmid}.xml")

                with open(file_path, 'wb') as file:
                    file.write(ElementTree.tostring(article, encoding='utf-8'))

                print(f"Article {pmid} saved to {file_path}")

        # Sleep to avoid hitting the rate limit; nothing follows the last
        # batch, so there is no need to wait after it.
        if batch_number < len(batch_starts):
            time.sleep(sleep_time)

# Demo run: download PubMed records for a single search term into a local folder.
query = "enterovirus"      # search term passed to PubMed
max_results = 160          # how many articles to request
output_directory = "data"  # folder that receives one <PMID>.xml file per article
batch_size = 10            # PMIDs requested per EFetch call
# NOTE(review): NCBI's usage guidelines cap clients without an API key at about
# 3 requests per second - confirm that a 0.1 s pause is sufficient here.
sleep_time = 0.1           # seconds to wait between batches
fetch_pubmed_xml(
    query,
    max_results=max_results,
    output_dir=output_directory,
    batch_size=batch_size,
    sleep_time=sleep_time,
)

Article 39555929 saved to data\39555929.xml
Article 39552883 saved to data\39552883.xml
Article 39550578 saved to data\39550578.xml
Article 39548776 saved to data\39548776.xml
Article 39546098 saved to data\39546098.xml
Article 39546035 saved to data\39546035.xml
Article 39545343 saved to data\39545343.xml
Article 39536108 saved to data\39536108.xml
Article 39535185 saved to data\39535185.xml
Article 39532602 saved to data\39532602.xml
Article 39531817 saved to data\39531817.xml
Article 39531247 saved to data\39531247.xml
Article 39530331 saved to data\39530331.xml
Article 39516037 saved to data\39516037.xml
Article 39512378 saved to data\39512378.xml
Article 39512164 saved to data\39512164.xml
Article 39511383 saved to data\39511383.xml
Article 39509266 saved to data\39509266.xml
Article 39509140 saved to data\39509140.xml
Article 39506010 saved to data\39506010.xml
Article 39505825 saved to data\39505825.xml
Article 39502449 saved to data\39502449.xml
Article 39501367 saved to data\3