In [7]:
import os
import time
import requests
from xml.etree import ElementTree

def fetch_pubmed_xml(query, max_results=10, output_dir="output", batch_size=5, sleep_time=1, timeout=30):
    """Search PubMed for *query* and save each matching article as its own XML file.

    The function first calls the Entrez ESearch endpoint to obtain up to
    ``max_results`` PubMed IDs, then calls EFetch in batches of
    ``batch_size`` IDs. Every ``PubmedArticle`` element in a batch response
    is written to ``<output_dir>/<PMID>.xml``. Progress and failures are
    reported with ``print``.

    Args:
        query: Search term passed to ESearch as ``term``.
        max_results: Value passed to ESearch as ``retmax``.
            NOTE(review): NCBI may cap how many IDs one ESearch call
            returns; confirm against the E-utilities documentation before
            requesting very large result sets.
        output_dir: Directory that receives the XML files; created if missing.
        batch_size: Number of PMIDs requested per EFetch call (must be >= 1).
        sleep_time: Seconds to pause between consecutive EFetch calls.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. When the search yields no IDs, a message is printed and
        nothing is written.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.exceptions.RequestException: If the initial ESearch request
            fails. Failures of individual EFetch batches are printed and
            skipped instead of being raised.
        xml.etree.ElementTree.ParseError: If the ESearch response is not
            well-formed XML.
    """
    # Fail fast: range() rejects a zero step only after the search request has
    # already been made, and a negative step would silently fetch nothing.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Base URL for Entrez E-utilities
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    # Step 1: Use ESearch to get the list of PubMed IDs (PMIDs) for the query
    search_url = f"{base_url}esearch.fcgi"
    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "xml",
    }

    # Without a timeout, requests waits forever on a stalled connection.
    search_response = requests.get(search_url, params=search_params, timeout=timeout)
    search_response.raise_for_status()
    search_xml = ElementTree.fromstring(search_response.content)

    # Extract PMIDs from the search results; drop empty <Id/> elements so the
    # later ",".join() never sees None.
    pmids = [id_elem.text for id_elem in search_xml.findall(".//Id") if id_elem.text]

    if not pmids:
        print("No results found.")
        return

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Step 2: Fetch articles in batches to avoid rate limiting
    fetch_url = f"{base_url}efetch.fcgi"
    batch_starts = range(0, len(pmids), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch_pmids = pmids[start:start + batch_size]
        fetch_params = {
            "db": "pubmed",
            "id": ",".join(batch_pmids),
            "retmode": "xml",
        }

        try:
            fetch_response = requests.get(fetch_url, params=fetch_params, timeout=timeout)
            fetch_response.raise_for_status()
            articles = ElementTree.fromstring(fetch_response.content).findall(".//PubmedArticle")
        except (requests.exceptions.RequestException, ElementTree.ParseError) as e:
            # Best effort per batch: HTTP errors, connection errors, timeouts
            # and malformed XML are reported and the next batch is attempted.
            print(f"Failed to fetch batch {batch_number} due to {e}")
        else:
            # Split fetched XML data and save each article by PMID
            for article in articles:
                pmid = article.findtext(".//PMID")
                if not pmid:
                    # Without a PMID there is no file name to write to.
                    print(f"Skipping an article in batch {batch_number}: no PMID found")
                    continue

                file_path = os.path.join(output_dir, f"{pmid}.xml")
                with open(file_path, 'wb') as file:
                    file.write(ElementTree.tostring(article, encoding='utf-8'))

                print(f"Article {pmid} saved to {file_path}")

        # Sleep to avoid hitting the rate limit; pointless after the last batch.
        if batch_number < len(batch_starts):
            time.sleep(sleep_time)

# Example usage: download PubMed records matching a search term into a local folder.
query = "covid-19"  # Search term sent to PubMed
max_results = 5050  # Upper bound on the number of articles to download
output_directory = "data"  # Folder that receives one XML file per article
batch_size = 10  # How many articles each fetch request asks for
sleep_time = 0.1  # Pause, in seconds, between consecutive batches
fetch_pubmed_xml(
    query,
    max_results=max_results,
    output_dir=output_directory,
    batch_size=batch_size,
    sleep_time=sleep_time,
)

Article 39471458 saved to data\39471458.xml
Article 39471397 saved to data\39471397.xml
Article 39471380 saved to data\39471380.xml
Article 39471374 saved to data\39471374.xml
Article 39471373 saved to data\39471373.xml
Article 39471350 saved to data\39471350.xml
Article 39471289 saved to data\39471289.xml
Article 39471286 saved to data\39471286.xml
Article 39471195 saved to data\39471195.xml
Article 39471133 saved to data\39471133.xml
Article 39471107 saved to data\39471107.xml
Article 39471070 saved to data\39471070.xml
Article 39470924 saved to data\39470924.xml
Article 39470901 saved to data\39470901.xml
Article 39470900 saved to data\39470900.xml
Article 39470838 saved to data\39470838.xml
Article 39470791 saved to data\39470791.xml
Article 39470748 saved to data\39470748.xml
Article 39470727 saved to data\39470727.xml
Article 39470726 saved to data\39470726.xml
Article 39470688 saved to data\39470688.xml
Article 39470661 saved to data\39470661.xml
Article 39470637 saved to data\3