In [1]:
import os
import time
import requests
from xml.etree import ElementTree

def fetch_pubmed_xml(query, max_results=10, output_dir="output", batch_size=5,
                     sleep_time=1, timeout=30):
    """Search PubMed for *query* and save each matching article as its own XML file.

    The function first calls the Entrez ESearch endpoint to obtain up to
    ``max_results`` PubMed IDs, then calls EFetch in batches of ``batch_size``
    IDs and writes every ``<PubmedArticle>`` element of each response to
    ``<output_dir>/<PMID>.xml``. Progress and failures are reported with
    ``print``; nothing is returned.

    Args:
        query: Search term passed to ESearch as ``term``.
        max_results: Maximum number of IDs requested from ESearch (``retmax``).
        output_dir: Directory the XML files are written to; created if missing.
        batch_size: Number of IDs requested per EFetch call. Must be >= 1.
        sleep_time: Seconds to sleep after each batch.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the download indefinitely.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.exceptions.RequestException: If the initial ESearch request
            fails (HTTP error status, connection problem or timeout).
        xml.etree.ElementTree.ParseError: If the ESearch response is not
            well-formed XML.

    Failures of an individual EFetch batch (network error or unparsable XML)
    are printed and skipped so that the remaining batches are still downloaded.
    """
    # range() with step 0 raises an opaque error and a negative step silently
    # yields no batches, so reject both up front with a clear message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Base URL for Entrez E-utilities
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    # Step 1: Use ESearch to get the list of PubMed IDs (PMIDs) for the query
    search_url = f"{base_url}esearch.fcgi"
    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "xml"
    }

    search_response = requests.get(search_url, params=search_params, timeout=timeout)
    search_response.raise_for_status()
    search_xml = ElementTree.fromstring(search_response.content)

    # Extract PMIDs from the search results; skip empty <Id> elements because
    # their text is None and would break the ",".join() below.
    pmids = [id_elem.text for id_elem in search_xml.findall(".//Id") if id_elem.text]

    if not pmids:
        print("No results found.")
        return

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Step 2: Fetch articles in batches to avoid rate limiting
    fetch_url = f"{base_url}efetch.fcgi"
    batch_starts = range(0, len(pmids), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch_pmids = pmids[start:start + batch_size]
        fetch_params = {
            "db": "pubmed",
            "id": ",".join(batch_pmids),
            "retmode": "xml"
        }

        try:
            fetch_response = requests.get(fetch_url, params=fetch_params, timeout=timeout)
            fetch_response.raise_for_status()
            articles = ElementTree.fromstring(fetch_response.content).findall(".//PubmedArticle")
        except (requests.exceptions.RequestException, ElementTree.ParseError) as e:
            # RequestException covers HTTP error statuses as well as connection
            # failures and timeouts; ParseError covers a malformed payload.
            # One bad batch must not abort the remaining ones.
            print(f"Failed to fetch batch {batch_number} due to {e}")
        else:
            # Split fetched XML data and save each article by PMID
            for article in articles:
                pmid = article.findtext(".//PMID")
                if not pmid:
                    # Without a PMID there is no file name to save under.
                    print(f"Skipping an article without a PMID in batch {batch_number}")
                    continue

                file_path = os.path.join(output_dir, f"{pmid}.xml")
                with open(file_path, 'wb') as file:
                    file.write(ElementTree.tostring(article, encoding='utf-8'))

                print(f"Article {pmid} saved to {file_path}")

        # Sleep to avoid hitting the rate limit
        time.sleep(sleep_time)

# Example run: download the first PubMed hits for a search term.
query = "cybersecurity"               # search term sent to PubMed
max_results = 10                      # how many articles to download
output_directory = "pubmed_xml2"      # folder that receives one XML file per article
batch_size = 5                        # articles requested per fetch call
sleep_time = 0.1                      # pause between batches, in seconds
fetch_pubmed_xml(
    query,
    max_results=max_results,
    output_dir=output_directory,
    batch_size=batch_size,
    sleep_time=sleep_time,
)


Article 39256625 saved to pubmed_xml2/39256625.xml
Article 39255289 saved to pubmed_xml2/39255289.xml
Article 39253236 saved to pubmed_xml2/39253236.xml
Article 39250110 saved to pubmed_xml2/39250110.xml
Article 39248619 saved to pubmed_xml2/39248619.xml
Article 39245678 saved to pubmed_xml2/39245678.xml
Article 39244512 saved to pubmed_xml2/39244512.xml
Article 39243017 saved to pubmed_xml2/39243017.xml
Article 39242659 saved to pubmed_xml2/39242659.xml
Article 39241940 saved to pubmed_xml2/39241940.xml
