# Step 0: obtaining an API key

# The sign-in page of the NCBI: I could not log in to this site to get an API key.

# step 1: install and use the biopython package

# In [None]:
# Function Definition: def download_article(article_id, api_key) defines a function named download_article that takes two parameters: article_id (a string representing the PubMed article ID) and api_key (a string representing the user's NCBI API key).
# Fetching Data:
# handle = Entrez.efetch(db="pubmed", id=article_id, retmode="xml", api_key=api_key): This line uses the efetch function from the Entrez module to fetch the article data from the PubMed database. The data is returned in XML format. The api_key parameter is used for authentication with the NCBI servers.
# Reading Data: xml_data = handle.read() reads the data fetched by efetch into a variable xml_data.
# Creating Filename: filename = f"{article_id}.xml" creates a filename using the article_id, ensuring that each article is saved with a unique name based on its ID.
# Writing Data to File:
# with open(filename, 'w') as file: Opens a new file in write mode ('w'), with the filename created above.
# file.write(xml_data): Writes the fetched XML data to the file.


from Bio import Entrez

import os

def download_article(article_id, api_key):
    """Fetch one PubMed record as XML and save it to ``<article_id>.xml``.

    Args:
        article_id: PubMed identifier of the record to fetch.
        api_key: NCBI API key forwarded to ``Entrez.efetch``.

    The file is written to the current working directory; an existing file
    with the same name is overwritten.
    """
    handle = Entrez.efetch(db="pubmed", id=article_id, retmode="xml", api_key=api_key)
    try:
        xml_data = handle.read()
    finally:
        # Release the HTTP connection even if reading fails.
        handle.close()

    # Recent Biopython releases hand back XML responses as bytes; decode so
    # the text-mode write below does not raise TypeError.
    if isinstance(xml_data, bytes):
        xml_data = xml_data.decode("utf-8")

    filename = f"{article_id}.xml"
    with open(filename, 'w', encoding="utf-8") as file:
        file.write(xml_data)

# Since I did not have an api_key, I have no output.

# In [None]:
import multiprocessing as mp

def parallel_download(article_ids, api_key):
    """Download every article in ``article_ids`` using a pool of worker processes.

    Each task calls ``download_article(article_id, api_key)``; the function
    returns once ``starmap`` has finished all of them.
    """
    with mp.Pool() as workers:
        # One (article_id, api_key) argument tuple per download task.
        jobs = [(pmid, api_key) for pmid in article_ids]
        workers.starmap(download_article, jobs)
# The parallel_download function uses the multiprocessing module to download articles in parallel.
# It creates a pool of worker processes and uses the starmap method to call download_article once per (article_id, api_key) pair.
# The downloads are distributed across multiple processes, which can reduce the total download time.

# In [None]:
if __name__ == "__main__":
    # Example run: look up articles linked to one PubMed record and download
    # up to ten of them in parallel.
    Entrez.email = '<YOUR EMAIL HERE>'
    api_key = '<YOUR API KEY HERE>'
    pubmed_id = '30049270'

    file = Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pmc_refs", id=pubmed_id, api_key=api_key)
    try:
        results = Entrez.read(file)
    finally:
        # Close the response handle even if parsing fails.
        file.close()

    # "LinkSetDb" is empty when the record has no links, so guard the [0]
    # lookups instead of letting them raise IndexError.
    link_sets = results[0]["LinkSetDb"] if results else []
    links = link_sets[0]["Link"] if link_sets else []
    # NOTE(review): these IDs come from db="pmc" but download_article queries
    # db="pubmed" — confirm which database is intended.
    references = [str(link["Id"]) for link in links][:10]  # Limit to 10

    parallel_download(references, api_key)
# use of the above function.

# Extra

# In [None]:
# Single-Threaded Approach
#Define the Sequential Download Function:
#This function will download and save articles one by one.

# The Biopython distribution is imported as ``Bio``; there is no module
# named ``Biopython``.
from Bio import Entrez


def download_article_single(article_id, api_key):
    """Fetch one PubMed record as XML and save it to ``<article_id>.xml``.

    Args:
        article_id: PubMed identifier of the record to fetch.
        api_key: NCBI API key forwarded to ``Entrez.efetch``.

    The file is written to the current working directory; an existing file
    with the same name is overwritten.
    """
    handle = Entrez.efetch(db="pubmed", id=article_id, retmode="xml", api_key=api_key)
    try:
        xml_data = handle.read()
    finally:
        # Release the HTTP connection even if reading fails.
        handle.close()

    # Recent Biopython releases hand back XML responses as bytes; decode so
    # the text-mode write below does not raise TypeError.
    if isinstance(xml_data, bytes):
        xml_data = xml_data.decode("utf-8")

    filename = f"{article_id}.xml"
    with open(filename, 'w', encoding="utf-8") as file:
        file.write(xml_data)
        
# The download_article_single function is a single-threaded approach for downloading and saving articles one by one.
# It uses the Biopython Entrez module to fetch article data from PubMed in XML format.
# The function reads the data, creates a filename based on the article ID, and saves the XML data to a file.
# This approach processes each article sequentially, one at a time.


# In [None]:
#Implementing the Sequential Download Logic.
#Fetch references and download articles sequentially.
import time
from Bio import Entrez

def sequential_download(article_ids, api_key):
    """Download the given articles one after another, in iteration order.

    Each ID is passed to ``download_article_single`` together with ``api_key``.
    """
    for pmid in article_ids:
        download_article_single(pmid, api_key)

if __name__ == "__main__":
    # Example run: look up articles linked to one PubMed record, download up
    # to ten of them sequentially, and report the elapsed time.
    Entrez.email = '<YOUR EMAIL HERE>'
    api_key = '<YOUR API KEY HERE>'
    pubmed_id = '30049270'

    # Fetch references
    file = Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pmc_refs", id=pubmed_id, api_key=api_key)
    try:
        results = Entrez.read(file)
    finally:
        # Close the response handle even if parsing fails.
        file.close()

    # "LinkSetDb" is empty when the record has no links, so guard the [0]
    # lookups instead of letting them raise IndexError.
    link_sets = results[0]["LinkSetDb"] if results else []
    links = link_sets[0]["Link"] if link_sets else []
    # NOTE(review): these IDs come from db="pmc" but download_article_single
    # queries db="pubmed" — confirm which database is intended.
    references = [str(link["Id"]) for link in links][:10]  # Limit to 10

    # Measure time for sequential download. perf_counter is monotonic, so the
    # difference cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    sequential_download(references, api_key)
    end_time = time.perf_counter()
    print(f"Sequential download took {end_time - start_time} seconds")


# In [None]:
#Step 2: Multi-Process Approach
#Defining the parallel download function.
#This function uses multiprocessing (worker processes, not threads) to download articles concurrently.
import multiprocessing as mp
from Bio import Entrez

def download_article(article_id, api_key):
    """Fetch one PubMed record as XML and save it to ``<article_id>.xml``.

    Args:
        article_id: PubMed identifier of the record to fetch.
        api_key: NCBI API key forwarded to ``Entrez.efetch``.

    The file is written to the current working directory; an existing file
    with the same name is overwritten.
    """
    handle = Entrez.efetch(db="pubmed", id=article_id, retmode="xml", api_key=api_key)
    try:
        xml_data = handle.read()
    finally:
        # Release the HTTP connection even if reading fails.
        handle.close()

    # Recent Biopython releases hand back XML responses as bytes; decode so
    # the text-mode write below does not raise TypeError.
    if isinstance(xml_data, bytes):
        xml_data = xml_data.decode("utf-8")

    filename = f"{article_id}.xml"
    with open(filename, 'w', encoding="utf-8") as file:
        file.write(xml_data)

def parallel_download(article_ids, api_key):
    """Download every article in ``article_ids`` using a pool of worker processes.

    Each task calls ``download_article(article_id, api_key)``; the function
    returns once ``starmap`` has finished all of them.
    """
    with mp.Pool() as workers:
        # One (article_id, api_key) argument tuple per download task.
        jobs = [(pmid, api_key) for pmid in article_ids]
        workers.starmap(download_article, jobs)

        
#parallel_download hands the articles to a pool of worker processes, so several downloads can run at the same time.
#By contrast, the sequential approach handles each article individually and waits until one article is fully
#processed before moving on to the next; that is straightforward but may not be efficient for a large number of articles.

# In [None]:
#implement Parallel Download Logic:
#Fetch references and download articles using multiprocessing.


import time
from Bio import Entrez
import multiprocessing as mp

if __name__ == "__main__":
    # Example run: look up articles linked to one PubMed record, download up
    # to ten of them in parallel, and report the elapsed time.
    Entrez.email = '<YOUR EMAIL HERE>'
    api_key = '<YOUR API KEY HERE>'
    pubmed_id = '30049270'

    # Fetch references
    file = Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pmc_refs", id=pubmed_id, api_key=api_key)
    try:
        results = Entrez.read(file)
    finally:
        # Close the response handle even if parsing fails.
        file.close()

    # "LinkSetDb" is empty when the record has no links, so guard the [0]
    # lookups instead of letting them raise IndexError.
    link_sets = results[0]["LinkSetDb"] if results else []
    links = link_sets[0]["Link"] if link_sets else []
    # NOTE(review): these IDs come from db="pmc" but download_article queries
    # db="pubmed" — confirm which database is intended.
    references = [str(link["Id"]) for link in links][:10]  # Limit to 10

    # Measure time for parallel download. perf_counter is monotonic, so the
    # difference cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    parallel_download(references, api_key)
    end_time = time.perf_counter()
    print(f"Parallel download took {end_time - start_time} seconds")
#This script demonstrates how to efficiently download a large number of articles by leveraging parallel
#processing to speed up the data retrieval process.



#it includes:
# Setup: Import necessary modules and define parameters such as email, API key, and PubMed ID.
# Fetch References: Query PubMed to get related article IDs and limit to the first 10.
# Parallel Download: Use multiprocessing to download the articles concurrently.
# Measure Time: Calculate and print the time taken for the parallel download to assess performance improvements.
