In [None]:
import gzip
import os
import requests
import time

def filter_matching_pmids(gene_gz, mutation_gz, output_file):
    """
    Filters PMIDs that are present in both gene2pubtator3 and mutation2pubtator3 files.

    The text before the first tab of every line is treated as the PMID. Blank
    lines are ignored, and the matching PMIDs are written one per line in
    ascending order so that repeated runs produce identical output files.

    Any exception (missing file, unreadable gzip, ...) is caught and reported
    with print(); nothing is raised to the caller.

    Args:
        gene_gz (str): Path to the gene2pubtator3.gz file.
        mutation_gz (str): Path to the mutation2pubtator3.gz file.
        output_file (str): Path to the output file for matched PMIDs.

    Returns:
        None
    """
    def first_column(line):
        # Text before the first tab, without surrounding whitespace/newline.
        return line.split('\t', 1)[0].strip()

    try:
        # Read gene2pubtator3 PMIDs
        with gzip.open(gene_gz, 'rt', encoding='utf-8') as gene_file:
            gene_pmids = {first_column(line) for line in gene_file}
        # A blank line yields an empty "PMID"; it must never count as a match.
        gene_pmids.discard('')

        # Read mutation2pubtator3 PMIDs and find intersection
        with gzip.open(mutation_gz, 'rt', encoding='utf-8') as mutation_file:
            matching_pmids = {
                pmid
                for pmid in map(first_column, mutation_file)
                if pmid in gene_pmids
            }

        # Sorting by (length, text) equals numeric order for digit strings
        # without leading zeros, and still works if an ID is not numeric.
        ordered_pmids = sorted(matching_pmids, key=lambda pmid: (len(pmid), pmid))

        # Write matching PMIDs to output file
        with open(output_file, 'w', encoding='utf-8') as out_file:
            out_file.writelines(f"{pmid}\n" for pmid in ordered_pmids)

        print(f"Successfully wrote {len(matching_pmids)} matching PMIDs to {output_file}")

    except Exception as e:
        print(f"Error filtering PMIDs: {e}")


In [None]:
import gzip
import os
import requests
import time

def fetch_articles(pmids_file, output_dir, email, api_key):
    """
    Fetches article metadata and abstracts using the PubMed API for a list of PMIDs.

    Every PMID is requested from the E-utilities efetch endpoint as a plain-text
    abstract and saved as <output_dir>/<pmid>.txt. The body is written to a
    .tmp file first and moved into place afterwards, so an existing .txt file
    marks a finished download and is skipped on later runs. PMIDs that come
    back with a non-200 status, or that raise while being fetched or saved,
    are appended to "failed_fetches.txt" in the current working directory.

    Args:
        pmids_file (str): Path to the file containing PMIDs (one per line;
            blank lines are ignored).
        output_dir (str): Directory to save the fetched articles.
        email (str): Email address for the PubMed API (sent as User-Agent).
        api_key (str): API key for the PubMed API. When empty, no api_key
            parameter is sent and requests are paced more slowly.

    Returns:
        None. Errors are reported with print() instead of being raised.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    headers = {"User-Agent": email}
    failure_log = "failed_fetches.txt"
    # Without a timeout a single stalled connection blocks the whole run.
    request_timeout = 30
    # NCBI's published E-utilities limits are 10 requests/second with an API
    # key and 3 requests/second without one.
    pause = 0.13 if api_key else 0.34

    def log_failure(pmid):
        # Remember the PMID so it can be retried later.
        with open(failure_log, 'a', encoding='utf-8') as fail_file:
            fail_file.write(f"{pmid}\n")

    os.makedirs(output_dir, exist_ok=True)

    try:
        with open(pmids_file, 'r', encoding='utf-8') as file:
            # Skip blank lines: an empty PMID would be requested with no id
            # and stored as a file literally named ".txt".
            pmids = [line.strip() for line in file if line.strip()]

        total = len(pmids)
        for i, pmid in enumerate(pmids):
            final_output_path = os.path.join(output_dir, f"{pmid}.txt")
            temp_output_path = os.path.join(output_dir, f"{pmid}.tmp")
            if os.path.exists(final_output_path):
                print(f"PMID {pmid} already fetched. Skipping ({i+1}/{total})")
                continue

            params = {
                "db": "pubmed",
                "id": pmid,
                "rettype": "abstract",
                "retmode": "text",
            }
            if api_key:
                # Only send the key when one was actually provided.
                params["api_key"] = api_key

            try:
                response = requests.get(
                    base_url, headers=headers, params=params, timeout=request_timeout
                )

                if response.status_code == 200:
                    with open(temp_output_path, 'w', encoding='utf-8') as temp_file:
                        temp_file.write(response.text)
                    # Move into place only after the body is fully written.
                    os.replace(temp_output_path, final_output_path)
                    print(f"Fetched article for PMID {pmid} ({i+1}/{total})")
                else:
                    print(f"Failed to fetch PMID {pmid}: {response.status_code}")
                    log_failure(pmid)

            except Exception as e:
                print(f"Error fetching PMID {pmid}: {e}")
                log_failure(pmid)

            time.sleep(pause)  # Stay under the request-rate limit

    except Exception as e:
        print(f"Error fetching articles: {e}")

def main():
    """
    Downloads the abstracts for the PMIDs listed in ./data/matching_pmids.txt.

    The PubMed credentials are read from the NCBI_EMAIL and NCBI_API_KEY
    environment variables so they do not have to be typed into (and saved
    with) the notebook. Both default to an empty string when unset.
    """
    # Paths to input files
    gene_gz = "./data/gene2pubtator3.gz"
    mutation_gz = "./data/mutation2pubtator3.gz"

    # Path to output file for matching PMIDs
    matching_pmids_file = "./data/matching_pmids.txt"

    # Directory to save fetched articles
    articles_dir = "./data/fetched_articles"

    # PubMed API credentials (kept out of the source)
    email = os.environ.get("NCBI_EMAIL", "")
    api_key = os.environ.get("NCBI_API_KEY", "")

    # Filter and write matching PMIDs (disabled: the PMID list already exists)
    #filter_matching_pmids(gene_gz, mutation_gz, matching_pmids_file)

    # Fetch articles for matching PMIDs
    fetch_articles(matching_pmids_file, articles_dir, email, api_key)

if __name__ == "__main__":
    main()


In [None]:
import gzip
import os
import requests
import time

def fetch_articles(pmids_file, output_dir, email, api_key, failure_log_dir):
    """
    Fetches article metadata and abstracts using the PubMed API for a list of PMIDs.

    Every PMID is requested from the E-utilities efetch endpoint as a plain-text
    abstract and saved as <output_dir>/<pmid>.txt. The body is written to a
    .tmp file first and moved into place afterwards, so an existing .txt file
    marks a finished download and is skipped on later runs. PMIDs that come
    back with a non-200 status, or that raise while being fetched or saved,
    are appended to <failure_log_dir>/<subset>_failed_fetches.txt, where
    <subset> is the PMID file's name without its extension.

    Args:
        pmids_file (str): Path to the file containing PMIDs (one per line;
            blank lines are ignored).
        output_dir (str): Directory to save the fetched articles.
        email (str): Email address for the PubMed API (sent as User-Agent).
        api_key (str): API key for the PubMed API. When empty, no api_key
            parameter is sent and requests are paced more slowly.
        failure_log_dir (str): Directory to save the failure logs.

    Returns:
        None. Errors are reported with print() instead of being raised.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    headers = {"User-Agent": email}
    subset_name = os.path.splitext(os.path.basename(pmids_file))[0]
    failure_log = os.path.join(failure_log_dir, f"{subset_name}_failed_fetches.txt")
    # Without a timeout a single stalled connection blocks the whole run.
    request_timeout = 30
    # NCBI's published E-utilities limits are 10 requests/second with an API
    # key and 3 requests/second without one.
    pause = 0.1105 if api_key else 0.34

    def log_failure(pmid):
        # Remember the PMID so it can be retried later.
        with open(failure_log, 'a', encoding='utf-8') as fail_file:
            fail_file.write(f"{pmid}\n")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(failure_log_dir, exist_ok=True)

    try:
        with open(pmids_file, 'r', encoding='utf-8') as file:
            # Skip blank lines: an empty PMID would be requested with no id
            # and stored as a file literally named ".txt".
            pmids = [line.strip() for line in file if line.strip()]

        total = len(pmids)
        for i, pmid in enumerate(pmids):
            final_output_path = os.path.join(output_dir, f"{pmid}.txt")
            temp_output_path = os.path.join(output_dir, f"{pmid}.tmp")
            if os.path.exists(final_output_path):
                print(f"PMID {pmid} already fetched. Skipping ({i+1}/{total})")
                continue

            # https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly
            params = {
                "db": "pubmed",
                "id": pmid,
                "rettype": "abstract",
                "retmode": "text",
            }
            if api_key:
                # Only send the key when one was actually provided.
                params["api_key"] = api_key

            try:
                response = requests.get(
                    base_url, headers=headers, params=params, timeout=request_timeout
                )

                if response.status_code == 200:
                    with open(temp_output_path, 'w', encoding='utf-8') as temp_file:
                        temp_file.write(response.text)
                    # Move into place only after the body is fully written.
                    os.replace(temp_output_path, final_output_path)
                else:
                    print(f"Failed to fetch PMID {pmid}: {response.status_code}")
                    log_failure(pmid)

            except Exception as e:
                print(f"Error fetching PMID {pmid}: {e}")
                log_failure(pmid)

            time.sleep(pause)  # Stay under the request-rate limit

    except Exception as e:
        print(f"Error fetching articles: {e}")

def main():
    """
    Downloads abstracts for every PMID subset file in ./data/matching_pmids_subsets.

    Each "<name>.txt" subset is fetched into
    ./data/fetched_articles_subsets/<name>, with failures logged under
    ./data/failed_fetches. Files without a .txt extension are ignored, and
    subsets are processed in sorted order so runs are repeatable.

    The PubMed credentials are read from the NCBI_EMAIL and NCBI_API_KEY
    environment variables (empty string when unset).
    """
    # Directory containing subset files
    subsets_dir = "./data/matching_pmids_subsets"

    # Directory to save fetched articles
    fetched_articles_subsets_dir = "./data/fetched_articles_subsets"

    # Directory to save failure logs
    failure_log_dir = "./data/failed_fetches"

    # PubMed API credentials (kept out of the source)
    email = os.environ.get("NCBI_EMAIL", "")
    api_key = os.environ.get("NCBI_API_KEY", "")

    # Only .txt files are subsets; anything else in the directory is skipped
    # up front so it is not reported as "processed".
    subset_files = sorted(
        name for name in os.listdir(subsets_dir) if name.endswith(".txt")
    )

    # Iterate over each subset file and fetch articles
    for subset_file in subset_files:
        print(f"Processing subset file: {subset_file}")
        start_time = time.time()
        subset_path = os.path.join(subsets_dir, subset_file)
        subset_output_dir = os.path.join(fetched_articles_subsets_dir, os.path.splitext(subset_file)[0])
        fetch_articles(subset_path, subset_output_dir, email, api_key, failure_log_dir)
        print(f"Finished processing subset file: {subset_file}. Took {time.time() - start_time:.2f} seconds")

if __name__ == "__main__":
    main()

Fetched single PMID 35351360 in XML format.


# Fetch full-text articles in BioC format from PMC OA. 
I didn't use Entrez because the full-text articles it returns are in JATS XML, which PubTator3/AIONER can't process. I used the BioC API for PMC Open Access instead ([link here](https://www.ncbi.nlm.nih.gov/research/bionlp/APIs/BioC-PMC/)).

In [None]:
import os
import requests
import time
from tqdm import tqdm  # Progress bar

def fetch_pmc_articles(pmcids_file, output_dir, failure_log_dir):
    """
    Fetches full-text PMC articles in BioC XML format using the BioC API.

    The PMCID is taken from the second tab-separated column of each non-blank
    line of pmcids_file. Lines without a usable second column are skipped (and
    counted in a warning) instead of aborting the whole subset. Each article is
    saved as <output_dir>/<pmcid>.xml via a .tmp file that is moved into place
    once fully written, so existing .xml files are skipped on later runs.

    A response is only saved when it has status 200 and its body starts with
    an XML declaration, the same check fetch_single_pmid_article applies.
    Everything else, including exceptions raised while fetching or saving, is
    recorded in <failure_log_dir>/<subset>_failed_fetches.txt.

    Args:
        pmcids_file (str): Path to the file containing PMCIDs.
        output_dir (str): Directory to save the fetched articles.
        failure_log_dir (str): Directory to save the failure logs.

    Returns:
        None. Errors reading pmcids_file are reported with print().
    """
    # ascii instead of unicode for easier processing and compatibility
    base_url = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/{}/ascii"

    subset_name = os.path.splitext(os.path.basename(pmcids_file))[0]
    failure_log = os.path.join(failure_log_dir, f"{subset_name}_failed_fetches.txt")

    def log_failure(pmcid):
        # Remember the PMCID so it can be retried later.
        with open(failure_log, 'a', encoding='utf-8') as fail_file:
            fail_file.write(f"{pmcid}\n")

    # Ensure directories exist
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(failure_log_dir, exist_ok=True)

    try:
        pmcids = []
        skipped_lines = 0
        with open(pmcids_file, 'r', encoding='utf-8') as file:
            for line in file:
                stripped = line.strip()
                if not stripped:
                    continue
                columns = stripped.split("\t")
                # Extract only PMCIDs (second column); tolerate malformed rows.
                if len(columns) < 2 or not columns[1].strip():
                    skipped_lines += 1
                    continue
                pmcids.append(columns[1].strip())

        if skipped_lines:
            print(f"Warning: skipped {skipped_lines} line(s) without a PMCID column in {pmcids_file}")

        # tqdm renders the progress bar, so nothing is printed per article.
        for pmcid in tqdm(pmcids, desc=f"Downloading {subset_name}", unit="article"):
            final_output_path = os.path.join(output_dir, f"{pmcid}.xml")
            temp_output_path = os.path.join(output_dir, f"{pmcid}.tmp")

            # Skip if already fetched
            if os.path.exists(final_output_path):
                continue

            # Fetch article from BioC API
            url = base_url.format(pmcid)
            try:
                response = requests.get(url, timeout=10)

                # A 200 status alone is not trusted: a non-XML body must not
                # be stored as <pmcid>.xml and then be skipped forever.
                if response.status_code == 200 and response.text.lstrip().startswith('<?xml'):
                    with open(temp_output_path, 'w', encoding='utf-8') as temp_file:
                        temp_file.write(response.text)
                    os.replace(temp_output_path, final_output_path)  # Move after successful fetch
                else:
                    log_failure(pmcid)

            except Exception:
                log_failure(pmcid)

            time.sleep(0.5)  # Prevent overloading the API

    except Exception as e:
        print(f"❌ Error processing {pmcids_file}: {e}")

def fetch_single_pmc_article(pmcid, output_dir, failure_log_dir):
    """
    Fetches a single full-text PMC article in BioC XML format using the BioC API.

    The article is saved as <output_dir>/<pmcid>.xml via a .tmp file that is
    moved into place once fully written. If that .xml file already exists the
    function returns immediately without contacting the API.

    A response is only saved when it has status 200 and its body starts with
    an XML declaration, the same check fetch_single_pmid_article applies.
    Everything else, including exceptions raised while fetching or saving,
    appends the PMCID to <failure_log_dir>/failed_fetches.txt.

    Args:
        pmcid (str): The PMCID of the article to fetch.
        output_dir (str): Directory to save the fetched article.
        failure_log_dir (str): Directory to save the failure log.

    Returns:
        None. The outcome is reported with print().
    """
    base_url = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/{}/ascii"
    failure_log = os.path.join(failure_log_dir, "failed_fetches.txt")

    def log_failure():
        # Remember the PMCID so it can be retried later.
        with open(failure_log, 'a', encoding='utf-8') as fail_file:
            fail_file.write(f"{pmcid}\n")

    # Ensure directories exist
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(failure_log_dir, exist_ok=True)

    final_output_path = os.path.join(output_dir, f"{pmcid}.xml")
    temp_output_path = os.path.join(output_dir, f"{pmcid}.tmp")

    # Skip if already fetched
    if os.path.exists(final_output_path):
        print(f"Article {pmcid} already fetched.")
        return

    # Fetch article from BioC API
    url = base_url.format(pmcid)
    try:
        response = requests.get(url, timeout=10)

        # A 200 status alone is not trusted: a non-XML body must not be
        # stored as <pmcid>.xml and then be skipped forever.
        if response.status_code == 200 and response.text.lstrip().startswith('<?xml'):
            with open(temp_output_path, 'w', encoding='utf-8') as temp_file:
                temp_file.write(response.text)
            os.replace(temp_output_path, final_output_path)  # Move after successful fetch
            print(f"Successfully fetched article {pmcid}.")
        else:
            log_failure()
            print(f"Failed to fetch article {pmcid}.")

    except Exception as e:
        log_failure()
        print(f"Error fetching article {pmcid}: {e}")

    time.sleep(0.5)  # Prevent overloading the API

def fetch_single_pmid_article(pmid, output_dir, failure_log_dir):
    """
    Downloads one PubMed record as BioC XML through the BioC RESTful service.

    The result is stored as <output_dir>/<pmid>.xml. If that file is already
    present nothing is requested. A response is accepted only when its status
    is 200 and its body begins with '<?xml'; any other response, or any
    exception raised along the way, appends the PMID to
    <failure_log_dir>/failed_fetches.txt.

    Args:
        pmid (str): The PMID of the article to fetch.
        output_dir (str): Directory to save the fetched article.
        failure_log_dir (str): Directory to save the failure log.
    """
    endpoint = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pubmed.cgi/BioC_xml/{}/ascii"
    failures_path = os.path.join(failure_log_dir, "failed_fetches.txt")

    for needed_dir in (output_dir, failure_log_dir):
        os.makedirs(needed_dir, exist_ok=True)

    xml_path = os.path.join(output_dir, f"{pmid}.xml")
    partial_path = os.path.join(output_dir, f"{pmid}.tmp")

    def note_failure():
        # Append this PMID to the failure log.
        with open(failures_path, 'a', encoding='utf-8') as log:
            log.write(f"{pmid}\n")

    # Nothing to do when a finished download is already on disk.
    if os.path.exists(xml_path):
        print(f"Article {pmid} already fetched.")
        return

    try:
        reply = requests.get(endpoint.format(pmid), timeout=10)
        accepted = reply.status_code == 200 and reply.text.startswith('<?xml')
        if not accepted:
            note_failure()
            print(f"Failed to fetch article {pmid}.")
        else:
            # Write to a scratch file first, then rename into place.
            with open(partial_path, 'w', encoding='utf-8') as partial:
                partial.write(reply.text)
            os.rename(partial_path, xml_path)
            print(f"Successfully fetched article {pmid}.")
    except Exception as err:
        note_failure()
        print(f"Error fetching article {pmid}: {err}")

    # Pause after every attempted request.
    time.sleep(0.5)

def main():
    """
    Fetches the abstract of a single PMID in BioC XML format.

    Output and failure-log directories live under one data directory. It
    defaults to the location used originally and can be redirected, without
    editing the notebook, by setting the GOLLM_DATA_DIR environment variable.
    On Windows the default resolves to exactly the original paths.

    Two alternative workflows (bulk full-text download and a single PMC
    full-text download) are kept below as commented-out code.
    """
    # Root of all data directories; an unset or empty variable means "default".
    data_dir = os.environ.get("GOLLM_DATA_DIR") or r"C:\Users\aivan\Desktop\BIOIN 401\GOLLM\data"

    # # ============================================ Fetch full-text PMC articles (set of full texts) ============================================
    # # Directory containing subset files
    # subsets_dir = os.path.join(data_dir, "matching_pmcids_subsets")

    # # Directory to save fetched full-text articles
    # fetched_articles_subsets_dir = os.path.join(data_dir, "fetched_full_articles_subsets")

    # # Directory to save failure logs
    # failure_log_dir = os.path.join(data_dir, "failed_full_text_fetches")

    # # Iterate over each subset file and fetch articles
    # for subset_file in os.listdir(subsets_dir):
    #     print(f"\n📂 Processing subset file: {subset_file}")
    #     start_time = time.time()

    #     if subset_file.endswith(".txt"):
    #         subset_path = os.path.join(subsets_dir, subset_file)
    #         subset_output_dir = os.path.join(fetched_articles_subsets_dir, os.path.splitext(subset_file)[0])

    #         fetch_pmc_articles(subset_path, subset_output_dir, failure_log_dir)

    #     print(f"✅ Finished processing {subset_file}. Took {time.time() - start_time:.2f} seconds")

    # # ============================================ Fetch a single PMC article (full text) ============================================
    # pmcid = "PMC1866366"
    # output_dir = os.path.join(data_dir, "fetched_single_article")
    # failure_log_dir = os.path.join(data_dir, "failed_single_article_fetch")

    # fetch_single_pmc_article(pmcid, output_dir, failure_log_dir)

    # ============================================ Fetch a single PMID article (abstract) ============================================
    pmid = "35351360"
    output_dir = os.path.join(data_dir, "fetched_single_article")
    failure_log_dir = os.path.join(data_dir, "failed_single_article_fetch")
    fetch_single_pmid_article(pmid, output_dir, failure_log_dir)



if __name__ == "__main__":
    main()


Successfully fetched article 35351360.
