In [None]:
import gzip
import os
import requests
import time

def filter_matching_pmids(gene_gz, mutation_gz, output_file):
    """
    Filters PMIDs that are present in both gene2pubtator3 and mutation2pubtator3 files.

    Args:
        gene_gz (str): Path to the gene2pubtator3.gz file.
        mutation_gz (str): Path to the mutation2pubtator3.gz file.
        output_file (str): Path to the output file for matched PMIDs.
    """
    try:
        # Read gene2pubtator3 PMIDs
        with gzip.open(gene_gz, 'rt', encoding='utf-8') as gene_file:
            gene_pmids = set(line.split('\t')[0] for line in gene_file)

        # Read mutation2pubtator3 PMIDs and find intersection
        matching_pmids = set()
        with gzip.open(mutation_gz, 'rt', encoding='utf-8') as mutation_file:
            for line in mutation_file:
                pmid = line.split('\t')[0]
                if pmid in gene_pmids:
                    matching_pmids.add(pmid)

        # Write matching PMIDs to output file
        with open(output_file, 'w', encoding='utf-8') as out_file:
            for pmid in matching_pmids:
                out_file.write(f"{pmid}\n")

        print(f"Successfully wrote {len(matching_pmids)} matching PMIDs to {output_file}")

    except Exception as e:
        print(f"Error filtering PMIDs: {e}")


Successfully wrote 1097341 matching PMIDs to matching_pmids.txt
Fetched article for PMID 29285173 (1/1097341)
Fetched article for PMID 26816539 (2/1097341)
Fetched article for PMID 37852671 (3/1097341)
Fetched article for PMID 20484129 (4/1097341)
Fetched article for PMID 30745825 (5/1097341)
Fetched article for PMID 36010642 (6/1097341)
Fetched article for PMID 35903126 (7/1097341)
Fetched article for PMID 35979433 (8/1097341)
Fetched article for PMID 26204423 (9/1097341)
Fetched article for PMID 34944759 (10/1097341)
Fetched article for PMID 23707372 (11/1097341)
Fetched article for PMID 22685551 (12/1097341)
Fetched article for PMID 36037371 (13/1097341)
Fetched article for PMID 32218512 (14/1097341)
Fetched article for PMID 32984281 (15/1097341)
Fetched article for PMID 34603665 (16/1097341)
Fetched article for PMID 8787920 (17/1097341)
Fetched article for PMID 28717233 (18/1097341)
Fetched article for PMID 34286374 (19/1097341)
Fetched article for PMID 32849807 (20/1097341)
Fetche

In [3]:
import gzip
import os
import requests
import time

def fetch_articles(pmids_file, output_dir, email, api_key):
    """
    Fetches article metadata and abstracts using the PubMed API for a list of PMIDs.

    Args:
        pmids_file (str): Path to the file containing PMIDs.
        output_dir (str): Directory to save the fetched articles.
        email (str): Email address for the PubMed API.
        api_key (str): API key for the PubMed API.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    headers = {"User-Agent": email}

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    try:
        with open(pmids_file, 'r', encoding='utf-8') as file:
            pmids = [line.strip() for line in file]

        for i, pmid in enumerate(pmids):
            params = {
                "db": "pubmed",
                "id": pmid,
                "rettype": "abstract",
                "retmode": "text",
                "api_key": api_key
            }
            response = requests.get(base_url, headers=headers, params=params)

            if response.status_code == 200:
                with open(os.path.join(output_dir, f"{pmid}.txt"), 'w', encoding='utf-8') as out_file:
                    out_file.write(response.text)
                print(f"Fetched article for PMID {pmid} ({i+1}/{len(pmids)})")
            else:
                print(f"Failed to fetch PMID {pmid}: {response.status_code}")

            time.sleep(0.5)  # To avoid hitting API rate limits

    except Exception as e:
        print(f"Error fetching articles: {e}")

def main():
    # Paths to input files
    gene_gz = "./data/gene2pubtator3.gz"
    mutation_gz = "./data/mutation2pubtator3.gz"

    # Path to output file for matching PMIDs
    matching_pmids_file = "./data/matching_pmids.txt"

    # Directory to save fetched articles
    articles_dir = "./data/fetched_articles"

    # PubMed API credentials
    email = ""  # Replace with your email
    api_key = ""  # Replace with your API key

    # Filter and write matching PMIDs
    #filter_matching_pmids(gene_gz, mutation_gz, matching_pmids_file)

    # Fetch articles for matching PMIDs
    fetch_articles(matching_pmids_file, articles_dir, email, api_key)

if __name__ == "__main__":
    main()


Fetched article for PMID 29285173 (1/1097341)
Fetched article for PMID 26816539 (2/1097341)
Fetched article for PMID 37852671 (3/1097341)
Fetched article for PMID 20484129 (4/1097341)
Fetched article for PMID 30745825 (5/1097341)
Fetched article for PMID 36010642 (6/1097341)
Fetched article for PMID 35903126 (7/1097341)
Fetched article for PMID 35979433 (8/1097341)
Fetched article for PMID 26204423 (9/1097341)
Fetched article for PMID 34944759 (10/1097341)
Fetched article for PMID 23707372 (11/1097341)
Fetched article for PMID 22685551 (12/1097341)
Fetched article for PMID 36037371 (13/1097341)
Fetched article for PMID 32218512 (14/1097341)
Fetched article for PMID 32984281 (15/1097341)
Fetched article for PMID 34603665 (16/1097341)
Fetched article for PMID 8787920 (17/1097341)
Fetched article for PMID 28717233 (18/1097341)
Fetched article for PMID 34286374 (19/1097341)
Fetched article for PMID 32849807 (20/1097341)
Fetched article for PMID 36171780 (21/1097341)
Fetched article for PMI