In [None]:
import gzip
import os
import requests
import time

def filter_matching_pmids(gene_gz, mutation_gz, output_file):
    """
    Filters PMIDs that are present in both gene2pubtator3 and mutation2pubtator3 files.

    The PMID is taken from the first tab-separated column of every line. Blank
    lines (and lines whose first column is empty) are ignored, and surrounding
    whitespace/newlines are stripped so a line without any tab still yields a
    clean PMID. The matching PMIDs are written one per line in a deterministic
    order (shorter IDs first, then lexicographic, i.e. numeric order for plain
    integer PMIDs).

    Any error (missing file, corrupt gzip, ...) is reported with ``print`` and
    not re-raised.

    Args:
        gene_gz (str): Path to the gene2pubtator3.gz file.
        mutation_gz (str): Path to the mutation2pubtator3.gz file.
        output_file (str): Path to the output file for matched PMIDs.
    """

    def iter_pmids(path):
        # Stream the first column of a gzip-compressed TSV, skipping empty IDs.
        with gzip.open(path, 'rt', encoding='utf-8') as handle:
            for line in handle:
                pmid = line.split('\t', 1)[0].strip()
                if pmid:
                    yield pmid

    try:
        # Only the gene PMIDs are held in memory; the mutation file is streamed.
        gene_pmids = set(iter_pmids(gene_gz))
        matching_pmids = gene_pmids.intersection(iter_pmids(mutation_gz))

        # Sort so that repeated runs produce byte-identical output files.
        with open(output_file, 'w', encoding='utf-8') as out_file:
            for pmid in sorted(matching_pmids, key=lambda p: (len(p), p)):
                out_file.write(f"{pmid}\n")

        print(f"Successfully wrote {len(matching_pmids)} matching PMIDs to {output_file}")

    except Exception as e:
        print(f"Error filtering PMIDs: {e}")


In [None]:
import gzip
import os
import requests
import time

def fetch_articles(pmids_file, output_dir, email, api_key, request_delay=0.13, timeout=30):
    """
    Fetches article metadata and abstracts using the PubMed API for a list of PMIDs.

    Each PMID is requested individually from the E-utilities ``efetch`` endpoint
    and stored as ``<output_dir>/<pmid>.txt``. The response is written to a
    ``.tmp`` file first and then moved into place, and PMIDs whose ``.txt`` file
    already exists are skipped, so the function can be re-run after an
    interruption. PMIDs that could not be fetched are appended to
    ``failed_fetches.txt`` in the current working directory.

    Errors are reported with ``print`` and not re-raised.

    Args:
        pmids_file (str): Path to the file containing PMIDs (one per line;
            blank lines are ignored).
        output_dir (str): Directory to save the fetched articles.
        email (str): Email address for the PubMed API.
        api_key (str): API key for the PubMed API. May be empty, in which case
            no key is sent and requests are throttled to the keyless limit.
        request_delay (float): Seconds to wait after each request.
        timeout (float): Seconds to wait for the server before giving up on a
            single request.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    # Avoid sending an empty User-Agent header when no email was supplied.
    headers = {"User-Agent": email} if email else {}
    failure_log = "failed_fetches.txt"

    # NCBI allows about 3 requests/second without an API key (10 with one);
    # never go faster than that when no key is supplied.
    delay = request_delay if api_key else max(request_delay, 0.34)

    def record_failure(failed_pmid):
        # Append so that failures from earlier runs are kept.
        with open(failure_log, 'a', encoding='utf-8') as fail_file:
            fail_file.write(f"{failed_pmid}\n")

    os.makedirs(output_dir, exist_ok=True)

    try:
        with open(pmids_file, 'r', encoding='utf-8') as file:
            # Blank lines would otherwise turn into requests with an empty id.
            pmids = [line.strip() for line in file if line.strip()]

        total = len(pmids)
        for i, pmid in enumerate(pmids):
            final_output_path = os.path.join(output_dir, f"{pmid}.txt")
            temp_output_path = os.path.join(output_dir, f"{pmid}.tmp")
            if os.path.exists(final_output_path):
                print(f"PMID {pmid} already fetched. Skipping ({i+1}/{total})")
                continue

            params = {
                "db": "pubmed",
                "id": pmid,
                "rettype": "abstract",
                "retmode": "text",
            }
            # Only send credentials that were actually provided.
            if api_key:
                params["api_key"] = api_key
            if email:
                params["email"] = email

            try:
                # The timeout keeps one stalled connection from hanging the run.
                response = requests.get(base_url, headers=headers, params=params, timeout=timeout)

                if response.status_code == 200:
                    with open(temp_output_path, 'w', encoding='utf-8') as temp_file:
                        temp_file.write(response.text)
                    # os.replace overwrites atomically on every platform.
                    os.replace(temp_output_path, final_output_path)
                    print(f"Fetched article for PMID {pmid} ({i+1}/{total})")
                else:
                    print(f"Failed to fetch PMID {pmid}: {response.status_code}")
                    record_failure(pmid)

            except Exception as e:
                print(f"Error fetching PMID {pmid}: {e}")
                record_failure(pmid)

            time.sleep(delay)

    except Exception as e:
        print(f"Error fetching articles: {e}")

def main():
    """
    Download abstracts for every PMID listed in ``./data/matching_pmids.txt``.

    The PubTator3 filtering step that produces that list is currently disabled
    (commented out), so the list file is expected to exist already. The email
    and API key are empty placeholders that must be filled in before use.
    """
    # PubMed API credentials -- replace with your own email and API key.
    contact_email, ncbi_key = "", ""

    # PubTator3 dumps consumed by the (disabled) filtering step below.
    gene_dump = "./data/gene2pubtator3.gz"
    mutation_dump = "./data/mutation2pubtator3.gz"

    # List of PMIDs found in both dumps, and where their abstracts are saved.
    pmid_list_path = "./data/matching_pmids.txt"
    download_dir = "./data/fetched_articles"

    # Step 1 (disabled): rebuild the list of matching PMIDs.
    # filter_matching_pmids(gene_dump, mutation_dump, pmid_list_path)

    # Step 2: fetch one abstract per PMID.
    fetch_articles(
        pmids_file=pmid_list_path,
        output_dir=download_dir,
        email=contact_email,
        api_key=ncbi_key,
    )

# Run the fetch when this cell/script is executed directly; the guard is also
# true inside a notebook kernel, so executing the cell starts the download.
if __name__ == "__main__":
    main()


In [13]:
import gzip
import os
import requests
import time

def fetch_articles(pmids_file, output_dir, email, api_key, request_delay=0.1105, timeout=30):
    """
    Fetches article metadata and abstracts using the PubMed API for a list of PMIDs.

    Each PMID is requested individually from the E-utilities ``efetch`` endpoint
    and stored as ``<output_dir>/<pmid>.txt``. The response is written to a
    ``.tmp`` file first and then moved into place, and PMIDs whose ``.txt`` file
    already exists are skipped, so the function can be re-run after an
    interruption. PMIDs that could not be fetched are appended to
    ``failed_fetches.txt`` inside ``output_dir``. Successful fetches are not
    logged; only skips and failures are printed.

    Errors are reported with ``print`` and not re-raised.

    Args:
        pmids_file (str): Path to the file containing PMIDs (one per line;
            blank lines are ignored).
        output_dir (str): Directory to save the fetched articles.
        email (str): Email address for the PubMed API.
        api_key (str): API key for the PubMed API. May be empty, in which case
            no key is sent and requests are throttled to the keyless limit.
        request_delay (float): Seconds to wait after each request.
        timeout (float): Seconds to wait for the server before giving up on a
            single request.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    # Avoid sending an empty User-Agent header when no email was supplied.
    headers = {"User-Agent": email} if email else {}
    failure_log = os.path.join(output_dir, "failed_fetches.txt")

    # NCBI allows about 3 requests/second without an API key (10 with one);
    # never go faster than that when no key is supplied.
    delay = request_delay if api_key else max(request_delay, 0.34)

    def record_failure(failed_pmid):
        # Append so that failures from earlier runs are kept.
        with open(failure_log, 'a', encoding='utf-8') as fail_file:
            fail_file.write(f"{failed_pmid}\n")

    os.makedirs(output_dir, exist_ok=True)

    try:
        with open(pmids_file, 'r', encoding='utf-8') as file:
            # Blank lines would otherwise turn into requests with an empty id.
            pmids = [line.strip() for line in file if line.strip()]

        total = len(pmids)
        for i, pmid in enumerate(pmids):
            final_output_path = os.path.join(output_dir, f"{pmid}.txt")
            temp_output_path = os.path.join(output_dir, f"{pmid}.tmp")
            if os.path.exists(final_output_path):
                print(f"PMID {pmid} already fetched. Skipping ({i+1}/{total})")
                continue

            params = {
                "db": "pubmed",
                "id": pmid,
                "rettype": "abstract",
                "retmode": "text",
            }
            # Only send credentials that were actually provided.
            if api_key:
                params["api_key"] = api_key
            if email:
                params["email"] = email

            try:
                # The timeout keeps one stalled connection from hanging the run.
                response = requests.get(base_url, headers=headers, params=params, timeout=timeout)

                if response.status_code == 200:
                    with open(temp_output_path, 'w', encoding='utf-8') as temp_file:
                        temp_file.write(response.text)
                    # os.replace overwrites atomically on every platform.
                    os.replace(temp_output_path, final_output_path)
                else:
                    print(f"Failed to fetch PMID {pmid}: {response.status_code}")
                    record_failure(pmid)

            except Exception as e:
                print(f"Error fetching PMID {pmid}: {e}")
                record_failure(pmid)

            time.sleep(delay)

    except Exception as e:
        print(f"Error fetching articles: {e}")

def main():
    """
    Fetch abstracts for every PMID subset file in ``./data/matching_pmids_subsets``.

    Each ``<name>.txt`` file in the subsets directory is passed to
    ``fetch_articles`` and its articles are saved under
    ``./data/fetched_articles_subsets/<name>``. Entries that do not end in
    ``.txt`` are ignored. Subsets are processed in sorted filename order and
    the elapsed time of each one is printed.

    The PubMed credentials are read from the ``NCBI_EMAIL`` and
    ``NCBI_API_KEY`` environment variables so that they are never stored in
    the notebook.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
    """
    # Directory containing subset files
    subsets_dir = "./data/matching_pmids_subsets"

    # Directory to save fetched articles
    fetched_articles_subsets_dir = "./data/fetched_articles_subsets"

    # PubMed API credentials come from the environment, never from source.
    email = os.environ.get("NCBI_EMAIL", "").strip()
    api_key = os.environ.get("NCBI_API_KEY", "").strip()
    if not email or not api_key:
        raise RuntimeError(
            "Set the NCBI_EMAIL and NCBI_API_KEY environment variables before running."
        )

    # Sorted so that the processing order is the same on every run.
    for subset_file in sorted(os.listdir(subsets_dir)):
        # Only report and time the files that are actually processed.
        if not subset_file.endswith(".txt"):
            continue

        print(f"Processing subset file: {subset_file}")
        start_time = time.perf_counter()
        subset_path = os.path.join(subsets_dir, subset_file)
        subset_output_dir = os.path.join(fetched_articles_subsets_dir, os.path.splitext(subset_file)[0])
        fetch_articles(subset_path, subset_output_dir, email, api_key)
        print(f"Finished processing subset file: {subset_file}. Took {time.perf_counter() - start_time:.2f} seconds")
# Process all subset files when this cell/script is executed directly; the
# guard is also true inside a notebook kernel.
if __name__ == "__main__":
    main()

Processing subset file: subset_1.txt
PMID 29285173 already fetched. Skipping (1/18599)
PMID 26816539 already fetched. Skipping (2/18599)
PMID 37852671 already fetched. Skipping (3/18599)
PMID 20484129 already fetched. Skipping (4/18599)
PMID 30745825 already fetched. Skipping (5/18599)
PMID 36010642 already fetched. Skipping (6/18599)
PMID 35903126 already fetched. Skipping (7/18599)
PMID 35979433 already fetched. Skipping (8/18599)
PMID 26204423 already fetched. Skipping (9/18599)
PMID 34944759 already fetched. Skipping (10/18599)
PMID 23707372 already fetched. Skipping (11/18599)
PMID 22685551 already fetched. Skipping (12/18599)
PMID 36037371 already fetched. Skipping (13/18599)
PMID 32218512 already fetched. Skipping (14/18599)
PMID 32984281 already fetched. Skipping (15/18599)
PMID 34603665 already fetched. Skipping (16/18599)
PMID 8787920 already fetched. Skipping (17/18599)
PMID 28717233 already fetched. Skipping (18/18599)
PMID 34286374 already fetched. Skipping (19/18599)
PMID

KeyboardInterrupt: 