In [2]:
from Bio import Entrez
import time

# Set your email here
Entrez.email = "relaxchumma@gmail.com"

def fetch_pmids(journal_name, retmax, year):
    # E-utilities search query
    search_query = f'"{journal_name}"[Journal] AND ("{year}"[Date - Publication])'
    handle = Entrez.esearch(db="pubmed", term=search_query, retmax=retmax)
    record = Entrez.read(handle)
    handle.close()
    return record['IdList'], int(record['Count'])

def fetch_all_pmids(journal_name):
    pmids = []
    retmax = 10000  # Number of results to fetch at a time
    for year in range(1945, 2025):
        pmid_list, total_count = fetch_pmids(journal_name, retmax, year)
        print(f"Year: {year}, Total count: {total_count}")
        pmids.extend(pmid_list)
        time.sleep(0.5)  # To avoid hitting the server too hard
    return pmids

total_pmid = fetch_all_pmids("Lancet")
print(f"Total PMIDs fetched: {len(total_pmid)}")

HTTPError: HTTP Error 500: Internal Server Error

In [2]:
uniques = set(total_pmid)
uniques = list(uniques)
print(len(uniques))

import pickle
with open("store_uniques.pkl", 'wb') as f:
    pickle.dump(uniques, f)

143632


In [1]:
import pickle
with open("store_uniques.pkl", 'rb') as f:
    uniques = pickle.load(f)
print(len(uniques))
print(uniques[54100])

143632
7903995


In [3]:
from Bio import Entrez
from xml.etree import ElementTree as ET

def check_pmc_ids(pm_ids):
    Entrez.email = "relaxchumma@gmail.com"
    
    # Join the PubMed IDs into a single comma-separated string
    ids_str = ','.join(pm_ids)
    
    handle = Entrez.efetch(db="pubmed", id=ids_str, rettype="full", retmode="xml")
    xml_data = handle.read()
    handle.close()
    root = ET.fromstring(xml_data)
    
    pmc_dict = {}
    
    for article in root.findall(".//PubmedArticle"):
        pmid = article.find(".//PMID").text
        pmc_id = None
        for article_id in article.findall(".//ArticleId"):
            if article_id.attrib.get("IdType") == "pmc":
                pmc_id = article_id.text
                break
        pmc_dict[pmid] = pmc_id is not None
    
    return pmc_dict

# Batching the PubMed IDs for efficient fetching
batch_size = 100
for i in range(54100, len(uniques), batch_size):
    pmid_batch = uniques[i:i + batch_size]
    pmc_dict = check_pmc_ids(pmid_batch)
    
    for pmid, is_pmc in pmc_dict.items():
        if is_pmc:
            with open("pmc.txt", 'a') as f:
                f.write(f'{pmid}\n')
        else:
            with open("non_pmc.txt", 'a') as f:
                f.write(f'{pmid}\n')