In [11]:
from Bio import Entrez
import time

# Set your email here
Entrez.email = "relaxchumma@gmail.com"

def fetch_pmids(journal_name, retmax, year):
    # E-utilities search query
    search_query = f'"{journal_name}"[Journal] AND ("{year}"[Date - Publication])'
    handle = Entrez.esearch(db="pubmed", term=search_query, retmax=retmax)
    record = Entrez.read(handle)
    handle.close()
    return record['IdList'], int(record['Count'])

def fetch_all_pmids(journal_name):
    pmids = []
    retmax = 10000  # Number of results to fetch at a time
    for year in range(1915, 2025):
        pmid_list, total_count = fetch_pmids(journal_name, retmax, year)
        print(f"Year: {year}, Total count: {total_count}")
        pmids.extend(pmid_list)
        time.sleep(0.5)  # To avoid hitting the server too hard
    return pmids

total_pmid = fetch_all_pmids("Proceedings of the National Academy of Sciences of the United States of America")
print(f"Total PMIDs fetched: {len(total_pmid)}")

Year: 1915, Total count: 169
Year: 1916, Total count: 177
Year: 1917, Total count: 169
Year: 1918, Total count: 92
Year: 1919, Total count: 106
Year: 1920, Total count: 139
Year: 1921, Total count: 81
Year: 1922, Total count: 98
Year: 1923, Total count: 111
Year: 1924, Total count: 123
Year: 1925, Total count: 195
Year: 1926, Total count: 173
Year: 1927, Total count: 201
Year: 1928, Total count: 210
Year: 1929, Total count: 192
Year: 1930, Total count: 151
Year: 1931, Total count: 141
Year: 1932, Total count: 148
Year: 1933, Total count: 198
Year: 1934, Total count: 153
Year: 1935, Total count: 150
Year: 1936, Total count: 147
Year: 1937, Total count: 134
Year: 1938, Total count: 118
Year: 1939, Total count: 142
Year: 1940, Total count: 142
Year: 1941, Total count: 128
Year: 1942, Total count: 115
Year: 1943, Total count: 80
Year: 1944, Total count: 79
Year: 1945, Total count: 83
Year: 1946, Total count: 71
Year: 1947, Total count: 83
Year: 1948, Total count: 110
Year: 1949, Total coun

In [12]:
uniques = set(total_pmid)
uniques = list(uniques)
print(len(uniques))

import pickle
with open("store_uniques.pkl", 'wb') as f:
    pickle.dump(uniques, f)

156367


In [1]:
import pickle
with open("store_uniques.pkl", 'rb') as f:
    uniques = pickle.load(f)
print(len(uniques))

156367


In [5]:
print(uniques[77910])

15067124


In [6]:
from Bio import Entrez
from xml.etree import ElementTree as ET

def check_pmc_ids(pm_ids):
    Entrez.email = "relaxchumma@gmail.com"
    
    # Join the PubMed IDs into a single comma-separated string
    ids_str = ','.join(pm_ids)
    
    handle = Entrez.efetch(db="pubmed", id=ids_str, rettype="full", retmode="xml")
    xml_data = handle.read()
    handle.close()
    root = ET.fromstring(xml_data)
    
    pmc_dict = {}
    
    for article in root.findall(".//PubmedArticle"):
        pmid = article.find(".//PMID").text
        pmc_id = None
        for article_id in article.findall(".//ArticleId"):
            if article_id.attrib.get("IdType") == "pmc":
                pmc_id = article_id.text
                break
        pmc_dict[pmid] = pmc_id is not None
    
    return pmc_dict

# Batching the PubMed IDs for efficient fetching
batch_size = 50
for i in range(77910, len(uniques), batch_size):
    pmid_batch = uniques[i:i + batch_size]
    pmc_dict = check_pmc_ids(pmid_batch)
    
    for pmid, is_pmc in pmc_dict.items():
        if is_pmc:
            with open("pmc_1.txt", 'a') as f:
                f.write(f'{pmid}\n')
        else:
            with open("non_pmc_1.txt", 'a') as f:
                f.write(f'{pmid}\n')

In [4]:
filename = 'non_pmc_1.txt'

with open(filename, 'r') as file:
    pmids = [line.strip() for line in file if line.strip()]

print(pmids)

import pickle
with open("non_pmc.pkl", 'wb') as f:
    pickle.dump(pmids, f)

['38830090', '38857386', '17668480', '38865269', '9499217', '38861602', '38857388', '38843253', '38857403', '38838020', '38833466', '38830098', '8633090', '38830103', '38843184', '20987695', '38861608', '38830112', '38865272', '38857394', '38857390', '38861601', '38830100', '38857397', '38838019', '38838011', '38857392', '38833470', '38861604', '38861594', '38781227', '38833465', '38857400', '38833468', '38865271', '38830107', '38833474', '38861599', '38838013', '34934013', '12046582', '38830095', '38830094', '38833467', '38865275', '38830109', '8992486', '38833473', '38843187', '38865261', '38833475', '38857385', '38857389', '38843252', '38830104', '38865270', '388449', '38857398', '38865274', '38857402', '38861592', '38830097', '20987694', '38865264', '18893735', '38865266', '38838018', '38833464', '38857406', '38830101', '38861605', '38861609', '38838016', '38830111', '38696717', '38833469', '38833472', '38861603', '17360540', '38814876', '38857395', '38848299', '38857407', '3885740

In [5]:
import pickle
with open("non_pmc.pkl", 'rb') as f:
    pmids_without_pmc = pickle.load(f)
print(pmids_without_pmc)

['38830090', '38857386', '17668480', '38865269', '9499217', '38861602', '38857388', '38843253', '38857403', '38838020', '38833466', '38830098', '8633090', '38830103', '38843184', '20987695', '38861608', '38830112', '38865272', '38857394', '38857390', '38861601', '38830100', '38857397', '38838019', '38838011', '38857392', '38833470', '38861604', '38861594', '38781227', '38833465', '38857400', '38833468', '38865271', '38830107', '38833474', '38861599', '38838013', '34934013', '12046582', '38830095', '38830094', '38833467', '38865275', '38830109', '8992486', '38833473', '38843187', '38865261', '38833475', '38857385', '38857389', '38843252', '38830104', '38865270', '388449', '38857398', '38865274', '38857402', '38861592', '38830097', '20987694', '38865264', '18893735', '38865266', '38838018', '38833464', '38857406', '38830101', '38861605', '38861609', '38838016', '38830111', '38696717', '38833469', '38833472', '38861603', '17360540', '38814876', '38857395', '38848299', '38857407', '3885740

In [None]:
from Bio import Entrez
from xml.etree import ElementTree as ET

def check_pmc_ids(pm_id):
    Entrez.email = "relaxchumma@gmail.com"
    
    handle = Entrez.efetch(db="pubmed", id = pm_id, rettype="full", retmode="xml")
    xml_data = handle.read()
    handle.close()
    root = ET.fromstring(xml_data)
    
    pmc_dict = {}
    
    for article in root.findall(".//PubmedArticle"):
        pmid = article.find(".//PMID").text
        pmc_id = None
        for article_id in article.findall(".//ArticleId"):
            if article_id.attrib.get("IdType") == "pmc":
                pmc_id = article_id.text
                break
        pmc_dict[pmid] = pmc_id is not None
    
    return pmc_dict
check_pmc_ids(10377390)

In [None]:
'''import requests
import time
import multiprocessing
import concurrent.futures

def is_pmid_in_pmc(pmid):
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    params = {
        'dbfrom': 'pubmed',
        'db': 'pmc',
        'id': pmid,
        'retmode': 'json'
    }

    try:
        #print("entered")
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()  # Raises a HTTPError if the response was unsuccessful
        data = response.json()
        linksets = data.get('linksets', [])
        if linksets and 'linksetdbs' in linksets[0]:
            for linksetdb in linksets[0]['linksetdbs']:
                if linksetdb['dbto'] == 'pmc':
                    return True
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
    return False

def process_pmid(pmid):
    if is_pmid_in_pmc(pmid):
        #print("True")
        return pmid, True
    else:
        #print("False")
        return pmid, False

def main():
    pmc_count = 0
    not_pmc_count = 0
    pmids = list(uniques)  # Assuming uniques is defined elsewhere

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        #print("hmm")
        futures = [executor.submit(process_pmid, pmid) for pmid in uniques]
        print(futures)
        for future in concurrent.futures.as_completed(futures):
            #print("oh")
            pmid, is_pmc = future.result()
            if is_pmc:
                pmc_count += 1
                with open("pmc.txt", 'a') as f:
                    f.write(f'{pmid}\n')
            else:
                not_pmc_count += 1
                with open("non_pmc.txt", 'a') as f:
                    f.write(f'{pmid}\n')

    print(f"PMC: {pmc_count}, Non-PMC: {not_pmc_count}")

if __name__ == "__main__":
    main()'''

In [None]:
print(len(pmids))

0


In [None]:
import pickle
with open("store_pmids.pkl", 'wb') as f:
    pickle.dump(pmids, f)

In [None]:
with open('store_pmid.pkl', 'rb') as file:
    total_pmid = pickle.load(file)