### PMIDs to PMCs

In [1]:
import json, requests as req, pandas as pd
from re import search
from multiprocessing import cpu_count, Process

In [None]:
def multiprocess_a_list(thelist, the_function):
    '''
    FUNCTION:
    - This takes a list of strings and splits it into input
      for separate processes. The processes then output 
      their results to temp files which are then merged. 
    
    PARARMS:
    - thelist: the list to be split into input for 
      a multiprocessing function
    - the_function: the function that will use the list
      as input for multiprocessing
    '''
    
    # How many processors can be used
    procs = cpu_count()

    # List of batches for multiprocessing
    batches = [[] for i in range(procs)]   

    # Length of input dictionary 
    tot = len(thelist)

    # Create batches and send to multiprocessing
    for i, item in enumerate(thelist):

        # Add synonym to a batch
        b_id = i%procs
        batches[b_id].append(item)

    # Create a list of jobs 
    print("Running jobs...")
    jobs = []
    for b_id, batch in enumerate(batches):
        jobs.append(Process(target = the_function, \
                            args = [b_id, batch]))

    # Run the jobs
    for j in jobs: j.start()
    for j in jobs: j.join()
    print('Done!')
    
    
def switch_dictset_to_dictlist(the_dict):
    '''
    FUNCTION:
    - Make a new dictionary with values as lists 
      instead of values as sets
      
    PARAMS:
    - the_dict: The initial dict with values of sets
    '''
    
    dictlist = dict()
    
    for k in the_dict.copy():
        dictlist[k] = list(the_dict[k])
        
    return dictlist

In [9]:
def map_pmc_pmids(batch_id, pmids):
    '''
    FUNCTION:
    - Map a batch of PMIDs to PMCs
    - API Source: https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/
    
    PARAMS:
    - batch_id: The batch ID
    - pmids (list): the batch of PMIDs
    '''
    url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids='
    tot_reqs = int(len(pmids)/200)+1
    temp_outfile = 'data/pmc_pmid_temp_'+str(batch_id)+'.txt'
    
    with open(temp_outfile,'w') as fout:
        for i in range(0, int(len(pmids)/200)+1):

            # Print progress
            if batch_id == 1:
                print(str('Progress of batch 1: '+str(i)+'/'+str(tot_reqs)), end='\r')  

            # Submit batch of up to 200 PMIDs
            if i < int(len(pmids)/200)+1:
                r = req.get(url+','.join(pmids[i*200:(i+1)*200])+'&format=json')
            else:
                r = req.get(url+','.join(pmids[i*200:])+'&format=json')

            # Map all PMID-PMCs
            records = r.json()['records']
            for record in records:
                try:
                    pmc = record['pmcid']
                    pmid = record['pmid']
                    fout.write(pmid+'|'+pmc+'\n')
                except:
                    break
                    
# Uses threading to parallelize the process and speed it up
pmids = json.load(open('data/pmids.json'))
multiprocess_a_list(pmids, map_pmc_pmids)

Running jobs...
Done!ess of batch 1: 722/723


In [15]:
def merge_pmid_pmc_mapping(pmid2pmc, pmc2pmid):
    '''Merge the files output by the parallel processes
    
    PARAMS:
    - pmid2pmc: Dictionary to map the PMIDs to the PMCs
    - pmc2pmid: Dictionary to map the PMCs to the PMIDs
    '''
    procs = cpu_count()

    for batch_id in range(procs):
        temp_outfile = 'data/pmc_pmid_temp_'+str(batch_id)+'.txt'
        with open(temp_outfile) as fin:
            for line in fin:
                line = line.split('|')
                pmid = line[0]
                pmc = line[1].replace('PMC','').strip()
                pmid2pmc.setdefault(pmid, set()).add(pmc)
                pmc2pmid.setdefault(pmc, set()).add(pmid)
            
    return pmid2pmc, pmc2pmid
    
    
pmid2pmc, pmc2pmid = dict(), dict()
pmid2pmc, pmc2pmid = merge_pmid_pmc_mapping(pmid2pmc, pmc2pmid)
    
# Checking that the PMID-PMC is 1-to-1
for pmid,pmc in pmid2pmc.items():
    assert len(pmc) == 1
for pmc,pmid in pmc2pmid.items():
    assert len(pmid) == 1
    
# Making the PMID-PMC not have list values
pmc2pmid = switch_dictset_to_dictlist(pmc2pmid)
pmid2pmc = switch_dictset_to_dictlist(pmid2pmc)

for pmid,pmc in pmid2pmc.copy().items():
    pmid2pmc[pmid] = pmc[0]
for pmc,pmid in pmc2pmid.copy().items():
    pmc2pmid[pmc] = pmid[0]  

'''Export the PMC-PMID mappings'''
json.dump(pmc2pmid, open('data/pmc2pmid','w'))
json.dump(pmid2pmc, open('data/pmid2pmc','w'))

assert len(pmid2pmc) == len(pmc2pmid)
print('Mapped', len(pmid2pmc), 'IDs')

Mapped 150697 IDs


In [None]:
'''The non-parallelized version (slower)'''

def map_pmc_pmids(batch_id, pmids):
    '''
    FUNCTION:
    - Map a batch of PMIDs to PMCs
    
    PARAMS:
    - batch_id: The batch ID
    - pmids (list): the batch of PMIDs
    '''
    
    pmids = json.load(open('data/pmids.json'))
    url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids='
    tot_reqs = int(len(pmids)/200)+1

    for i in range(0, int(len(pmids)/200)+1):

        # Print progress
        print(i,'/',tot_reqs, end='\r')  

        # Submit batch of up to 200 PMIDs
        if i < int(len(pmids)/200)+1:
            r = req.get(url+','.join(pmids[i*200:(i+1)*200])+'&format=json')
        else:
            r = req.get(url+','.join(pmids[i*200:])+'&format=json')

        # Map all PMID-PMCs
        records = r.json()['records']
        for record in records:
            try:
                pmc = record['pmcid']
                pmid = record['pmid']
                pmid2pmc.setdefault(pmid, set()).add(pmc)
                pmc2pmid.setdefault(pmc, set()).add(pmid)
            except:
                break