# Pull the journal articles to use in your research from PubMed

In [None]:
# System level imports
import sys
import asyncio
import json
sys.path.append('/Users/dan/Code/Python/pub_worm')

In [None]:
from pub_worm.ncbi.entreze_api import EntrezAPI
from pub_worm.wormbase.wormbase_api import WormbaseAPI

async def get_pmid_for_wbpid(reference):
    """Return ``reference`` enriched with the result of a WormBase pmid lookup.

    Args:
        reference: Mapping describing one paper; its ``'wbp_id'`` entry is
            passed to the WormBase ``paper``/``pmid`` field query.

    Returns:
        A new dict containing every key of ``reference``, overlaid with the
        keys of the lookup result (lookup values win on a key clash).

    Note:
        The lookup is a plain, non-awaited call, so this coroutine does not
        yield to the event loop while the request is running.
    """
    paper_api = WormbaseAPI("field", "paper", "pmid")
    pmid_fields = paper_api.get_wormbase_data(reference['wbp_id'])
    enriched = dict(reference)
    enriched.update(pmid_fields)
    return enriched


In [None]:
def update_unique_wbp_ids(references, unique_references):
    """Register not-yet-seen references in the index and return them.

    Args:
        references: Iterable of reference dicts. Each must provide
            ``'wbp_id'`` and ``'wbp_title'``; ``'wbp_abstract'`` is optional
            and defaults to an empty string.
        unique_references: Dict keyed by wbp_id. It is updated in place with
            a ``{'title', 'abstract', 'pmcid'}`` entry for every new id;
            ``'pmcid'`` starts at ``0``, meaning "no PMC article yet".

    Returns:
        A ``(new_references, unique_references)`` tuple, where
        ``new_references`` lists, in input order, the references whose wbp_id
        was not in the index before. Each new id appears only once, even if
        ``references`` repeats it.
    """
    new_references = []

    for ref in references:
        wbp_id = ref["wbp_id"]
        # Test against the live index rather than a snapshot taken before the
        # loop, so an id repeated inside `references` is only reported once.
        if wbp_id in unique_references:
            continue
        unique_references[wbp_id] = {
            'title': ref['wbp_title'],
            'abstract': ref.get('wbp_abstract', ""),
            'pmcid': 0,
        }
        new_references.append(ref)

    return new_references, unique_references


In [None]:
# If we did not get the full article from PubMed Central (PMC) then save the title and abstract from PubMed
def pmc_not_found(unique_references):
    """Write a title/abstract-only text file for each reference without a PMC article.

    Args:
        unique_references: Dict of reference entries. Each entry must provide
            ``'pmcid'``; entries whose ``'pmcid'`` equals ``0`` must also
            provide ``'title'``, ``'abstract'`` and ``'pmid'``.

    Side effects:
        For every entry whose ``'pmcid'`` is ``0``, writes
        ``./output/PM<pmid>.txt`` with the title, the abstract and an empty
        ``Content:`` section. The ``./output`` directory must already exist.
    """
    for entry in unique_references.values():
        if entry['pmcid'] != 0:
            continue
        content = (
            f"Title:{entry['title']}\n"
            f"Abstract: {entry['abstract']}\n"
            "Content: \n"
        )
        file_nm = f"./output/PM{entry['pmid']}.txt"
        # Explicit UTF-8: abstracts often contain non-ASCII characters, and
        # the platform default encoding may be unable to represent them.
        with open(file_nm, 'w', encoding='utf-8') as file:
            file.write(content)
            

In [None]:
async def get_references_wbid(wormbase_id, unique_references):
    """Collect the journal articles referenced by one WormBase gene and save their text.

    The function
      1. fetches the gene's references from WormBase,
      2. keeps the journal articles that are not yet in ``unique_references``,
      3. looks up a PubMed id for each of them and records it in the index,
      4. posts those ids through ``EntrezAPI`` and, when the reply carries a
         ``'WebEnv'`` entry, links them to PMC ids and fetches the articles,
      5. writes each fetched article to ``./output/PMC<pmcid>.txt`` and
         records its pmcid in the index.

    Args:
        wormbase_id: WormBase gene id passed to the ``gene``/``references`` query.
        unique_references: Dict keyed by wbp_id; updated in place with the new
            references and their ``'pmid'`` / ``'pmcid'`` values.

    Returns:
        The same ``unique_references`` dict.

    Raises:
        KeyError: If an expected key (for example ``'references_list'`` or
            ``'pm_id'``) is missing from the data returned by the APIs.
    """
    print(f"Processing {wormbase_id}")
    wormbase_api = WormbaseAPI("field", "gene", "references")

    # 1. Get all the references for the given wormbase_id
    wormbase_data = wormbase_api.get_wormbase_data(wormbase_id)

    # 2a. Collect only journal articles (a lone reference arrives as a dict, so wrap it)
    references = wormbase_data['references_list']
    if isinstance(references, dict):
        references = [references]
    journal_articles = [ref for ref in references if ref['wbp_type'] == 'Journal article']

    # 2b. Collect only articles that we have not seen
    journal_articles, unique_references = update_unique_wbp_ids(journal_articles, unique_references)
    if not journal_articles:
        # Nothing new for this gene (e.g. the gene was already processed):
        # skip the remote lookups instead of posting an empty id list.
        return unique_references

    # 3a. Get the associated PubMed ids
    pmid_for_wbpid_list = await asyncio.gather(*[get_pmid_for_wbpid(ref) for ref in journal_articles])

    # 3b/3c/4. In one pass: build the pmid -> wbp_id lookup table, store the
    # pmid in the index, and collect the list of pmids to post.
    pmid_to_wbpid_lookup = {}
    pmid_list = []
    for pmid_for_wbpid in pmid_for_wbpid_list:
        pm_id = pmid_for_wbpid['pm_id']
        wbp_id = pmid_for_wbpid['wbp_id']
        pmid_to_wbpid_lookup[pm_id] = wbp_id
        unique_references[wbp_id]['pmid'] = pm_id
        pmid_list.append(pm_id)

    # 5. Post the list to NCBI Entrez
    ncbi_api = EntrezAPI()
    entreze_epost_result = ncbi_api.entreze_epost(pmid_list)

    # 6. Fetch the full articles
    if 'WebEnv' in entreze_epost_result:
        # 6a. Link PubMed ids to pmcids
        elink_result = ncbi_api.entreze_elink_pmid_to_pmcid(entreze_epost_result)
        params = {'db': 'pmc'}
        # 6b. Post the pmcids
        epost_result = ncbi_api.entreze_epost(elink_result, params)
        # 6c. Fetch the articles based on the pmcids
        efetch_result = ncbi_api.entreze_efetch(epost_result)

        # 6d. Write the content of each paper to a file
        for article in efetch_result['articles']:
            content = (
                f"Title:{article['title']}\n"
                f"Abstract: {article['abstract']}\n"
                f"Content: {article['body']}\n"
            )
            file_nm = f"./output/PMC{article['pmcid']}.txt"
            # Explicit UTF-8: article text often contains non-ASCII characters.
            with open(file_nm, 'w', encoding='utf-8') as file:
                file.write(content)
            wbp_id = pmid_to_wbpid_lookup[article['pmid']]
            unique_references[wbp_id]['pmcid'] = article['pmcid']

    return unique_references

In [None]:
#gene_set = ["WBGene00008850","WBGene00001463"]
gene_set = ["WBGene00016064", "WBGene00001463", "WBGene00001452", "WBGene00002048", "WBGene00003750", "WBGene00006575", "WBGene00006783",
            "WBGene00019327", "WBGene00008850", "WBGene00019967", "WBGene00001452", "WBGene00001752", "WBGene00002048", "WBGene00003640", 
            "WBGene00007867", "WBGene00007875", "WBGene00008010", "WBGene00008584", "WBGene00008681", "WBGene00009429", "WBGene00016596", 
            "WBGene00019619", "WBGene00010290", "WBGene00000399", "WBGene00001430", "WBGene00010493", "WBGene00004512", "WBGene00004513", 
            "WBGene00004622"]
references_index = {}
for wormbase_id in gene_set:
    references_index = await get_references_wbid(wormbase_id, references_index)

pmc_not_found(references_index)

print(json.dumps(references_index, indent=4))
with open("./output/references_index.json", 'w') as file:
    json.dump(references_index, file, indent=4)