## Extract paper references for all genes from Wormbase

In [3]:
import pandas as pd
import time
import psutil
import os
import numpy as np

# WormBase release tag; it is interpolated into the data file names used in this notebook.
wormbase_version = "WS293"

# Gene ID table for that release, read from the local wormbase_data directory.
gene_ids_file_nm = f"./wormbase_data/c_elegans.PRJNA13758.{wormbase_version}.geneIDs.csv"
gene_ids_df = pd.read_csv(gene_ids_file_nm)


# UTILITY FUNCTIONS
# Track the time to make a function call
def formatted_elapsed_time(start, end=None):
    """Format the time elapsed between two timestamps as hours/minutes/seconds.

    Args:
        start: Start timestamp in seconds (for example a ``time.time()`` value).
        end: End timestamp in seconds. When ``None`` (the default) the current
            ``time.time()`` is used.

    Returns:
        A string such as ``'Time: hours=0.0 minutes=4.0 seconds=9.91'``.
        ``hours`` and ``minutes`` are whole numbers that keep the numeric type
        of the inputs (floats for ``time.time()`` values); ``seconds`` is shown
        with two decimals.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * seconds_per_minute

    # Identity check: `end == None` would dispatch to a custom __eq__ on `end`
    # and could wrongly discard an explicitly supplied end time.
    if end is None:
        end = time.time()
    total_seconds = end - start

    # The local names below are part of the output: `{name=}` prints them.
    hours, remainder = divmod(total_seconds, seconds_per_hour)
    minutes, seconds = divmod(remainder, seconds_per_minute)
    return f'Time: {hours=} {minutes=} {seconds=:.2f}'

# Function to monitor memory usage
def get_memory_usage():
    """Return a one-line summary of current CPU and memory utilisation.

    Returns:
        A string of the form
        ``'CPU <cpu>% Memory <used>% Mem Avail <available> GB'`` where the
        available memory is the byte count divided by 1024**3.

    Note:
        ``psutil.cpu_percent()`` is called without an interval; per the psutil
        documentation the value is measured since the previous call, so the
        first reading in a session should not be relied on.
    """
    cpu_percent = psutil.cpu_percent()
    # Take a single snapshot so the percentage and the available amount
    # describe the same instant (two separate calls could disagree).
    memory = psutil.virtual_memory()
    memory_available = memory.available / (1024 ** 3)
    return f"CPU {cpu_percent}% Memory {memory.percent}% Mem Avail {memory_available:,.2f} GB"


In [4]:

def reference_json_to_list(wormbase_id, json_obj):
    """Flatten reference records into rows prefixed with the owning WormBase ID.

    Args:
        wormbase_id: Identifier placed in the first position of every row.
        json_obj: Iterable of dict-like reference records; each is read with
            ``.get`` for the ``wbp_*`` keys listed below.

    Returns:
        A list with one 8-element list per record:
        ``[wormbase_id, wbp_id, wbp_title, wbp_type, wbp_journal, wbp_year,
        wbp_author, wbp_abstract]``. Keys missing from a record yield ``''``.
    """
    field_names = (
        'wbp_id', 'wbp_title', 'wbp_type', 'wbp_journal',
        'wbp_year', 'wbp_author', 'wbp_abstract',
    )
    return [
        [wormbase_id, *(reference.get(field, '') for field in field_names)]
        for reference in json_obj
    ]

def save_df_to_csv(file_path, df):
    """Write ``df`` to ``file_path`` as CSV, appending when the file has content.

    A header row is written only when the file is being created or is empty,
    so repeated calls build one CSV with a single header.

    Args:
        file_path: Destination CSV path.
        df: DataFrame to write (the index is not written).
    """
    # Decide once: an existing but empty file is treated like a missing one so
    # that the appended rows are not left without a header.
    has_content = os.path.exists(file_path) and os.path.getsize(file_path) > 0
    df.to_csv(
        file_path,
        mode='a' if has_content else 'w',
        header=not has_content,
        index=False,
    )
    
def wormbase_references_to_csv(wormbase_data_results, gene_reference_file, reference_abstracts_file):
    """Split WormBase reference results into two CSV files.

    Args:
        wormbase_data_results: Iterable of dicts. Only the first key of each
            dict is used; it is taken as the WormBase ID, and its value is
            searched for a ``'references_list'`` entry (a list of records, or a
            single record dict which is wrapped in a list).
        gene_reference_file: CSV path receiving the ``Wormbase_Id``/``WBP_Id``
            pairs.
        reference_abstracts_file: CSV path receiving the per-paper columns,
            de-duplicated on ``WBP_Id`` within this call.

    Both files are written through ``save_df_to_csv``. When no reference rows
    were collected, nothing is written and "Found No References!!" is printed.
    """
    column_names = ["Wormbase_Id", "WBP_Id", "WBP_title", "WBP_type","WBP_journal","WBP_year","WBP_author","WBP_abstract"]
    paper_columns = column_names[1:]

    collected_rows = []
    for entry in wormbase_data_results:
        entry_id = list(entry)[0]
        payload = entry[entry_id]
        if 'references_list' not in payload:
            continue
        references = payload['references_list']
        if isinstance(references, dict):
            # A lone record arrives as a dict rather than a list of dicts.
            references = [references]
        collected_rows.extend(reference_json_to_list(entry_id, references))

    all_references_df = pd.DataFrame(collected_rows)
    # With no rows the frame has zero columns, so the labels cannot be applied.
    if len(all_references_df.columns) != len(column_names):
        print("Found No References!!")
        return

    all_references_df.columns = column_names

    papers_df = all_references_df[paper_columns].drop_duplicates(subset="WBP_Id")
    save_df_to_csv(reference_abstracts_file, papers_df)

    gene_to_paper_df = all_references_df[["Wormbase_Id", "WBP_Id"]]
    save_df_to_csv(gene_reference_file, gene_to_paper_df)


## Wormbase API Calls

* Get the paper references for all genes

<span style="color:red">Note: The cell below takes approximately 16 minutes to run. It is currently commented out; uncomment it to execute.</span>

In [5]:
# import time
# import json
# import warnings
# from pub_worm.wormbase.wormbase_api import WormbaseAPI

# warnings.simplefilter(action='ignore', category=FutureWarning)

# gene_reference_file = f"wormbase_data/gene_references_{wormbase_version}.csv"
# os.remove(gene_reference_file) if os.path.exists(gene_reference_file) else None
    
# reference_abstracts_file = f"wormbase_data/references_{wormbase_version}.csv"
# os.remove(reference_abstracts_file) if os.path.exists(reference_abstracts_file) else None        

# # Set the API class to fetch the paper references for each gene from WormBase
# wormbase_api = WormbaseAPI("field", "gene", "references")

# # Test Genes
# #gene_ids_chunks = [["WBGene00000001",  "WBGene00000002", "WBGene00000003", "WBGene00000004"]]

# chunk_size=1000
# gene_ids_chunks = [gene_ids_df['Wormbase_Id'][i:i+chunk_size] for i in range(0, len(gene_ids_df), chunk_size)]
# total_chucks = len(gene_ids_chunks)

# total_start_time = time.time()
# for index, chunk in enumerate(gene_ids_chunks):
#     start_time = time.time()
#     wormbase_data_results = wormbase_api.get_wormbase_data_cpu(chunk, 8)
#     print(f"{index+1} of {total_chucks} | {formatted_elapsed_time(start_time)} | {get_memory_usage()}")
#     # with open('output.json', 'w') as f:
#     #     json.dump(wormbase_data_results, f, indent=4)
#     # break
#     wormbase_references_to_csv(wormbase_data_results, gene_reference_file, reference_abstracts_file)

# print("="*95)
# print(f"{formatted_elapsed_time(total_start_time)} {get_memory_usage()}")


In [6]:
# De-duplicate the references on WBP_Id, sort them, and overwrite the file in place.
references_file_nm = f"./wormbase_data/references_{wormbase_version}.csv"

# Later cells add a PM_Id column to this same file. Reading it as text keeps
# this cell safe to re-run: otherwise the blank entries make pandas parse the
# column as float and the IDs would be written back as e.g. "631425.0".
# The dtype entry has no effect while the column does not exist yet.
references_df = pd.read_csv(references_file_nm, dtype={'PM_Id': str})
references_df = (
    references_df
    .drop_duplicates(subset='WBP_Id', keep='first')
    .sort_values(by='WBP_Id')
)
references_df.to_csv(references_file_nm, index=False)

In [5]:
# Count how many references there are for each WBP_type value.
references_df['WBP_type'].value_counts()

WBP_type
Meeting abstract             15999
Journal article              15659
Gazette article               2072
Review                        1256
Micropublication               205
Comment                        131
Book chapter                   102
Published erratum               67
Letter                          48
News                            45
Wormbook                        32
Editorial                       18
Method                           9
Retracted publication            8
Congresses                       6
Retraction of publication        5
Historical article               3
Book                             2
Lectures                         1
Interactive tutorial             1
Other                            1
Name: count, dtype: int64

## Wormbase API Calls

* Get the PubMed IDs for all journal articles

<span style="color:red">Note: The cell below takes approximately 5 minutes to run.</span>

In [19]:
import time
from pub_worm.wormbase.wormbase_api import WormbaseAPI

# Start the clock before any API work so the printed time covers the whole cell.
start_time = time.time()

# Configure the WormBase API client to request the "pmid" field of "paper" objects.
wormbase_api = WormbaseAPI("field", "paper", "pmid")


# Only references typed as 'Journal article' are looked up.
wbp_ids = references_df.loc[references_df['WBP_type'] == 'Journal article', 'WBP_Id'].tolist()
print(len(wbp_ids))

# Small sample for a quick test run.
# NOTE(review): these sample values are gene IDs (WBGene...), while this cell
# queries papers — presumably copied from the gene cell; confirm and swap in
# paper IDs before using.
#wbp_ids = ["WBGene00000001",  "WBGene00000002", "WBGene00000003", "WBGene00000004"]

# Multi-process call; the second argument (10) is presumably the number of
# worker processes — confirm against the pub_worm documentation.
wormbase_data_results = wormbase_api.get_wormbase_data_cpu(wbp_ids, 10)
print(formatted_elapsed_time(start_time))


15659
Check if you have a connection!! | Retry- 1 | Response msg- <urlopen error [Errno 60] Operation timed out>
Check if you have a connection!! | Retry- 1 | Response msg- <urlopen error [Errno 60] Operation timed out>
Time: hours=0.0 minutes=4.0 seconds=9.91


In [18]:
# Add the PubMed IDs found by the WormBase lookup to references_df and save.
# Each item of wormbase_data_results is read as {WBP_Id: {"pm_id": ...}};
# a missing "pm_id" becomes an empty string.
found_pubmed_ids_list = [{"WBP_Id": key, "PM_Id": value.get("pm_id", "")}
                    for item in wormbase_data_results
                    for key, value in item.items()]

# Explicit columns keep the frame usable when the lookup returned nothing
# (otherwise 'WBP_Id' would be missing and set_index would raise KeyError).
found_pubmed_ids_list_df = pd.DataFrame(found_pubmed_ids_list, columns=["WBP_Id", "PM_Id"])

# Series.map needs a unique index; keep the last entry per paper so a repeated
# WBP_Id in the results cannot abort the merge.
pm_id_by_wbp_id = (
    found_pubmed_ids_list_df
    .drop_duplicates(subset="WBP_Id", keep="last")
    .set_index("WBP_Id")["PM_Id"]
)
references_df['PM_Id'] = references_df['WBP_Id'].map(pm_id_by_wbp_id).fillna("")

references_file_nm = f"./wormbase_data/references_{wormbase_version}.csv"
references_df.to_csv(references_file_nm, index=False)

In [4]:
# Reload the saved references (PM_Id kept as text) and collect the non-empty PubMed IDs.
references_file_nm = f"./wormbase_data/references_{wormbase_version}.csv"
references_file_df = pd.read_csv(references_file_nm, converters={'PM_Id': str})

pm_id_column = references_file_df['PM_Id']
# Keep only rows that actually carry an ID: neither missing nor an empty string.
has_pm_id = pm_id_column.notna() & (pm_id_column != '')
pm_id_list = pm_id_column[has_pm_id].tolist()
print(len(pm_id_list))

15570


## Entrez API Calls

* Get Additional Ids from Pubmed using EntrezAPI

<span style="color:red">Note: The cell below takes approximately 2 minutes to run.</span>

In [5]:
# Get Additional Ids from Pubmed using EntrezAPI
from pub_worm.ncbi.entreze_api import EntrezAPI

# PubMed IDs are submitted in batches of chunk_size.
chunk_size=2000
pm_ids_chunks = [pm_id_list[i:i+chunk_size] for i in range(0, len(pm_id_list), chunk_size)]
total_chucks = len(pm_ids_chunks)

# Example call to ePost and eSummary
#pm_ids_chunks = [["10021351", "10022905", "10022914", "10022975"]]

start_time = time.time()
entreze_summaries_result_full = []
for index, chunk in enumerate(pm_ids_chunks):
    # A fresh client per chunk: post the IDs, then fetch summaries using the
    # ePost result.
    ncbi_api = EntrezAPI()
    entreze_epost_result = ncbi_api.entreze_epost(chunk)
    if 'WebEnv' in entreze_epost_result:
        entreze_summaries_result = ncbi_api.entreze_pmid_summaries(entreze_epost_result)
        entreze_summaries_result_full.extend(entreze_summaries_result)
        #pretty_data = json.dumps(entreze_summaries_result, indent=4)
        print(f"{index+1} of {total_chucks} chunk size {len(chunk)}")
    else:
        # Report the skipped batch so a short final count can be traced back
        # to it instead of passing unnoticed.
        print(f"{index+1} of {total_chucks} chunk size {len(chunk)} SKIPPED: ePost result has no WebEnv")
print(formatted_elapsed_time(start_time))
print(len(entreze_summaries_result_full))


1 of 8 chunk size 2000
2 of 8 chunk size 2000
3 of 8 chunk size 2000
4 of 8 chunk size 2000
5 of 8 chunk size 2000
6 of 8 chunk size 2000
7 of 8 chunk size 2000
8 of 8 chunk size 1570
Time: hours=0.0 minutes=2.0 seconds=1.11
15570


In [8]:
# Get the impact factor for all references with an issn or essn
from pub_worm.impact_factor.impact_factor_lookup import get_impact_factor

# Annotate every PubMed summary, in place, with the impact factor looked up by
# its ISSN (falling back to the ESSN when the ISSN is missing or empty).
for summary in entreze_summaries_result_full:
    journal_issn = summary.get('issn') or summary.get('essn')
    summary['impact_factor'] = get_impact_factor(journal_issn)

# Same (now annotated) records, collected in a separate list.
summaries_result = list(entreze_summaries_result_full)

assert len(entreze_summaries_result_full) == len(summaries_result)

# Save with the 'uid' column exposed as 'pm_id', then display the frame.
summaries_result_df = pd.DataFrame(summaries_result).rename(columns={'uid': 'pm_id'})
summaries_result_df.to_csv(f"./wormbase_data/references_pubmed_{wormbase_version}.csv", index=False)
summaries_result_df


Unnamed: 0,pm_id,issn,essn,last_author,pmc_id,title,source,impact_factor
0,631425,0012-1606,,White JG,,Cell cycling and DNA replication in a mutant b...,Dev Biol,3.148
1,4366476,0016-6731,,Brenner S,PMC1213120,The genetics of Caenorhabditis elegans.,Genetics,4.402
2,988846,0012-1606,,Russell RL,,The life cycle of the nematode Caenorhabditis ...,Dev Biol,3.148
3,730048,0016-6731,,Russell RL,PMC1213887,Osmotic avoidance defective mutants of the nem...,Genetics,4.402
4,19308247,0022-300X,,Dusenbery DB,PMC2620198,Chemotactic responses of male Caenorhabditis e...,J Nematol,1.481
...,...,...,...,...,...,...,...,...
15565,38331287,0048-9697,1879-1026,Wang D,,Polyethylene nanoparticles at environmentally ...,Sci Total Environ,10.753
15566,22132858,0264-6021,1470-8728,Zheng X,,The crystal structure of human UDP-glucose pyr...,Biochem J,3.766
15567,38531838,1547-6286,1555-8584,Zhao D,PMC10978027,Tissue-specific silencing of integrated transg...,RNA Biol,4.766
15568,38529797,1474-9718,1474-9726,Lee SV,PMC11258480,Combinatorial transcriptomic and genetic disse...,Aging Cell,11.005


# Appendix

In [2]:
!pip install --upgrade pub_worm

Collecting pub_worm
  Downloading pub_worm-0.3.1.tar.gz (395 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.0/395.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pub_worm
  Building wheel for pub_worm (setup.py) ... [?25ldone
[?25h  Created wheel for pub_worm: filename=pub_worm-0.3.1-py3-none-any.whl size=400539 sha256=d39588fbc8c63ca332f92de262145e60156ce9d4715a1746b750fe04a470b491
  Stored in directory: /Users/dan/Library/Caches/pip/wheels/c0/71/88/30f94af3a71d87bfd161a88ae3fa9980c8291f850d02e88b3a
Successfully built pub_worm
Installing collected packages: pub_worm
  Attempting uninstall: pub_worm
    Found existing installation: pub-worm 0.3.0
    Uninstalling pub-worm-0.3.0:
      Successfully uninstalled pub-worm-0.3.0
Successfully installed pub_worm-0.3.1
