## Extract gene ontology (GO Term) data for all protein coding genes from Wormbase

In [4]:
import pandas as pd
import time

# Wormbase release tag; it is embedded in every input/output file name below
wormbase_version = "WS293"
gene_ids_df = pd.read_csv(f"./wormbase_data/c_elegans.PRJNA13758.{wormbase_version}.geneIDs.csv")

# Rows whose Gene_Type is "protein_coding_gene" or "gene" are kept as protein coding;
# the mask is computed once and negated to obtain the complementary set.
is_protein_coding = gene_ids_df["Gene_Type"].isin(["protein_coding_gene", "gene"])
protein_coding_genes_df = gene_ids_df[is_protein_coding]
# Author's note: only 15 of the 27,667 genes in not_protein_coding_genes_df have Ontology Terms
not_protein_coding_genes_df = gene_ids_df[~is_protein_coding]

# Sanity check: the split must reproduce the row counts expected for this release
assert len(protein_coding_genes_df) == 21_506
assert len(not_protein_coding_genes_df) == 27_667


# UTILITY FUNCTIONS
# Track the time to make a function call
def formatted_elapsed_time(start, end=None):
    """Format the time elapsed between two timestamps as hours/minutes/seconds.

    Args:
        start: Start timestamp in seconds (for example a value from ``time.time()``).
        end: End timestamp in seconds. When omitted (``None``) the current
            ``time.time()`` value is used.

    Returns:
        A string of the form ``'Time: hours=H minutes=M seconds=S.SS'``. Hours and
        minutes are floor-division results, so they print as floats (e.g. ``0.0``)
        when the timestamps are floats.
    """
    # Identity test: `end == None` would call a custom __eq__ on `end`
    if end is None:
        end = time.time()
    total_seconds = end - start
    # These local names are printed verbatim by the f-string `=` specifier below,
    # so they must stay `hours`, `minutes` and `seconds`.
    hours, remainder = divmod(total_seconds, 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    return f'Time: {hours=} {minutes=} {seconds=:.2f}'



## Wormbase API Calls

* Get the gene ontology data for all protein coding genes

<span style="color:red">Note: Executing the cell below takes several minutes (approximately 9 minutes; the run recorded below finished in about 5 minutes).</span>


In [5]:
import time
from pub_worm.wormbase.wormbase_api import WormbaseAPI

# Start the clock first so the printed duration covers everything in this cell
start_time = time.time()

# Gene IDs to query: every protein coding gene loaded earlier.
# For a quick trial run, use a short list instead, e.g.
# genes_to_process = ["WBGene00000001", "WBGene00000002", "WBGene00000003", "WBGene00000004"]
genes_to_process = protein_coding_genes_df['Wormbase_Id'].tolist()

# API client set up to fetch the gene ontology summary field of gene objects
wormbase_api = WormbaseAPI("field", "gene", "gene_ontology_summary")

# Multi-process call; the second argument (10) is the number of CPUs to use
wormbase_data_results = wormbase_api.get_wormbase_data_cpu(genes_to_process, 10)
print(formatted_elapsed_time(start_time))


Check if you have a connection!! | Retry- 1 | Response msg- <urlopen error [Errno 60] Operation timed out>
Check if you have a connection!! | Retry- 1 | Response msg- <urlopen error [Errno 60] Operation timed out>
Time: hours=0.0 minutes=5.0 seconds=18.09


In [6]:
def ontology_json_to_list(wormbase_id, json_obj):
    """Flatten one gene's ontology summary into flat rows.

    Args:
        wormbase_id: Identifier written into the first column of every row.
        json_obj: Mapping of category name to either a single entry (a dict) or
            a list of entries. Each entry must provide the keys ``'go_id'`` and
            ``'go_term'``. ``None`` or an empty mapping yields no rows, and a
            category whose value is ``None`` or empty is skipped.

    Returns:
        A list of ``[wormbase_id, go_id, category, go_term]`` lists, in the
        iteration order of the mapping and of each category's entries.

    Raises:
        KeyError: If an entry lacks ``'go_id'`` or ``'go_term'``.
    """
    rows = []
    if not json_obj:
        # No summary at all: nothing to flatten
        return rows
    for category, entries in json_obj.items():
        if not entries:
            # Category present but without data
            continue
        if isinstance(entries, dict):
            # A lone entry arrives as a bare dict; treat it as a one-item list
            entries = [entries]
        rows.extend(
            [wormbase_id, entry['go_id'], category, entry['go_term']]
            for entry in entries
        )
    return rows

In [7]:
# Flatten wormbase_data_results into one row per (gene, GO term) and write the rows to a CSV file

ontology_full_list = []
for result_item in wormbase_data_results:
    if not result_item:
        # An empty result has no key to read; skip it instead of raising
        continue
    # The first key of each result is used as the gene id
    # (presumably each result holds exactly one gene; confirm against the API)
    key = next(iter(result_item))
    if 'gene_ontology_summary' in result_item[key]:
        value = result_item[key]['gene_ontology_summary']
        ontology_list = ontology_json_to_list(key, value)
        ontology_full_list.extend(ontology_list)

# Passing the column names to the constructor also works when no rows were
# collected; assigning df.columns afterwards raises a length mismatch on an
# empty frame.
df = pd.DataFrame(ontology_full_list, columns=["Wormbase_Id", "Go_Id", "Category", "Term"])
df.to_csv(f"wormbase_data/gene_ontology_{wormbase_version}.csv", index=False)

In [8]:
# Build the table of distinct GO terms from the per-gene ontology CSV
gene_ontology_df = (
    pd.read_csv(f"./wormbase_data/gene_ontology_{wormbase_version}.csv")
    # Only the term description columns are needed; the gene id is dropped
    .loc[:, ["Go_Id", "Category", "Term"]]
    # The same GO id can appear on many rows; keep its first occurrence only
    .drop_duplicates(subset='Go_Id', keep='first')
    .sort_values(by=['Category', 'Go_Id'])
)
gene_ontology_df.to_csv(f"./wormbase_data/ontology_{wormbase_version}.csv", index=False)