# Get All the Ontology data for a Gene

### This notebook pulls all the Gene Ontology records for all the Wormcat Genes

* Except for genes in the "Non-coding RNA", "Pseudogene" Categories of WormCat

---
All other data is queried by WormCat category and pulled into separate CSV files.


In [24]:
import numpy as np
import pandas as pd
import datetime
import math
import requests
import json
import csv
import time
import os
import psutil
from datetime import datetime

# Get the API
from pub_worm.wormbase.wormbase_api import WormbaseAPI
from pub_worm.wormbase.to_csv_helpers import ontology_to_csv, refereneces_to_csv

In [25]:
%%bash
# Pull the WormCat category list from its source location if it is not already present locally.
# NOTE(review): this absolute path is machine-specific; adjust it when running elsewhere.
# Stop here if the directory is missing so the file is never downloaded into the wrong place.
cd /media/data1/Code/Notebooks/UMass_Med/unknown_genes/input_data || exit 1

WORMCAT_DB="whole_genome_v2_nov-11-2021.csv"
# No leading space in the URL: the previous value only worked because the
# unquoted expansion below happened to be word-split by the shell.
WORMCAT_REPO="http://www.wormcat.com/static/download"

if [ -f "$WORMCAT_DB" ]; then
   echo "File exists and will not be pulled."
else
   echo "File does not exist locally pulling from repo."
   wget -q "${WORMCAT_REPO}/${WORMCAT_DB}"
fi


File exists and will not be pulled.


In [26]:
# Load the WormCat annotation list, normalise the column names and
# discard the free-text description column

wormcat_df = (
    pd.read_csv('./input_data/whole_genome_v2_nov-11-2021.csv')
    .rename(columns={
        'Sequence ID': 'wc_sequence_id',
        'Wormbase ID': 'wormbase_id',
        'Category 1': 'category_1',
        'Category 2': 'category_2',
        'Category 3': 'category_3',
    })
    .drop(columns=['Automated Description'])
)
print(f"{len(wormcat_df):,}")
wormcat_df.head()

31,389


Unnamed: 0,wc_sequence_id,wormbase_id,category_1,category_2,category_3
0,F15H10.3,WBGene00000144,Cell cycle,Cell cycle: APC,Cell cycle: APC
1,F35G12.9,WBGene00000145,Cell cycle,Cell cycle: APC,Cell cycle: APC
2,C09H10.7,WBGene00007501,Cell cycle,Cell cycle: APC,Cell cycle: APC
3,K06H7.6,WBGene00000143,Cell cycle,Cell cycle: APC,Cell cycle: APC
4,B0511.9,WBGene00015235,Cell cycle,Cell cycle: APC,Cell cycle: APC


In [19]:
# Load each Category into its own dataframe.
# Rows are selected with a boolean mask instead of a string-built query(), so a
# category name containing a quote or other special character cannot break
# the expression.
category1 = wormcat_df['category_1'].unique()
category_dfs = {}
for category in category1:
    df = wormcat_df[wormcat_df['category_1'] == category]
    category_dfs[category] = df


In [20]:
# Lists used to subdivide the WormCat category_1 values into processing groups

# Main group of categories
category_names = [
    "Cell cycle",
    "Chaperone",
    "Cilia",
    "Cytoskeleton",
    "Development",
    "DNA",
    "Extracellular material",
    "Globin",
    "Lysosome",
    "Major sperm protein",
    "Metabolism",
    "mRNA functions",
    "Muscle function",
    "Neuronal function",
    "Nuclear pore",
    "Nucleic acid",
    "Peroxisome",
    "Protein modification",
    "Proteolysis general",
    "Proteolysis proteasome",
    "Ribosome",
    "Signaling",
    "Stress response",
    "Trafficking",
    "Transcription factor",
    "Transcription: chromatin",
    "Transcription: dosage compensation",
    "Transcription: general machinery",
]

# Categories handled as the poorly annotated gene (PAG) group
category_pag_names = [
    "Transcription: unassigned",
    "Transmembrane protein",
    "Transmembrane transport",
    "Unassigned",
]

# Categories this notebook does not pull data for
category_not_covered_names = [
    "Non-coding RNA",
    "Pseudogene",
]

# Total number of category names across the three groups
len(category_names) + len(category_pag_names) + len(category_not_covered_names)

34

In [22]:
# How many wormbase ids for each category?
def df_summary(dict_of_dfs, dict_keys):
    """Print the length of the entry stored under each key, then a totals line.

    Keys are visited in sorted order. The footer reports how many keys were
    listed and the sum of their lengths.
    """
    n_keys = 0
    grand_total = 0
    for n_keys, name in enumerate(sorted(dict_keys), start=1):
        n_rows = len(dict_of_dfs[name])
        grand_total += n_rows
        print(f"{name:<35} {n_rows:>8,}")

    print("="*45)
    print(f"Count:{n_keys:>4} {grand_total:>33,}")
    
# Row counts for the categories listed in category_names
df_summary(category_dfs, category_names)

Cell cycle                               172
Chaperone                                 92
Cilia                                     60
Cytoskeleton                             371
DNA                                      176
Development                              295
Extracellular material                   495
Globin                                    36
Lysosome                                  91
Major sperm protein                       31
Metabolism                             1,601
Muscle function                           62
Neuronal function                        308
Nuclear pore                              36
Nucleic acid                             231
Peroxisome                                15
Protein modification                     271
Proteolysis general                      394
Proteolysis proteasome                   733
Ribosome                                 244
Signaling                              1,188
Stress response                          833
Traffickin

In [23]:
# How many wormbase ids are there in each category of Poorly Annotated Genes (category_pag_names)?
df_summary(category_dfs, category_pag_names)


Transcription: unassigned                 15
Transmembrane protein                  3,200
Transmembrane transport                  901
Unassigned                             6,343
Count:   4                            10,459


In [9]:
# Function to monitor memory usage
def print_memory_usage():
    """Print a one-line summary of CPU load and system memory.

    Both memory figures are read from a single ``psutil.virtual_memory()``
    call so the percentage and the available amount describe the same
    moment.
    """
    cpu_percent = psutil.cpu_percent()
    memory = psutil.virtual_memory()
    # Scale the available amount by 1024**3 for display
    memory_available = memory.available / (1024 ** 3)
    print(f"CPU {cpu_percent}% Memory {memory.percent}% Mem Avail {memory_available:,.2f} GB")
print_memory_usage()

CPU 0.5% Memory 50.8% Mem Avail 61.78 GB


In [10]:
!pip install --upgrade pub_worm



In [16]:
def ontology_json_to_dataframe(json_obj, wormbase_id, file_name=None):
    """Flatten a gene ontology summary into one row per ontology term.

    Parameters
    ----------
    json_obj : dict or None
        Mapping of ontology category to either a single term dict or a list
        of term dicts. Each term dict must provide 'id' and 'name'.
    wormbase_id
        Identifier written into the first column of every row.
    file_name : str, optional
        When given, the resulting frame is also written to this CSV path
        without the index.

    Returns
    -------
    pandas.DataFrame
        Columns Wormbase_Id, Go_Id, Category, Name. The frame has zero rows
        (but still these columns) when json_obj is empty or None.

    Raises
    ------
    KeyError
        If a term dict lacks 'id' or 'name'.
    """
    columns = ["Wormbase_Id", "Go_Id", "Category", "Name"]
    rows = []
    for category, terms in (json_obj or {}).items():
        # A category may hold a bare term dict instead of a list of them
        if isinstance(terms, dict):
            terms = [terms]
        for term in terms:
            rows.append([wormbase_id, term['id'], category, term['name']])

    # Naming the columns in the constructor stays valid for zero rows;
    # assigning df.columns afterwards raises on an empty frame.
    df = pd.DataFrame(rows, columns=columns)
    if file_name:
        df.to_csv(file_name, index=False)
    return df

def get_ontology_data(wormcat_df, category_nm):
    """Collect the gene ontology summary for every gene in ``wormcat_df``.

    Each value of the 'wormbase_id' column is looked up through WormbaseAPI
    and the returned 'gene_ontology_summary' is flattened with
    ontology_json_to_dataframe. The accumulated rows are written to a dated
    CSV after every 100 genes and once more at the end, so a crashed run
    leaves a partial file behind.

    Parameters
    ----------
    wormcat_df : pandas.DataFrame
        Must contain a 'wormbase_id' column.
    category_nm : str
        Used (lower-cased, spaces replaced by underscores) to build the
        output file name.

    Returns
    -------
    pandas.DataFrame
        All ontology rows collected; an empty frame when nothing was found.
    """
    formatted_date = datetime.today().strftime('%Y_%m_%d')
    # NOTE(review): the "_references_" suffix is kept so existing file names do
    # not change, although this file holds ontology rows -- confirm intent.
    file_name = f"{category_nm.lower().replace(' ', '_')}_references_{formatted_date}.csv"

    wormbase_api = WormbaseAPI("field", "gene", "gene_ontology_summary")

    concatenated_df = pd.DataFrame()
    dfs = []
    number_of_rows = len(wormcat_df)
    for index, wormbase_id in enumerate(wormcat_df['wormbase_id'], start=1):
        print(".", end='')
        ret_data = wormbase_api.get_wormbase_data(wormbase_id)

        # Tolerate a falsy response and skip genes whose summary is empty,
        # so a single gene cannot abort a long-running pull.
        summary = None
        if ret_data and 'gene_ontology_summary' in ret_data:
            summary = ret_data['gene_ontology_summary']
        if summary:
            dfs.append(ontology_json_to_dataframe(summary, wormbase_id))

        # Concatenate every 100 DataFrames
        # If something crashes we may be able to recover without a full rerun
        if index % 100 == 0:
            print(f"{index:>4} of {number_of_rows} {wormbase_id}")
            concatenated_df = pd.concat([concatenated_df] + dfs, ignore_index=True)
            concatenated_df.to_csv(file_name, index=False)
            dfs = []  # Reset the list for the next batch

    # Concatenate the remaining DataFrames
    if dfs:
        concatenated_df = pd.concat([concatenated_df] + dfs, ignore_index=True)
        concatenated_df.to_csv(file_name, index=False)
    return concatenated_df




In [17]:
### CAUTION This cell can take a long time to run ###
# Get the gene ontology data for each category in category_pag_names.
# get_ontology_data performs one WormbaseAPI lookup per gene and writes a CSV per category.

ontology_data_full_df = pd.DataFrame()
for name in category_pag_names:
    print(f"{name} {len(category_dfs[name])}")
    temp_df = get_ontology_data(category_dfs[name], name)   
    ontology_data_full_df = pd.concat([ontology_data_full_df, temp_df], ignore_index=True)
    

Transcription: unassigned 15
...............Transmembrane protein 3200
.................................................................................................... 100 of 3200 WBGene00007336
.................................................................................................... 200 of 3200 WBGene00015949
.................................................................................................... 300 of 3200 WBGene00008171
.................................................................................................... 400 of 3200 WBGene00044406
.................................................................................................... 500 of 3200 WBGene00017675
.................................................................................................... 600 of 3200 WBGene00044556
.................................................................................................... 700 of 3200 WBGene00010003
.................................

KeyboardInterrupt: 

In [138]:
# NOTE(review): reference_data_full_df is only assigned in a later cell of this notebook; run that cell first.
reference_data_full_df.columns

Index(['id', 'title', 'journal', 'year', 'author', 'wormbase_id', 'category'], dtype='object')

In [139]:
# NOTE(review): reference_data_full_df is only assigned in a later cell of this notebook; run that cell first.
reference_data_full_df['category'].unique()

array(['Major sperm protein', 'Metabolism', 'mRNA functions',
       'Muscle function', 'Neuronal function', 'Nuclear pore',
       'Nucleic acid', 'Peroxisome', 'Protein modification',
       'Proteolysis general', 'Proteolysis proteasome', 'Ribosome',
       'Signaling', 'Stress response', 'Trafficking',
       'Transcription factor', 'Transcription: chromatin',
       'Transcription: dosage compensation',
       'Transcription: general machinery'], dtype=object)

# After all categories are processed

In [21]:
# Read all the csv files of references into dataframes, keyed by the
# category value found in each file
directory = './output_data2'
reference_dfs = {}

# sorted() makes the load order independent of the directory listing order
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv"):
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        if df.empty:
            # A file with a header but no rows has no category value to key on
            print(f"Skipping {filepath}: no rows")
            continue
        category = df['category'].unique()[0]
        reference_dfs[category] = df


In [29]:
# Row counts per category for the reference data.
# Uses reference_dfs (built when the csv files were read); the name `dfs` is
# not defined at notebook scope on a fresh kernel.
df_summary(reference_dfs, reference_dfs.keys())

Transmembrane transport               10,219
Lysosome                               1,527
Chaperone                                820
Transcription: dosage compensation     1,128
Neuronal function                     11,956
Cytoskeleton                           7,176
Nucleic acid                           1,735
Unassigned                             4,527
Peroxisome                               115
Nuclear pore                             528
Transcription factor                  11,835
mRNA functions                         6,534
Extracellular material                 6,582
Cilia                                  1,851
Metabolism                            12,635
Protein modification                   1,967
Major sperm protein                      147
Globin                                   125
Trafficking                            2,561
Transmembrane protein                  4,633
Signaling                             26,543
Transcription: general machinery       1,548
Transcript

In [31]:
# Concat all the dataframes into one large dataframe.
# reference_dfs is the dict built when the csv files were read; the name `dfs`
# is not defined at notebook scope on a fresh kernel. A single concat replaces
# growing the frame inside a loop.
reference_data_full_df = (
    pd.concat(list(reference_dfs.values()), ignore_index=True)
    if reference_dfs
    else pd.DataFrame()  # pd.concat raises on an empty list
)
print(f"Total count of Papers {len(reference_data_full_df):>6,}")

Total count of Papers 164,676


In [30]:
# Count the distinct paper ids across all loaded references
unquie_papers = pd.unique(reference_data_full_df['id'])
print(f"Unique Papers {len(unquie_papers):,}")

Unique Papers 34,518


In [7]:
# Function to Get Abstracts for a collection of Papers
def get_abstracts(paper_ids, file_nm, checkpoint_every=500):
    """Look up the abstract of each paper id and checkpoint the results to CSV.

    Each id is passed to WormbaseAPI; a non-empty response becomes one row
    with the paper id in a 'paper_id' column. A "." is printed per paper and
    a "-" whenever nothing was returned for it.

    Parameters
    ----------
    paper_ids : sized iterable
        Paper identifiers to look up.
    file_nm : str
        Used (lower-cased, spaces replaced by underscores) to build the
        dated output file name.
    checkpoint_every : int, optional
        Number of papers between CSV checkpoints (default 500). If something
        crashes, the partial file may allow recovery without a full rerun.

    Returns
    -------
    pandas.DataFrame
        All rows collected; an empty frame when nothing was returned.

    Raises
    ------
    ValueError
        If checkpoint_every is less than 1.
    """
    if checkpoint_every < 1:
        raise ValueError("checkpoint_every must be at least 1")

    formatted_date = datetime.today().strftime('%Y_%m_%d')
    file_name = f"{file_nm.lower().replace(' ', '_')}_references_{formatted_date}.csv"
    wormbase_abstract = WormbaseAPI("field", "paper", "abstract")
    concatenated_df = pd.DataFrame()
    dfs = []
    number_of_rows = len(paper_ids)
    for index, paper_id in enumerate(paper_ids, start=1):
        print(".", end='')
        abstract_data = wormbase_abstract.get_wormbase_data(paper_id)
        if abstract_data:
            abstract_data_dict = {'paper_id': paper_id}
            abstract_data_dict.update(abstract_data)
            abstract_data_df = pd.DataFrame(abstract_data_dict, index=[0])
            dfs.append(abstract_data_df)
        else:
            print("-", end='')

        # Flush the current batch to disk at every checkpoint
        if index % checkpoint_every == 0:
            print(f"{index:>4} of {number_of_rows} {paper_id}")
            concatenated_df = pd.concat([concatenated_df] + dfs, ignore_index=True)
            concatenated_df.to_csv(file_name, index=False)
            print_memory_usage()
            dfs = []  # Reset the list for the next batch

    # Concatenate the remaining DataFrames
    if dfs:
        concatenated_df = pd.concat([concatenated_df] + dfs, ignore_index=True)
        concatenated_df.to_csv(file_name, index=False)
        print_memory_usage()
    return concatenated_df

In [None]:
# Fetch the abstract for every unique paper id.
# CAUTION: get_abstracts performs one WormbaseAPI lookup per paper, so this can take a long time.
abstract_df = get_abstracts(unquie_papers, "abstract")
print(f"Size {len(abstract_df)}")

In [82]:
%%capture notebook_variables
%whos
# What variables are in use

In [83]:
def sorted_whos(notebook_variables):
    """Re-print a captured ``%whos`` listing with its rows sorted by type.

    Parameters
    ----------
    notebook_variables
        Object whose ``stdout`` attribute holds the captured listing text
        (for example the result of ``%%capture``).

    The first two lines (column header and rule) stay at the top; the
    remaining rows are ordered by the text of their type column. A capture
    with fewer than two lines is printed as-is instead of raising.
    """
    # Keep only newline-terminated lines; any trailing partial line is ignored.
    lines = notebook_variables.stdout.split('\n')[:-1]

    # NOTE(review): the column offsets are hard-coded; confirm they match the
    # layout of the captured listing.
    records = [
        (line[0:29].strip(), line[29:42].strip(), line[42:].strip())
        for line in lines
    ]

    header_records, body_records = records[:2], records[2:]
    sorted_records = header_records + sorted(body_records, key=lambda rec: rec[1])
    for record in sorted_records:
        print(f"{record[0]:<29}{record[1]:<12}{record[2]}")
        
# Print the captured %whos listing with its rows grouped by type
sorted_whos(notebook_variables)

Variable                     Type        Data/Info
----------------------------------------------------
noteboo_variables            CapturedIO  Variable                 <...>31389 rows x 5 columns]\n
df                           DataFrame   id    <...>\n[3248 rows x 7 columns]
reference_data_full_df       DataFrame   id  <...>[164676 rows x 7 columns]
wormcat_df                   DataFrame   wc_sequence_id     <...>n[31389 rows x 5 columns]
category_dfs                 dict        n=34
dfs                          dict        n=32
reference_dfs                dict        n=32
current_wormbase_version     function    <function current_wormbas<...>ersion at 0x7f25b095adc0>
df_summary                   function    <function df_summary at 0x7f25a2a283a0>
ontology_to_csv              function    <function ontology_to_csv at 0x7f25b095a3a0>
print_memory_usage           function    <function print_memory_usage at 0x7f25a2a281f0>
refereneces_to_csv           function    <function refereneces_

In [85]:
# Spot check: look up the gene ontology summary for a single gene,
# print it as indented JSON and save the same text to result_<uuid>.json
wormbase_id = "WBGene00008288"
uuid = "999"  # only used to label the output file
wormbase_api = WormbaseAPI("field", "gene", "gene_ontology_summary")
ret_data = wormbase_api.get_wormbase_data(wormbase_id)
pretty_data = json.dumps(ret_data, indent=4)
print(pretty_data)
with open(f"result_{uuid}.json", 'w') as file:
        file.write(pretty_data)


{
    "gene_ontology_summary": {
        "Cellular_component": [
            {
                "name": "centriole",
                "id": "GO:0005814"
            },
            {
                "name": "nucleus",
                "id": "GO:0005634"
            },
            {
                "name": "motile cilium",
                "id": "GO:0031514"
            },
            {
                "name": "cilium",
                "id": "GO:0005929"
            },
            {
                "name": "ciliary basal body",
                "id": "GO:0036064"
            },
            {
                "name": "microtubule",
                "id": "GO:0005874"
            },
            {
                "name": "cytoplasm",
                "id": "GO:0005737"
            },
            {
                "name": "axonemal microtubule",
                "id": "GO:0005879"
            },
            {
                "name": "cytoskeleton",
                "id": "GO:0005856"
            },
  