## Extract phenotype data from Wormbase

In [2]:
import csv
import time

# Format the time elapsed since `start` (until `end`, or now if omitted) as hours, minutes and seconds
def formatted_elapsed_time(start,end=None):
    """Format the time elapsed between two timestamps as hours, minutes and seconds.

    Args:
        start: Start timestamp in seconds (e.g. a value from ``time.time()``).
        end: End timestamp in seconds. When omitted (``None``), the current
            ``time.time()`` is used.

    Returns:
        A string of the form ``'Time: hours=H minutes=M seconds=S.SS'``.
        Hours and minutes are floor-division results, so they are rendered
        as floats (e.g. ``5.0``) whenever the timestamps are floats.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * seconds_per_minute

    # Identity check rather than `== None`: equality can be overridden by the
    # operand's __eq__, which would silently discard a caller-supplied `end`.
    if end is None:
        end = time.time()

    total_seconds = end - start
    hours, remainder = divmod(total_seconds, seconds_per_hour)
    minutes, seconds = divmod(remainder, seconds_per_minute)
    # The local names `hours`, `minutes` and `seconds` are part of the output:
    # the f-string `=` specifier prints the expression text before each value.
    return f'Time: {hours=} {minutes=} {seconds=:.2f}'

# Given a phenotype_dict and a phenotype_list, add only the new (unique) phenotypes from the list to the dictionary
def unique_phenotypes(phenotype_dict, phenotype_list):
    """Add phenotypes from ``phenotype_list`` that are not yet in ``phenotype_dict``.

    Every entry must provide ``'wbpt_id'`` and ``'wbpt_name'``; the keys
    ``'evidence_allele'`` and ``'evidence_rnai'`` are optional. The stored
    evidence is the literal ``'allele'`` when ``evidence_allele`` is truthy,
    otherwise the ``evidence_rnai`` value when that is truthy, otherwise ``""``.
    The first occurrence of an ID wins; later duplicates are ignored.

    Returns:
        The same ``phenotype_dict`` object, updated in place, mapping each ID
        to ``{'name': ..., 'evidence': ...}``.
    """
    for entry in phenotype_list:
        phenotype_id = entry['wbpt_id']
        phenotype_name = entry['wbpt_name']

        if entry.get('evidence_allele'):
            evidence = 'allele'
        else:
            # Falls back to the empty string when there is no rnai evidence either
            evidence = entry.get('evidence_rnai') or ""

        # setdefault only inserts when the ID has not been recorded before
        phenotype_dict.setdefault(
            phenotype_id, {'name': phenotype_name, 'evidence': evidence}
        )
    return phenotype_dict

# Given a phenotype_list return a dictionary with unique phenotypes and a count of 
# the occurrences of that phenotype in the list
def count_phenotypes(phenotype_list):
    """Tally how many times each phenotype ID occurs in ``phenotype_list``.

    Returns:
        A dict mapping each entry's ``'wbpt_id'`` to its number of
        occurrences, keyed in order of first appearance.
    """
    tally = {}
    for entry in phenotype_list:
        phenotype_id = entry['wbpt_id']
        tally[phenotype_id] = tally.get(phenotype_id, 0) + 1
    return tally

# Write the unique phenotypes to a file
def phenotypes_to_csv(phenotype_dict, filename):
    """Write the unique phenotypes to a CSV file, one row per phenotype ID.

    Args:
        phenotype_dict: Mapping of phenotype ID to a dict that may hold
            ``'name'`` and ``'evidence'``; a missing key is written as an
            empty cell.
        filename: Path of the CSV file to create (overwritten if it exists).

    The file starts with an ``ID, Name, Evidence`` header and its rows are
    sorted by phenotype ID.
    """
    # Explicit UTF-8: without `encoding`, open() uses the platform's locale
    # encoding, so the same data could produce different bytes (or raise
    # UnicodeEncodeError) on another machine.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["ID", "Name","Evidence"])  # header
        for wbpt_id, wbpt_data in sorted(phenotype_dict.items()):
            writer.writerow([wbpt_id, wbpt_data.get('name',''), wbpt_data.get('evidence','')])

# Write the phenotypes for each gene to a file
def gene_phenotypes_to_csv(gene_phenotypes, filename):
    """Write per-gene phenotype counts to a CSV file, one row per (gene, phenotype).

    Args:
        gene_phenotypes: Mapping of gene ID to a dict of
            ``{phenotype: count}``.
        filename: Path of the CSV file to create (overwritten if it exists).

    The file starts with a ``wormbase_id, phenotype, count`` header. Genes are
    written in sorted order; within a gene, phenotypes keep the iteration
    order of that gene's dict.
    """
    # Explicit UTF-8: without `encoding`, open() uses the platform's locale
    # encoding, so the output would differ (or fail) across machines.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["wormbase_id", "phenotype", "count"])  # header
        for gene, phenotypes in sorted(gene_phenotypes.items()):
            writer.writerows(
                [gene, phenotype, count] for phenotype, count in phenotypes.items()
            )



In [1]:
import pandas as pd
# Load the gene ID table
gene_ids_df = pd.read_csv('./wormbase_data/c_elegans.PRJNA13758.WS293.geneIDs.csv')

# Keep only the rows whose Gene_Type is "protein_coding_gene" or "gene";
# this is the set treated as protein-coding genes below
protein_coding_genes_df = gene_ids_df.loc[
    gene_ids_df["Gene_Type"].isin(["protein_coding_gene", "gene"])
]

# Sanity check: 21,506 genes are expected to be processed
# (presumably specific to this input file -- update if the input changes)
assert len(protein_coding_genes_df) == 21_506


In [4]:
from pub_worm.wormbase.wormbase_api import WormbaseAPI

start_time = time.time()
# Set up the API class to get phenotype data from WormBase
wormbase_api = WormbaseAPI("field", "gene", "phenotype")

genes_to_proccess = protein_coding_genes_df['Wormbase_Id'].tolist()

# This is a multi-process call using 10 CPUs
wormbase_data_results = wormbase_api.get_wormbase_data_cpu(genes_to_proccess, 10)
print(formatted_elapsed_time(start_time))

gene_phenotypes = {}
phenotype_dict = {}
count=0
for wormbase_data_result in wormbase_data_results:
    # Take the first (wormbase_id, data) pair of the result
    wormbase_id, phenotype_list_dict = next(iter(wormbase_data_result.items()))
    # Progress reporting (disabled):
    # if count % 100 == 0:
    #     print(f"count {count+1:>5} wormbase_id {wormbase_id}")
    # count +=1

    # Only results that carry a 'phenotype_list' are processed; anything else is skipped
    if 'phenotype_list' not in phenotype_list_dict:
        continue

    # A lone dict means there is only one phenotype; wrap it in a list so
    # both shapes are processed the same way
    phenotype_list = phenotype_list_dict['phenotype_list']
    if isinstance(phenotype_list, dict):
        phenotype_list = [phenotype_list]

    phenotype_dict = unique_phenotypes(phenotype_dict, phenotype_list)
    phenotype_counts_dict = count_phenotypes(phenotype_list)
    gene_phenotypes[wormbase_id] = phenotype_counts_dict


phenotypes_to_csv(phenotype_dict, "wormbase_data/phenotypes.csv")
gene_phenotypes_to_csv(gene_phenotypes, "wormbase_data/gene_phenotypes.csv")


Check if you have a connection!! | Retry- 1 | Response msg- <urlopen error [Errno 60] Operation timed out>
Check if you have a connection!! | Retry- 1 | Response msg- <urlopen error [Errno 60] Operation timed out>
Time: hours=0.0 minutes=5.0 seconds=23.53


In [6]:
# Load the lethal phenotypes CSV and order it by the 'ID' column (missing IDs last)
phenotypes_df = (
    pd.read_csv('./papers/predicting_gene_essentiality/data/lethal_phenotypes.csv')
    .sort_values(by='ID', na_position='last')
)
phenotypes_df

Unnamed: 0,Name,ID
71,one cell arrest early emb,WBPhenotype:0000040
61,egg size defective early emb,WBPhenotype:0000044
2,embryonic lethal,WBPhenotype:0000050
133,maternal effect lethal emb,WBPhenotype:0000052
134,paralyzed arrested elongation two fold,WBPhenotype:0000053
...,...,...
43,cleavage furrow initiation defective early emb,WBPhenotype:0001885
6,cleavage furrow initiation defective early emb,WBPhenotype:0001885
45,cleavage furrow termination defective early emb,WBPhenotype:0001886
8,cleavage furrow termination defective early emb,WBPhenotype:0001886


# Appendix

In [None]:
!pip install --upgrade  pub_worm