# Wormbase Phenotype Data

Collect phenotype data from WormBase.

__Notes:__

* We define Poorly Annotated Genes (PAG) as Genes in Category 1 of WormCat with the descriptions ['Unassigned','Transmembrane protein', 'Transmembrane transport']

In [21]:
import numpy as np
import pandas as pd
import math
import requests
import json
import csv
import time
import os
from datetime import datetime

OUTPUT_DATA='./output_data/phenotype_output'

# Get Phenotype Data

For the following:
* Unassigned
* Transcription_factor
* Neuronal_function

In [22]:
# This is what our output record will look like
class phenotype_record:
    """Column layout of one output row.

    ``header`` holds the column names in output order, each class attribute
    named after a column holds that column's position in a row list, and
    ``empty`` is a template row with every column set to ``None``.
    """
    header = ['wormbase_id', 'phenotype_id', 'phenotype_label', 'evidence_type', 'evidence_id', 'evidence_label', 'evidence_class']
    # Positional index of each column, in the same order as ``header``
    (wormbase_id, phenotype_id, phenotype_label, evidence_type,
     evidence_id, evidence_label, evidence_class) = range(len(header))
    # Template row; callers copy it before filling in values
    empty = [None] * len(header)

# Very simple HTTP call    
def call_wormbase(wormbase_id, timeout=30):
    """Request the phenotype field of one gene from the WormBase REST API.

    Args:
        wormbase_id: Gene identifier inserted into the request URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A ``(status_code, json_data)`` tuple. ``json_data`` is the decoded
        response body; for a non-200 response whose body is not valid JSON
        it is an empty dict so the caller can still report the status code.

    Raises:
        requests.exceptions.RequestException: On connection problems or timeout.
        json.JSONDecodeError: If a 200 response does not contain valid JSON.
    """
    api_url = f'http://rest.wormbase.org/rest/field/gene/{wormbase_id}/phenotype'
    response = requests.get(api_url, timeout=timeout)
    try:
        json_data = json.loads(response.text)
    except json.JSONDecodeError:
        # A successful response must be parseable; only error responses
        # are allowed to carry a non-JSON body.
        if response.status_code == 200:
            raise
        json_data = {}
    return response.status_code, json_data

# Process the results
def process_response(status_code, json_data):
    """Turn one API response into phenotype records.

    Args:
        status_code: HTTP status code of the response.
        json_data: Decoded JSON body of the response.

    Returns:
        The records produced by ``phenotype_json_to_dataframe`` when the
        status code is 200; otherwise ``None`` after printing the status
        code and, if the body provides one, its ``'reason'`` entry.
    """
    if status_code == 200:
        return phenotype_json_to_dataframe(json_data)

    print(f"Error code {status_code}")
    # The key must be the string 'reason'; the bare name ``reason`` was
    # undefined and raised NameError on every failed request.
    if isinstance(json_data, dict) and 'reason' in json_data:
        print(json_data['reason'])
    return None

# Flatten the JSON response into a set of records
def phenotype_json_to_dataframe(json_data):
    """Flatten a phenotype response into a list of row lists.

    Despite its name this returns a plain list, not a DataFrame. Each row is
    a copy of ``phenotype_record.empty`` filled in by column position.

    Args:
        json_data: Decoded response; must provide ``["phenotype"]["data"]``
            and ``["name"]``.

    Returns:
        One row per entry of ``["phenotype"]["data"]``. When that value is
        ``None`` a single row carrying only the gene name is returned and a
        notice is printed.
    """
    cols = phenotype_record
    activities = json_data["phenotype"]['data']

    if activities is None:
        placeholder = cols.empty.copy()
        placeholder[cols.wormbase_id] = json_data["name"]
        print(f'No phenotype data for {placeholder[cols.wormbase_id]}')
        return [placeholder]

    rows = []
    for activity in activities:
        row = cols.empty.copy()
        row[cols.wormbase_id] = json_data["name"]

        if 'phenotype' in activity:
            described = activity['phenotype']
            row[cols.phenotype_id] = described['id']
            row[cols.phenotype_label] = described['label']

        if 'evidence' in activity:
            evidence_by_type = activity['evidence']
            # 'Allele' takes precedence over 'RNAi' when both are present
            evidence_type = next(
                (kind for kind in ('Allele', 'RNAi') if kind in evidence_by_type),
                None,
            )
            if evidence_type:
                detail = evidence_by_type[evidence_type]['text']
                row[cols.evidence_type] = evidence_type
                row[cols.evidence_id] = detail['id']
                row[cols.evidence_label] = detail['label']
                row[cols.evidence_class] = detail['class']

        rows.append(row)

    return rows
    
# write the records to a file 
def write_records(filename, records):
    """Write rows to a CSV file, creating it with a header when needed.

    Args:
        filename: Path of the CSV file.
        records: Iterable of row sequences to write.

    If ``filename`` already exists the rows are appended; otherwise the file
    is created and ``phenotype_record.header`` is written before the rows.
    """
    is_new_file = not os.path.isfile(filename)
    open_mode = 'w' if is_new_file else 'a'

    with open(filename, open_mode, newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        if is_new_file:
            csv_writer.writerow(phenotype_record.header)
        csv_writer.writerows(records)


In [23]:

# iterate a collection of wormbase ids
def process_all(genes_to_evaluate_df, filename):
    """Fetch and save phenotype records for every gene in a DataFrame.

    Args:
        genes_to_evaluate_df: DataFrame whose first column holds the
            WormBase IDs to look up.
        filename: CSV file to write. An existing file is deleted first so
            each run starts from an empty output.

    A failure on one gene is printed and the loop moves on to the next gene.
    """
    if os.path.isfile(filename):
        os.remove(filename)

    df_len = len(genes_to_evaluate_df)
    for position, (_, row) in enumerate(genes_to_evaluate_df.iterrows(), start=1):
        try:
            # Use .iloc for the positional lookup: ``row[0]`` on a
            # label-indexed Series is deprecated in pandas, and once it
            # fails the except below would hide that for every gene.
            wormbase_id = row.iloc[0]
            print(f"processing {wormbase_id} {position:04} of {df_len}")
            status_code, json_data = call_wormbase(wormbase_id)
            records = process_response(status_code, json_data)
            if records:
                write_records(filename, records)
            # Short pause between requests (presumably to avoid overloading
            # the WormBase API).
            time.sleep(.25)
        except Exception as e:
            # Deliberate best effort: report the problem and keep going
            print(f"An error occurred: {e}")
        


In [None]:
# Load the WormCat whole-genome category table and give the columns that
# are queried later snake_case names.
wormcat_column_map = {
    'Sequence ID': 'sequence_id',
    'Wormbase ID': 'wormbase_id',
    'Category 1': 'category_1',
    'Category 2': 'category_2',
    'Category 3': 'category_3',
}
wormcat_df = pd.read_csv('./input_data/whole_genome_v2_nov-11-2021.csv').rename(columns=wormcat_column_map)
wormcat_df.columns

In [None]:
# Select the WormBase IDs of all genes whose Category 1 is 'Unassigned'
# and prepare them for the phenotype lookup.
unassigned = wormcat_df.query("category_1 == 'Unassigned'")['wormbase_id']
# process_all iterates DataFrame rows, so wrap the Series in a DataFrame
unassigned_df = unassigned.to_frame()
print(f'{len(unassigned)=}')

# CSV file the phenotype records are written to by process_all
filename = 'unassigned_phenotypes.csv'

# The download is disabled; uncomment the call below to re-run it.
#process_all(unassigned_df,filename)

In [None]:
# Select the WormBase IDs of all genes whose Category 1 is
# 'Neuronal function' and prepare them for the phenotype lookup.
neuronal_function = wormcat_df.query("category_1 == 'Neuronal function'")['wormbase_id']
# process_all iterates DataFrame rows, so wrap the Series in a DataFrame
neuronal_function_df = neuronal_function.to_frame()
# Not the last expression of the cell, so the notebook does not display it
len(neuronal_function)

# CSV file the phenotype records are written to by process_all
filename = 'neuronal_function_phenotypes.csv'

# The download is disabled; uncomment the call below to re-run it.
#process_all(neuronal_function_df,filename)

In [None]:
# Select the WormBase IDs of all genes whose Category 1 is
# 'Transcription factor' and prepare them for the phenotype lookup.
transcription_factor = wormcat_df.query("category_1 == 'Transcription factor'")['wormbase_id']
# process_all iterates DataFrame rows, so wrap the Series in a DataFrame
transcription_factor_df = transcription_factor.to_frame()
# Not the last expression of the cell, so the notebook does not display it
len(transcription_factor_df)

# CSV file the phenotype records are written to by process_all
filename = 'transcription_factor_phenotypes.csv'

# The download is disabled; uncomment the call below to re-run it.
#process_all(transcription_factor_df,filename)

In [None]:
# Select the WormBase IDs of all genes whose Category 1 is 'Unassigned'
# and prepare them for the phenotype lookup.
unassigned = wormcat_df.query("category_1 == 'Unassigned'")['wormbase_id']
# process_all iterates DataFrame rows, so wrap the Series in a DataFrame
unassigned_df = unassigned.to_frame()
print(f' {len(unassigned_df)=}')

# CSV file the phenotype records are written to by process_all
filename = 'unassigned_phenotypes.csv'

# The download is disabled; uncomment the call below to re-run it.
# NOTE(review): the disabled call previously passed transcription_factor_df,
# which would have written transcription-factor records into the unassigned
# file; it now passes the frame built in this cell.
#process_all(unassigned_df,filename)

# Get the Unique Phenotypes

In [None]:
# For each WormCat category: load the raw phenotype records, count how often
# each phenotype label occurs, and write the counts to
# '<raw file name>_labels.csv' with the columns 'phenotype_label' and 'count'.

filenames = ['unassigned_phenotypes.csv','transcription_factor_phenotypes.csv','neuronal_function_phenotypes.csv']
for filename in filenames:
    phenotypes_df = pd.read_csv(f'{OUTPUT_DATA}/{filename}')
    # Name the index and the value column explicitly. Renaming whatever
    # reset_index() produces depends on the pandas version: value_counts()
    # changed how it names its result, and the old rename mapping then
    # yields two columns both called 'count'.
    phenotype_labels = (
        phenotypes_df['phenotype_label']
        .value_counts()
        .rename_axis('phenotype_label')
        .reset_index(name='count')
    )
    labels_path = f'{OUTPUT_DATA}/{os.path.splitext(filename)[0]}_labels.csv'
    phenotype_labels.to_csv(labels_path, index=False)
    print(f'Process {labels_path}')

In [None]:
# Load the phenotype data and unique labels for the wormcat categories ['unassigned','transcription_factor','neuronal_function']


class phenotype:
    """WormCat categories analysed here and their list positions.

    ``categories`` holds the category names; each class attribute named
    after a category holds that category's index in the list.
    """
    categories = ['unassigned','transcription_factor','neuronal_function']
    # Index of each category, in the same order as ``categories``
    unassigned, transcription_factor, neuronal_function = range(3)
    

# Raw phenotype records per category, in phenotype.categories order
phenotypes = [
    pd.read_csv(f'{OUTPUT_DATA}/{category}_phenotypes.csv')
    for category in phenotype.categories
]
# Phenotype label counts per category, in the same order
phenotypes_labels = [
    pd.read_csv(f'{OUTPUT_DATA}/{category}_phenotypes_labels.csv')
    for category in phenotype.categories
]
 

In [None]:
# Rebind unassigned_df to the loaded phenotype records of the 'unassigned'
# category and display them.
unassigned_df = phenotypes[phenotype.unassigned]
unassigned_df

In [None]:

# For every category, extend its label-count table in place with:
#   * 'percent_of_all': label count divided by the number of phenotype
#     records of that category (a fraction, not multiplied by 100)
#   * 'in_<other category>': whether the label also occurs in the other
#     category's label table
for index, category in enumerate(phenotype.categories):
    labels_df = phenotypes_labels[index]
    record_count = len(phenotypes[index])
    # The number printed is the row count of the category's phenotype records
    print(f'Number of phenotype records for {category} = {record_count:,d}')
    labels_df['percent_of_all'] = labels_df['count'] / record_count
    for other_index, other_category in enumerate(phenotype.categories):
        if other_index != index:
            # Vectorised membership test; avoids rebuilding a Python list
            # of the other category's labels for every row
            labels_df[f'in_{other_category}'] = labels_df['phenotype_label'].isin(
                phenotypes_labels[other_index]['phenotype_label'])
    

In [None]:
# Create an Excel file with one sheet per phenotype category ['unassigned','transcription_factor','neuronal_function']
# One writer is kept open for all sheets. Reopening the workbook in append
# mode for each sheet only works with engines that support appending.
with pd.ExcelWriter(f'{OUTPUT_DATA}/wormcat_phenotype_summary.xlsx') as writer:
    for index, category in enumerate(phenotype.categories):
        print(f'Save {category}')
        phenotypes_labels[index].to_excel(writer, sheet_name=category.title(), index=False)


In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

# One set of phenotype labels per category, in phenotype.categories order
venn3_data = [set(labels_df['phenotype_label']) for labels_df in phenotypes_labels[:3]]
venn3_labels = [category.title() for category in phenotype.categories[:3]]
venn3(venn3_data, set_labels=venn3_labels)

plt.title('Phenotype Venn Diagram')

# Save the figure to the output directory before showing it
file_name = 'phenotype_venn_diagram.png'
plt.savefig(f'{OUTPUT_DATA}/{file_name}')
plt.show()

# Essential Genes

* lethal, arrest
* sterile
* lipid

In [None]:
def _label_matches(frame, pattern, essential_kind):
    """Return a copy of the rows whose phenotype_label matches ``pattern``,
    with an added 'essential' column set to ``essential_kind``."""
    matches = frame[frame['phenotype_label'].str.contains(pattern)].copy()
    matches['essential'] = essential_kind
    return matches


# Only rows that carry a phenotype label can be matched against the patterns
unassigned_df = phenotypes[phenotype.unassigned].dropna(subset=['phenotype_label'])

unassigned_lethal_df = _label_matches(unassigned_df, 'lethal|arrest', 'lethal')
print(f'{len(unassigned_lethal_df)=}')

unassigned_sterile_df = _label_matches(unassigned_df, 'sterile', 'sterile')
print(f'{len(unassigned_sterile_df)=}')

unassigned_lipid_df = _label_matches(unassigned_df, 'lipid', 'lipid')
print(f'{len(unassigned_lipid_df)=}')

# Stack the three selections; a row whose label matches more than one
# pattern appears once per matching pattern
essential_df = pd.concat([unassigned_lethal_df, unassigned_sterile_df, unassigned_lipid_df], axis=0)
essential_df



In [None]:
# Count, per wormbase_id, how many rows of essential_df have a non-null
# 'essential' value. The result is a Series indexed by wormbase_id.
grouped_wormbase_id = essential_df.groupby('wormbase_id')
essential_dedup_df = grouped_wormbase_id['essential'].count()
essential_dedup_df

In [None]:
# Display the column names of essential_df
essential_df.columns


In [None]:
# Reduce essential_df to one entry per gene:
#   essential_dedup  maps wormbase_id -> {'lipid': n, 'lethal': n, 'sterile': n}
#   phenotype_label  maps each essential kind -> phenotype labels seen for it
essential1_df = essential_df.drop(columns=['phenotype_id', 'evidence_type',
       'evidence_id', 'evidence_label', 'evidence_class'])
essential_kinds = ('lipid', 'lethal', 'sterile')
essential_dedup = {}
labels_by_kind = {kind: set() for kind in essential_kinds}

# Read the columns by name; positional row[0]/row[1]/row[2] lookups on a
# label-indexed Series are deprecated in pandas.
for wormbase_id, label, kind in zip(essential1_df['wormbase_id'],
                                    essential1_df['phenotype_label'],
                                    essential1_df['essential']):
    counts = essential_dedup.setdefault(wormbase_id, dict.fromkeys(essential_kinds, 0))
    counts[kind] += 1
    labels_by_kind[kind].add(label)

# Sets are not JSON serialisable; sorting also makes the printout stable
phenotype_label = {kind: sorted(labels_by_kind[kind]) for kind in essential_kinds}
print(json.dumps(phenotype_label, indent=4))


In [None]:
# Turn the per-gene counters into column lists (one entry per gene, in
# dictionary order), build a DataFrame from them and save it as CSV.
dataframe_dict = {'wormbase_id': list(essential_dedup)}
for kind in ('lipid', 'lethal', 'sterile'):
    dataframe_dict[kind] = [counts[kind] for counts in essential_dedup.values()]

essential_dedup_df = pd.DataFrame(dataframe_dict)
essential_dedup_df.to_csv(f'{OUTPUT_DATA}/essential_pags.csv', index=False)

In [None]:
# Print, for each essential kind, the number of genes with at least one
# matching phenotype. The kind is left-aligned in a 7-character field.
for kind in ('lipid', 'lethal', 'sterile'):
    genes_with_kind = essential_dedup_df[essential_dedup_df[kind] > 0]
    print(f"{kind:<7} {len(genes_with_kind)}")

In [None]:
# 672 as a percentage of 6343 (the ratio is rounded to 4 decimal places
# before scaling by 100).
# NOTE(review): both numbers are hard-coded; presumably 672 essential genes
# out of 6343 unassigned genes - TODO confirm against the cells above.
round(672/6343,4)*100

# All Protein Coding Genes

In [16]:
# Load the gene ID table (the file has no header row, so the column names
# are supplied here) and keep only live, protein-coding genes.
column_names = ['Code_1', 'Wormbase_id', 'Code_2','Gene_id','id_status','gene_type']
gene_ids_df = pd.read_csv('./input_data/c_elegans.PRJNA13758.WS287.geneIDs.txt', header=None, names=column_names)
is_live = gene_ids_df['id_status'] == 'Live'
is_protein_coding = gene_ids_df['gene_type'] == 'protein_coding_gene'
gene_ids_df = gene_ids_df[is_live & is_protein_coding]
len(gene_ids_df)


19985

In [None]:
# Look up phenotype data for every gene left in gene_ids_df.
wormbase_ids = gene_ids_df['Wormbase_id']
# process_all iterates DataFrame rows, so wrap the Series in a DataFrame
wormbase_ids_df = wormbase_ids.to_frame()
print(f'{len(wormbase_ids_df)=}')

# CSV file the phenotype records are written to (relative to the working
# directory, not OUTPUT_DATA)
filename = 'protein_coding_phenotypes.csv'

# Unlike the per-category cells above, this download is enabled: it issues
# one request per gene with a pause after each one.
process_all(wormbase_ids_df,filename)

# Appendix

In [None]:
# Drop the 'Automated Description' filed to save space when checking into git hub.
# Also speed up load time of this file
import pandas as pd

# Reload the WormCat whole-genome category table with snake_case names for
# the columns that are queried elsewhere in this notebook.
appendix_column_map = {
    'Sequence ID': 'sequence_id',
    'Wormbase ID': 'wormbase_id',
    'Category 1': 'category_1',
    'Category 2': 'category_2',
    'Category 3': 'category_3',
}
wormcat_df = pd.read_csv('./input_data/whole_genome_v2_nov-11-2021.csv').rename(columns=appendix_column_map)
wormcat_df.columns

In [None]:
# Remove the 'Automated Description' column when it is present; a table
# that no longer has the column is left with the same contents.
delete_column = 'Automated Description'
wormcat_df = wormcat_df.drop(columns=[delete_column], errors='ignore')
wormcat_df

In [None]:
# Write the table back over the input file it was loaded from, without the
# DataFrame index. Note that the column names renamed above are saved too.
wormcat_df.to_csv('./input_data/whole_genome_v2_nov-11-2021.csv', index=False)