# Analyze the current Wormcat Annotation List with the Current Wormbase DB

1. Find any genes that are on the Wormcat Annotation List but are marked as Dead (Not Live). 
2. Find any genes that are marked as Live and 'protein coding' but are not on the Wormcat Annotation List. 
3. Find any genes that have changed their sequence_id."

Create: `wormcat_genes_for_review.csv`

In [18]:
import numpy as np
import pandas as pd
import math
import requests
import json
import csv
import time
import os
from datetime import datetime

In [17]:
%%bash
# Pull Wormcat Category List from source location as needed
cd /media/data1/Code/Notebooks/UMass_Med/unknown_genes/input_data

WORMCAT_DB="whole_genome_v2_nov-11-2021.csv"
WORMCAT_REPO=" http://www.wormcat.com/static/download"

if [ -f "$WORMCAT_DB" ]; then
   echo "File exists and will not be pulled."
else
   echo "File does not exist locally pulling from repo."
   wget -q ${WORMCAT_REPO}/${WORMCAT_DB}
fi


File exists and will not be pulled.


In [60]:
# Get the most current Wormbase DB
def current_wormbase_version():
    api_url = f'http://rest.wormbase.org//rest/database/version'
    # Absolutley no error checking is done!!
    response = requests.get(api_url)
    json_data = json.loads(response.text)
    if  response.status_code == 200:
        return json_data['data']
    else:
        return {'error':'somthing is not right'}
    
current_wormbase_version()

'WS291'

In [62]:
%%bash
# Pull Wormbase data from source location

cd /media/data1/Code/Notebooks/UMass_Med/unknown_genes/input_data

#### UPDATE WORMBASE DB ####
wormbase_version='WS291'

echo "Starting Wormbase Download with version $wormbase_version"

base_url="ftp://ftp.wormbase.org/pub/wormbase/releases/${wormbase_version}/species/c_elegans/PRJNA13758"
gene_ids="c_elegans.PRJNA13758.${wormbase_version}.geneIDs.txt.gz"

wget -q ${base_url}/annotation/${gene_ids}
gunzip --force ${gene_ids}

# Create GeneIDs.csv
gene_ids_txt=$(echo "$gene_ids" | sed 's/.\{3\}$//') # remove .gz
gene_ids_csv=$(echo "$gene_ids" | sed 's/.\{7\}$//') # remove .txt.gz
gene_ids_csv="${gene_ids_csv}.csv"                   # add .csv

# Drop the first column and Only include Live genes
awk -F',' '$5=="Live" {print $2","$3","$4","$6}' "$gene_ids_txt" > "$gene_ids_csv"
# Add Header line  
sed -i '1iWormbase_Id,Gene_name,Sequence_id,Gene_Type' "$gene_ids_csv"
echo created $gene_ids_csv

Starting Wormbase Download with version WS291
created c_elegans.PRJNA13758.WS291.geneIDs.csv


In [66]:
# Load the Wormbase Annotation List

wormcat_df = pd.read_csv('./input_data/whole_genome_v2_nov-11-2021.csv') 
wormcat_df = wormcat_df.rename(columns={'Sequence ID':'wc_sequence_id','Wormbase ID':'wormbase_id','Category 1':'category_1','Category 2':'category_2','Category 3':'category_3'})
wormcat_df = wormcat_df.drop(columns=['Automated Description'])
print(len(wormcat_df))
wormcat_df.head()

31389


Unnamed: 0,wc_sequence_id,wormbase_id,category_1,category_2,category_3
0,F15H10.3,WBGene00000144,Cell cycle,Cell cycle: APC,Cell cycle: APC
1,F35G12.9,WBGene00000145,Cell cycle,Cell cycle: APC,Cell cycle: APC
2,C09H10.7,WBGene00007501,Cell cycle,Cell cycle: APC,Cell cycle: APC
3,K06H7.6,WBGene00000143,Cell cycle,Cell cycle: APC,Cell cycle: APC
4,B0511.9,WBGene00015235,Cell cycle,Cell cycle: APC,Cell cycle: APC


In [69]:
# Load the wormbase category list

gene_ids_df = pd.read_csv('./input_data/c_elegans.PRJNA13758.WS291.geneIDs.txt') 
#gene_ids_df = wormcat_df.rename(columns={'Sequence ID':'sequence_id','Wormbase ID':'wormbase_id','Category 1':'category_1','Category 2':'category_2','Category 3':'category_3'})
gene_ids_df.columns = ['code','wormbase_id','gene_name','wb_sequence_id','status','coding']
gene_ids_df = gene_ids_df.drop(columns=['code'])
gene_ids_df.head()

Unnamed: 0,wormbase_id,gene_name,wb_sequence_id,status,coding
0,WBGene00000002,aat-1,F27C8.1,Live,protein_coding_gene
1,WBGene00000003,aat-2,F07C3.7,Live,protein_coding_gene
2,WBGene00000004,aat-3,F52H2.2,Live,protein_coding_gene
3,WBGene00000005,aat-4,T13A10.10,Live,protein_coding_gene
4,WBGene00000006,aat-5,C55C2.5,Live,protein_coding_gene


In [81]:
# Join the two dataframes

merged_df = pd.merge(gene_ids_df, wormcat_df, on='wormbase_id', how='left')
merged_df.head()
#merged_df.to_csv('merged.csv',index=False)

Unnamed: 0,wormbase_id,gene_name,wb_sequence_id,status,coding,wc_sequence_id,category_1,category_2,category_3
0,WBGene00000002,aat-1,F27C8.1,Live,protein_coding_gene,F27C8.1,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid
1,WBGene00000003,aat-2,F07C3.7,Live,protein_coding_gene,F07C3.7,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid
2,WBGene00000004,aat-3,F52H2.2,Live,protein_coding_gene,F52H2.2,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid
3,WBGene00000005,aat-4,T13A10.10,Live,protein_coding_gene,T13A10.10,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid
4,WBGene00000006,aat-5,C55C2.5,Live,protein_coding_gene,C55C2.5,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid


In [82]:
# Find any Genes that are on the Wormcat Annotation List but are marked as Dead (Not Live)
dead_genes_rows = merged_df[(merged_df['status'] != 'Live') & merged_df['category_1'].notna()]
dead_genes_rows = dead_genes_rows.copy()
dead_genes_rows['change_code']='Dead Genes'
print(len(dead_genes_rows))
dead_genes_rows.head()

20


Unnamed: 0,wormbase_id,gene_name,wb_sequence_id,status,coding,wc_sequence_id,category_1,category_2,category_3,change_code
7426,WBGene00007501,,C09H10.7,Dead,gene,C09H10.7,Cell cycle,Cell cycle: APC,Cell cycle: APC,Dead Genes
7855,WBGene00007930,,C34D1.2,Dead,gene,C34D1.2,Transcription factor,Transcription factor: ZF,Transcription factor: ZF,Dead Genes
10089,WBGene00010164,,F56H6.3,Dead,gene,F56H6.3,Unassigned,Unassigned,Unassigned,Dead Genes
14417,WBGene00014492,,R166.6,Dead,gene,R166.6,Non-coding RNA,Non-coding RNA: ncRNA,Non-coding RNA: ncRNA,Dead Genes
16267,WBGene00016342,,,Dead,gene,C33C12.10,Unassigned,Unassigned,Unassigned: regulated by multiple stresses,Dead Genes


In [83]:
# Find any Genes that are marked as Live and "protien coding" but are not on the Wormcat Annotation List
not_annotated_rows = merged_df[(merged_df['coding'] == 'protein_coding_gene') & (merged_df['status'] == 'Live') & (merged_df['category_1'].isna())]
not_annotated_rows = not_annotated_rows.copy()
not_annotated_rows['change_code']='Not Annotated'
print(len(not_annotated_rows))
not_annotated_rows.head()

154


Unnamed: 0,wormbase_id,gene_name,wb_sequence_id,status,coding,wc_sequence_id,category_1,category_2,category_3,change_code
466,WBGene00000528,clh-1,T27D12.2,Live,protein_coding_gene,,,,,Not Annotated
792,WBGene00000859,cwp-1,C37H5.10,Live,protein_coding_gene,,,,,Not Annotated
1985,WBGene00002052,ifa-4,K05B2.3,Live,protein_coding_gene,,,,,Not Annotated
2910,WBGene00002977,lev-10,Y105E8A.7,Live,protein_coding_gene,,,,,Not Annotated
3535,WBGene00003602,nhr-3,H01A20.1,Live,protein_coding_gene,,,,,Not Annotated


In [84]:
# Find any Genes that have changed there sequence_id
changed_seq_id_rows = merged_df[(merged_df['wb_sequence_id'].str.lower() != merged_df['wc_sequence_id'].str.lower()) & (merged_df['status'] == 'Live') &merged_df['category_1'].notna()]
changed_seq_id_rows = changed_seq_id_rows.copy()
changed_seq_id_rows['change_code']='Sequence Id Changed'
print(len(changed_seq_id_rows))
changed_seq_id_rows.head()


1


Unnamed: 0,wormbase_id,gene_name,wb_sequence_id,status,coding,wc_sequence_id,category_1,category_2,category_3,change_code
10641,WBGene00010716,lge-1,K09C8.4,Live,protein_coding_gene,K08E3.5,Metabolism,Metabolism: carbohydrate,Metabolism: carbohydrate,Sequence Id Changed


In [85]:
concatenated_df = pd.concat([dead_genes_rows, changed_seq_id_rows, not_annotated_rows], ignore_index=True)
column_order = ['wormbase_id', 'change_code', 'status', 'coding', 'gene_name', 'wb_sequence_id', 'wc_sequence_id', 'category_1', 'category_2', 'category_3']
concatenated_df = concatenated_df.reindex(columns=column_order)
concatenated_df.to_csv('wormcat_genes_for_review.csv',index=False)