# Update WormCat Annotations

1. Find Changed Sequence IDs
2. Find Dead Genes
3. Find New Genes (Added Live)
4. Update Annotation text

# Pull Data from Wormbase
Pull the Gene IDs from both the WormBase release currently used by WormCat and the latest release available on the WormBase website
* Latest from Wormbase website WS289
* Current WormCat version used WS270

Also pull the functional gene descriptions from the latest version

In [1]:
%%bash
# Pull down data from Wormbase and unzip
# INPUT_DATA is the download directory; get_wormbase_data reads it as a global.
INPUT_DATA="./input_data"
# Download one annotation file for a WormBase release and decompress it in place.
#   $1 - WormBase release tag, e.g. "WS289"
#   $2 - file suffix including the .gz extension, e.g. "geneIDs.txt.gz"
# Reads the global INPUT_DATA as the download directory.
# Returns wget's non-zero status (without running gunzip) when the download fails.
get_wormbase_data() {
    local WORMBASE_VERSION="$1"
    local FILE_ROOT="$2"
    local BASE_FTP="ftp://ftp.wormbase.org/pub/wormbase/releases"
    local SPECIES_DIR="species/c_elegans/PRJNA13758/annotation"
    local FILE_PREFIX="c_elegans.PRJNA13758"
    local FILE_NAME="${FILE_PREFIX}.${WORMBASE_VERSION}.${FILE_ROOT}"

    # Every expansion is quoted so values containing spaces or glob characters
    # are passed as a single argument instead of being split by the shell.
    wget -nv -P "${INPUT_DATA}" "${BASE_FTP}/${WORMBASE_VERSION}/${SPECIES_DIR}/${FILE_NAME}" || return
    # -f overwrites a previously decompressed copy so the cell can be re-run.
    gunzip -f "${INPUT_DATA}/${FILE_NAME}"
}

# Pull down geneIDs.txt for one WormBase release.
#   $1 - WormBase release tag, e.g. "WS289"
get_geneids() {
    local WORMBASE_VERSION="$1"
    # Quoted so the release tag is always forwarded as exactly one argument.
    get_wormbase_data "${WORMBASE_VERSION}" "geneIDs.txt.gz"
}

# Pull down functional_descriptions.txt for one WormBase release.
#   $1 - WormBase release tag, e.g. "WS289"
get_functional_descriptions() {
    local WORMBASE_VERSION="$1"
    # Quoted so the release tag is always forwarded as exactly one argument.
    get_wormbase_data "${WORMBASE_VERSION}" "functional_descriptions.txt.gz"
}

# Get the gene ID data for the latest WormBase release (WS289) and for the
# older release that WormCat currently uses (WS270)
get_geneids "WS289"
get_geneids "WS270"

# Get the functional descriptions of genes for the latest WormBase release
get_functional_descriptions "WS289"



2023-08-28 11:32:24 URL: ftp://ftp.wormbase.org/pub/wormbase/releases/WS289/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS289.geneIDs.txt.gz [416537] -> "./input_data/c_elegans.PRJNA13758.WS289.geneIDs.txt.gz" [1]
2023-08-28 11:32:24 URL: ftp://ftp.wormbase.org/pub/wormbase/releases/WS270/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS270.geneIDs.txt.gz [407443] -> "./input_data/c_elegans.PRJNA13758.WS270.geneIDs.txt.gz" [1]
2023-08-28 11:32:25 URL: ftp://ftp.wormbase.org/pub/wormbase/releases/WS289/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS289.functional_descriptions.txt.gz [2472804] -> "./input_data/c_elegans.PRJNA13758.WS289.functional_descriptions.txt.gz" [1]


### Convert the functional gene descriptions to CSV format

In [2]:
import pandas as pd
from convert_func_desc_to_csv import convert_functional_description_to_csv

# Convert the decompressed WS289 functional-descriptions text file to CSV,
# then load that CSV into a DataFrame.
from_file_nm = './input_data/c_elegans.PRJNA13758.WS289.functional_descriptions.txt'
to_file_nm = './output_data/functional_descriptions.csv'
# NOTE(review): presumably ./output_data must already exist before this call — confirm.
convert_functional_description_to_csv(from_file_nm, to_file_nm)
functional_descriptions_df = pd.read_csv(to_file_nm, low_memory=False)
# Report how many description rows were loaded.
print(f"{len(functional_descriptions_df)=:,}")

len(functional_descriptions_df)=52,109


### Read the Gene IDs into a Dataframe

In [3]:
import pandas as pd
# Note: The raw geneIDs data has no header row, so column names are assigned when it is read.
# Directory that read_geneids loads the geneIDs files from.
INPUT_DATA='./input_data'

def read_geneids(wormbase_version, prefix_column_nm=True, input_dir=None):
    """Load a WormBase ``geneIDs.txt`` file into a DataFrame.

    The file is read as headerless CSV with six columns. The first column is
    discarded; the remaining five are named ``Wormbase_Id``, ``Gene_name``,
    ``Sequence_id``, ``Status`` and ``Gene_Type``.

    Args:
        wormbase_version: Release tag used in the file name, e.g. ``"WS289"``.
        prefix_column_nm: When True (default), every column name is prefixed
            with ``"<wormbase_version>_"`` so frames from different releases
            can be merged without name clashes.
        input_dir: Directory containing the file. Defaults to the
            module-level ``INPUT_DATA`` when omitted.

    Returns:
        A DataFrame with one row per line of the file and the five named columns.
    """
    FILE_PREFIX = "c_elegans.PRJNA13758"
    FILE_ROOT = "geneIDs.txt"

    # Resolve the default lazily so callers can point at any directory
    # without having to rebind the global.
    if input_dir is None:
        input_dir = INPUT_DATA

    gene_id_file = f'{input_dir}/{FILE_PREFIX}.{wormbase_version}.{FILE_ROOT}'
    columns = ["junk", "Wormbase_Id", "Gene_name", "Sequence_id", "Status", "Gene_Type"]
    if prefix_column_nm:
        columns = [f"{wormbase_version}_{column}" for column in columns]
    geneids_df = pd.read_csv(gene_id_file, low_memory=False, header=None, names=columns)
    # columns[0] is the (possibly prefixed) "junk" column.
    return geneids_df.drop(columns=columns[0])

# Load the gene IDs for both releases; columns are prefixed with the release tag.
wormbase_270_df = read_geneids("WS270")
wormbase_289_df = read_geneids("WS289")

# Report the row count of each release and the difference between them.
print(f"{len(wormbase_270_df)=:,}")
print(f"{len(wormbase_289_df)=:,}")
print(f"{len(wormbase_289_df)-len(wormbase_270_df)=:,}")


len(wormbase_270_df)=51,254
len(wormbase_289_df)=52,109
len(wormbase_289_df)-len(wormbase_270_df)=855


In [4]:
# Show the column names assigned to the WS270 gene ID frame.
column_names = wormbase_270_df.columns.tolist()
column_names

['WS270_Wormbase_Id',
 'WS270_Gene_name',
 'WS270_Sequence_id',
 'WS270_Status',
 'WS270_Gene_Type']

### Load the WormCat Annotation file

In [5]:
# Load the current WormCat annotation file
# (INPUT_DATA is re-assigned here to the same value it was given earlier in the notebook.)
INPUT_DATA='./input_data'
WORMCAT_ANNOTATIONS='whole_genome_v2_nov-11-2021.csv'
wormcat_file = f"{INPUT_DATA}/{WORMCAT_ANNOTATIONS}"

wormcat_df = pd.read_csv(wormcat_file, low_memory=False)
# Report how many annotation rows were loaded.
print(f"{len(wormcat_df)=:,}")

len(wormcat_df)=31,389


### Join the Wormbase 270 Gene IDs with WormCat
Do some simple tests on the data to see that they all match

In [6]:
# Left-join the WS270 gene IDs onto the WormCat annotations, keyed on the WormBase gene ID.
# wormcat_df is left unmodified on purpose: renaming its 'Wormbase ID' column in place
# after the merge would make this cell raise a KeyError when re-run (the merge key would
# no longer exist), and the later cells read 'Wormbase ID' from the merged frame.
wormcat_270_df = wormcat_df.merge(wormbase_270_df, how='left', left_on='Wormbase ID', right_on='WS270_Wormbase_Id')

# A left join keeps every annotation row; extra rows would mean duplicated WS270 gene IDs.
assert len(wormcat_270_df) == len(wormcat_df), "WS270 merge changed the number of WormCat rows"
print(f"{len(wormcat_270_df)=:,}")
#wormcat_270_df

len(wormcat_270_df)=31,389


In [8]:
# Sanity check: every WormCat gene is expected to have status 'Live' in WS270.
wormcat_270_df['WS270_Status'].value_counts()
# And that is exactly what we find!

Live    31389
Name: WS270_Status, dtype: int64

In [9]:
# Sanity check: WormCat's sequence IDs are expected to match WS270's (case-insensitively).
ws270_seq_match = (
    wormcat_270_df['Sequence ID'].str.upper()
    .eq(wormcat_270_df['WS270_Sequence_id'].str.upper())
    .astype(int)
)
# The flag lives on a separate frame, so wormcat_270_df keeps its original columns.
wormcat_270_flagged_df = wormcat_270_df.assign(sequence_ids_equal=ws270_seq_match)
result = wormcat_270_flagged_df[wormcat_270_flagged_df['sequence_ids_equal'] == 0]
result
# But we do find one mismatch

Unnamed: 0,Sequence ID,Wormbase ID,Category 1,Category 2,Category 3,Automated Description,WS270_Wormbase_Id,WS270_Gene_name,WS270_Sequence_id,WS270_Status,WS270_Gene_Type,sequence_ids_equal
1994,K08E3.5,WBGene00010716,Metabolism,Metabolism: carbohydrate,Metabolism: carbohydrate,K08E3.5 is an ortholog of human UGP2\; is pred...,WBGene00010716,lge-1,K09C8.4,Live,protein_coding_gene,0


### Join WormCat with Wormbase 289 Gene Ids
This will give us the start of the New Annotation List


In [10]:
# Left-join from the WS289 gene list so every WS289 gene is kept; genes with no
# matching WormCat row get NaN in the WormCat (and WS270) columns.
wormbase_289_wormcat_270_df = wormbase_289_df.merge(wormcat_270_df, how='left', left_on='WS289_Wormbase_Id', right_on='Wormbase ID')


In [11]:
#column_names = wormbase_289_wormcat_270_df.columns.tolist()
#column_names

#### How Many Current WormCat IDs have Died?

In [12]:
# Rows whose WS289 status is anything other than 'Live'.
dead_ids_df = wormbase_289_wormcat_270_df[~wormbase_289_wormcat_270_df['WS289_Status'].isin(['Live'])]
# Keep only the ones that matched a WormCat row ('Wormbase ID' comes from the WormCat side of the join).
wormbase_dead_ids_df = dead_ids_df[dead_ids_df['Wormbase ID'].notna()]
#pd.set_option('display.max_rows', None)
# Restore pandas' default row-display limit.
pd.reset_option('display.max_rows')
print(f"{len(wormbase_dead_ids_df)=}")
#wormbase_dead_ids_df

len(wormbase_dead_ids_df)=20


#### How Many Current WormCat Sequence IDs have changed?

In [13]:
# Compare WormCat's sequence IDs with the WS289 ones (case-insensitively); ideally none differ.
ws289_seq_match = (
    wormbase_289_wormcat_270_df['Sequence ID'].str.upper()
    .eq(wormbase_289_wormcat_270_df['WS289_Sequence_id'].str.upper())
    .astype(int)
)
# The flag lives on a separate frame, so the merged table keeps its original columns.
wormbase_289_flagged_df = wormbase_289_wormcat_270_df.assign(sequence_ids_equal=ws289_seq_match)

# A changed ID only counts for genes that are still Live and that WormCat already annotates.
is_changed = wormbase_289_flagged_df['sequence_ids_equal'] == 0
is_live = wormbase_289_flagged_df['WS289_Status'] == 'Live'
in_wormcat = wormbase_289_flagged_df['Wormbase ID'].notna()
result = wormbase_289_flagged_df[is_changed & is_live & in_wormcat]

print(f"{len(result)} WormCat Sequence ID(s) have changed.")
result

1 WormCat Sequence ID(s) have changed.


Unnamed: 0,WS289_Wormbase_Id,WS289_Gene_name,WS289_Sequence_id,WS289_Status,WS289_Gene_Type,Sequence ID,Wormbase ID,Category 1,Category 2,Category 3,Automated Description,WS270_Wormbase_Id,WS270_Gene_name,WS270_Sequence_id,WS270_Status,WS270_Gene_Type,sequence_ids_equal
10642,WBGene00010716,lge-1,K09C8.4,Live,protein_coding_gene,K08E3.5,WBGene00010716,Metabolism,Metabolism: carbohydrate,Metabolism: carbohydrate,K08E3.5 is an ortholog of human UGP2\; is pred...,WBGene00010716,lge-1,K09C8.4,Live,protein_coding_gene,0


### We only want Live genes in the Annotation List

In [14]:
# Restrict the merged table to genes whose WS289 status is 'Live'.
is_live_in_ws289 = wormbase_289_wormcat_270_df['WS289_Status'] == 'Live'
wormbase_289_wormcat_270_live_df = wormbase_289_wormcat_270_df[is_live_in_ws289]

In [15]:
# Count the Live genes per WS289 gene type.
wormbase_289_wormcat_270_live_df['WS289_Gene_Type'].value_counts()

protein_coding_gene      19984
piRNA_gene               15363
ncRNA_gene                8487
pseudogene                2131
gene                      1525
tRNA_gene                  634
snoRNA_gene                346
miRNA_gene                 261
lincRNA_gene               193
snRNA_gene                 129
antisense_lncRNA_gene      100
rRNA_gene                   22
scRNA_gene                   1
Name: WS289_Gene_Type, dtype: int64

#### We do not want genes of Type `piRNA_gene`, `gene`, or `transposable_element_gene` in the Annotation List

__Note__ From reviewing the data we see that all `transposable_element_gene` are also Dead Genes so they do not show up anyway

In [16]:
# Gene types that are excluded from the annotation list.
excluded_gene_types = ['piRNA_gene', 'gene', 'transposable_element_gene']
has_wanted_gene_type = ~wormbase_289_wormcat_270_live_df['WS289_Gene_Type'].isin(excluded_gene_types)
wormbase_289_wormcat_270_live_type_df = wormbase_289_wormcat_270_live_df[has_wanted_gene_type]
print(f"The New Annotation List will have {len(wormbase_289_wormcat_270_live_type_df):,} genes.")

The New Annotation List will have 32,288 genes.


In [17]:
# A gene is new when it has no WormCat match, i.e. 'Wormbase ID' (the WormCat side
# of the join) is NaN. Summing the boolean mask counts those rows directly;
# int() gives a plain Python integer for the thousands-separator format below.
net_new = int(wormbase_289_wormcat_270_live_type_df['Wormbase ID'].isna().sum())
print(f"The New Annotation List will have {net_new:,} new genes.")

The New Annotation List will have 919 new genes.


In [18]:
# The WormCat-side ID columns, the WS289 status, and all WS270 columns are dropped;
# the WS289 ID columns then take over the 'Wormbase ID' / 'Sequence ID' names.
columns_to_drop = (
    ['Wormbase ID', 'Sequence ID', 'WS289_Status']
    + ['WS270_Wormbase_Id', 'WS270_Gene_name', 'WS270_Sequence_id', 'WS270_Status', 'WS270_Gene_Type']
)

wormcat_new_df = (
    wormbase_289_wormcat_270_live_type_df
    .drop(columns=columns_to_drop)
    .rename(columns={'WS289_Wormbase_Id': 'Wormbase ID', 'WS289_Sequence_id': 'Sequence ID'})
)
wormcat_new_df

Unnamed: 0,Wormbase ID,WS289_Gene_name,Sequence ID,WS289_Gene_Type,Category 1,Category 2,Category 3,Automated Description
0,WBGene00000001,aap-1,Y110A7A.10,protein_coding_gene,Signaling,Signaling: lipid,Signaling: lipid: kinase PI3,aap-1 is an ortholog of human PIK3R1 (phosphoi...
1,WBGene00000002,aat-1,F27C8.1,protein_coding_gene,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid,aat-1 is an ortholog of human SLC7A5 (solute c...
2,WBGene00000003,aat-2,F07C3.7,protein_coding_gene,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid,aat-2 is an ortholog of human SLC7A11 (solute ...
3,WBGene00000004,aat-3,F52H2.2,protein_coding_gene,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid,"aat-3 is an ortholog of human SLC7A5, SLC7A6 (..."
4,WBGene00000005,aat-4,T13A10.10,protein_coding_gene,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid,aat-4 is an ortholog of human SLC7A11 (solute ...
...,...,...,...,...,...,...,...,...
52104,WBGene00306126,cone-1,Y54F10AM.16,protein_coding_gene,,,,
52105,WBGene00306131,,Y34B4A.20,protein_coding_gene,,,,
52106,WBGene00306132,,F54D10.10,protein_coding_gene,,,,
52107,WBGene00306133,azyx-1,F42G4.11,protein_coding_gene,,,,


In [19]:
# Columns of the functional-descriptions table that are not carried into the annotation list.
columns_to_drop = ['Wormbase_Id', 'gene_nm', 'sequence_id', 'concise_description', 'gene_class_description']

# Attach the functional descriptions by WormBase gene ID, then discard the unneeded columns.
wormcat_new_desc_df = (
    wormcat_new_df
    .merge(functional_descriptions_df, how='left', left_on='Wormbase ID', right_on='Wormbase_Id')
    .drop(columns=columns_to_drop)
)

wormcat_new_desc_df

Unnamed: 0,Wormbase ID,WS289_Gene_name,Sequence ID,WS289_Gene_Type,Category 1,Category 2,Category 3,Automated Description,automated_description
0,WBGene00000001,aap-1,Y110A7A.10,protein_coding_gene,Signaling,Signaling: lipid,Signaling: lipid: kinase PI3,aap-1 is an ortholog of human PIK3R1 (phosphoi...,Enables protein kinase binding activity. Invol...
1,WBGene00000002,aat-1,F27C8.1,protein_coding_gene,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid,aat-1 is an ortholog of human SLC7A5 (solute c...,Contributes to L-amino acid transmembrane tran...
2,WBGene00000003,aat-2,F07C3.7,protein_coding_gene,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid,aat-2 is an ortholog of human SLC7A11 (solute ...,Predicted to enable L-amino acid transmembrane...
3,WBGene00000004,aat-3,F52H2.2,protein_coding_gene,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid,"aat-3 is an ortholog of human SLC7A5, SLC7A6 (...",Contributes to L-amino acid transmembrane tran...
4,WBGene00000005,aat-4,T13A10.10,protein_coding_gene,Transmembrane transport,Transmembrane transport: amino acid,Transmembrane transport: amino acid,aat-4 is an ortholog of human SLC7A11 (solute ...,Predicted to enable L-amino acid transmembrane...
...,...,...,...,...,...,...,...,...,...
32283,WBGene00306126,cone-1,Y54F10AM.16,protein_coding_gene,,,,,Predicted to be involved in intra-Golgi vesicl...
32284,WBGene00306131,,Y34B4A.20,protein_coding_gene,,,,,none available
32285,WBGene00306132,,F54D10.10,protein_coding_gene,,,,,Is affected by several genes including dpy-10;...
32286,WBGene00306133,azyx-1,F42G4.11,protein_coding_gene,,,,,Is affected by sir-2.1 based on microarray stu...


In [20]:
# Show the columns of the merged annotation table. It holds both 'Automated Description'
# (from the WormCat file) and 'automated_description' (from the functional descriptions).
column_names = wormcat_new_desc_df.columns.tolist()
column_names

['Wormbase ID',
 'WS289_Gene_name',
 'Sequence ID',
 'WS289_Gene_Type',
 'Category 1',
 'Category 2',
 'Category 3',
 'Automated Description',
 'automated_description']

In [21]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained sentence-embedding model that check_similarity uses to encode text.
# NOTE(review): presumably the weights are downloaded on first use — confirm.
model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

# Define the check_similarity function
def check_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded with the module-level ``model``. When either text is
    missing (as judged by ``pd.isna``), 0.0 is returned and nothing is encoded.
    """
    both_present = not (pd.isna(text1) or pd.isna(text2))
    if both_present:
        vectors = [model.encode(text, convert_to_tensor=True) for text in (text1, text2)]
        # pytorch_cos_sim yields a 1x1 matrix here; pull out its single value as a float.
        return util.pytorch_cos_sim(vectors[0], vectors[1])[0][0].item()
    return 0.0


# Score each row: similarity between 'Automated Description' and 'automated_description'.
# Rows where either description is missing get 0.0.
# NOTE(review): this encodes two texts per row, one row at a time — likely slow on the
# full table; consider batching the encode calls.
wormcat_new_desc_df['similarity_score'] = wormcat_new_desc_df.apply(lambda row: check_similarity(row['Automated Description'], row['automated_description']), axis=1)

In [22]:
# Final column layout: IDs, categories, both descriptions with their score, then gene metadata.
desired_column_order = [
    'Wormbase ID', 'Sequence ID',
    'Category 1', 'Category 2', 'Category 3',
    'Automated Description', 'automated_description', 'similarity_score',
    'WS289_Gene_name', 'WS289_Gene_Type',
]
# Reorder, then strip the release prefix from the two gene metadata columns.
wormcat_new_desc_df = (
    wormcat_new_desc_df[desired_column_order]
    .rename(columns={'WS289_Gene_name': 'Gene_name', 'WS289_Gene_Type': 'Gene_Type'})
)

In [23]:
# Write the new annotation list to CSV without the DataFrame index.
# NOTE(review): the version/date in the file name is hard-coded — update it when regenerating.
wormcat_new_desc_df.to_csv('./output_data/whole_genome_v3_aug-25-2023.csv', index=False)