# Add wormbase IDs to a gene set

In [1]:
import os
import time
from datetime import datetime
import math
import requests
import json
import csv
import psutil


In [2]:
# Get the most current Wormbase DB
def current_wormbase_version():
    """Return the current WormBase release name from the WormBase REST API.

    Returns:
        The value of the ``data`` field of the JSON response (the recorded
        notebook output for this call is ``'WS291'``), or the dict
        ``{'error': 'something is not right'}`` when the server answers with
        any HTTP status other than 200.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the server does not answer within the timeout.
    """
    api_url = 'http://rest.wormbase.org/rest/database/version'
    # A timeout keeps the notebook from hanging forever on a stalled server.
    response = requests.get(api_url, timeout=30)
    # Check the status BEFORE parsing: error pages are usually not JSON, and
    # parsing them first would raise instead of returning the error dict.
    if response.status_code != 200:
        return {'error': 'something is not right'}
    return response.json()['data']
    
current_wormbase_version()

'WS291'

In [3]:
%%bash
# Pull down data from Wormbase and unzip
# INPUT_DATA is the directory the functions below download into (wget -P)
# and read the extracted files from.
INPUT_DATA="./input_data"

# Download one gzipped C. elegans (PRJNA13758) annotation file for a WormBase
# release into $INPUT_DATA and decompress it in place.
#   $1  WormBase release name, e.g. WS291
#   $2  file suffix including .gz, e.g. geneIDs.txt.gz
# Returns non-zero if an argument is missing, the download fails, or gunzip fails.
get_wormbase_data() {
    local WORMBASE_VERSION="$1"
    local FILE_ROOT="$2"
    local BASE_FTP="ftp://ftp.wormbase.org/pub/wormbase/releases"
    local SPECIES_DIR="species/c_elegans/PRJNA13758/annotation"
    local FILE_PREFIX="c_elegans.PRJNA13758"
    local FILE_NAME="${FILE_PREFIX}.${WORMBASE_VERSION}.${FILE_ROOT}"

    if [ -z "$WORMBASE_VERSION" ] || [ -z "$FILE_ROOT" ]; then
        echo "usage: get_wormbase_data <wormbase_version> <file_root>" >&2
        return 1
    fi

    # Stop here when the download fails; otherwise gunzip would be run on a
    # file that is missing or left over from an earlier attempt.
    wget -nv -P "${INPUT_DATA}" "${BASE_FTP}/${WORMBASE_VERSION}/${SPECIES_DIR}/${FILE_NAME}" || return 1
    gunzip -f "${INPUT_DATA}/${FILE_NAME}"
}

# Pull down geneIDs.txt for the WormBase release named in $1 (e.g. WS291).
# The return status is that of get_wormbase_data.
get_geneids() {
    local WORMBASE_VERSION="$1"
    # Quoted so an empty or whitespace-containing value is passed as ONE argument.
    get_wormbase_data "$WORMBASE_VERSION" "geneIDs.txt.gz"
}

# Pull down functional_descriptions.txt for the WormBase release named in $1.
# The return status is that of get_wormbase_data.
get_functional_descriptions() {
    local WORMBASE_VERSION="$1"
    # Quoted so an empty or whitespace-containing value is passed as ONE argument.
    get_wormbase_data "$WORMBASE_VERSION" "functional_descriptions.txt.gz"
}

# Convert $INPUT_DATA/c_elegans.PRJNA13758.<version>.geneIDs.txt into a CSV
# next to it (same name, .csv extension).
#   $1  WormBase release name, e.g. WS291
# Only rows whose 5th comma-separated field is "Live" are kept; fields 2, 3, 4
# and 6 are written under the header Wormbase_Id,Gene_name,Sequence_id,Gene_Type.
# Returns non-zero (and writes nothing) when the input file cannot be read.
create_geneids_csv() {
    local WORMBASE_VERSION="$1"
    local FILE_PREFIX="c_elegans.PRJNA13758"
    local FILE_ROOT="geneIDs.txt"
    local gene_ids_txt="${INPUT_DATA}/${FILE_PREFIX}.${WORMBASE_VERSION}.${FILE_ROOT}"
    # Swap the .txt extension for .csv
    local gene_ids_csv="${gene_ids_txt%.txt}.csv"

    if [ ! -r "$gene_ids_txt" ]; then
        echo "create_geneids_csv: cannot read ${gene_ids_txt}" >&2
        return 1
    fi

    # Write the header first and append the filtered rows in the same
    # redirection. Unlike `sed -i '1i…'` this is portable beyond GNU sed and
    # still emits the header when no row is "Live".
    {
        echo 'Wormbase_Id,Gene_name,Sequence_id,Gene_Type'
        # Drop the first column and only include Live genes
        awk -F',' '$5=="Live" {print $2","$3","$4","$6}' "$gene_ids_txt"
    } > "$gene_ids_csv"
    echo "created $gene_ids_csv"
}

# Download the gene ID table for one WormBase release and convert it to CSV.
# The release is named once here so the download and the conversion cannot
# drift apart; the CSV step is skipped when the download step fails.
WORMBASE_VERSION="WS291"
get_geneids "$WORMBASE_VERSION" && create_geneids_csv "$WORMBASE_VERSION"

# Get the functional descriptions of genes for the same WormBase version
# This is cool but we use the API as it has more data
#get_functional_descriptions "$WORMBASE_VERSION"


2024-03-13 10:12:41 URL: ftp://ftp.wormbase.org/pub/wormbase/releases/WS291/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS291.geneIDs.txt.gz [416663] -> "./input_data/c_elegans.PRJNA13758.WS291.geneIDs.txt.gz" [1]


created ./input_data/c_elegans.PRJNA13758.WS291.geneIDs.csv


In [28]:
import pandas as pd

# Load the WormBase ID <-> sequence ID lookup table written by the bash cell.
# base_dir carries no trailing slash, so the joined path has no doubled '/'.
base_dir = './input_data'
wormbase_version = 'WS291'
lookup_columns = ['Wormbase_Id', 'Sequence_id']
# usecols: only the two columns needed for the merge are parsed.
wormbase_ids_df = pd.read_csv(
    f"{base_dir}/c_elegans.PRJNA13758.{wormbase_version}.geneIDs.csv",
    usecols=lookup_columns,
)
wormbase_ids_df = wormbase_ids_df[lookup_columns]
wormbase_ids_df.head()

Unnamed: 0,Wormbase_Id,Sequence_id
0,WBGene00000001,Y110A7A.10
1,WBGene00000002,F27C8.1
2,WBGene00000003,F07C3.7
3,WBGene00000004,F52H2.2
4,WBGene00000005,T13A10.10


In [29]:
# Open the workbook holding the candidate gene lists and show its sheet names.
# NOTE: set excel_root to the workbook's file name without the .xlsx extension.
input_dir = "./input_data"
excel_root = "AMYS_LIST"
xlsx_file_nm = "/".join([input_dir, excel_root]) + ".xlsx"
genes_xlsx = pd.ExcelFile(xlsx_file_nm)
sheet_names = genes_xlsx.sheet_names
sheet_names

['sams-1 ALL genes',
 'sams-1 UP',
 'sams-1 DOWN',
 'set-2 ALL',
 'set-2 UP',
 'set-2 DOWN',
 'set-16 ALL',
 'set-16 UP',
 'set-16 DOWN']

In [30]:
# Build a dictionary mapping each sheet name to that sheet's DataFrame.
# The sheets are parsed from the already-open workbook (genes_xlsx) instead of
# re-opening the .xlsx file from disk once per sheet.
genes_dfs = {
    sheet_name: genes_xlsx.parse(sheet_name=sheet_name)
    for sheet_name in sheet_names
}


In [86]:
# NOTE: Change gene_set_nm!! It must be one of the workbook's sheet names.
gene_set_nm = 'sams-1 DOWN'
if gene_set_nm not in genes_dfs:
    # Same exception type as a plain dict lookup, but it lists the valid names
    # so a typo in gene_set_nm is easy to spot.
    raise KeyError(f"{gene_set_nm!r} is not a sheet; choose one of {list(genes_dfs)}")
gene_set_df = genes_dfs[gene_set_nm]
gene_set_df.head()

Unnamed: 0,ID,Control (1),Control (2),sams-1 (1),sams-1 (2),padj,log2FoldChange,pvalue,foldChange,log10padj
0,T24B8.5,1758.141792,2406.021086,71.492368,83.648968,2.33625e-43,-4.715853,1.013558e-46,0.038053,42.631481
1,Y19D10B.7,508.698761,583.643646,0.029845,0.087327,2.772048e-30,-7.809543,2.4052480000000003e-33,0.004458,29.557199
2,F15E11.15b,2240.374097,3453.791723,70.485389,15.380046,9.358898999999999e-30,-5.652186,1.044067e-32,0.019885,29.028775
3,F15E11.12a,443.924933,641.613937,13.11805,2.129393,2.28913e-24,-5.598138,4.256208e-27,0.020644,23.640329
4,F15E11.1,2534.332113,3795.820544,117.284369,26.07917,8.425388999999999e-24,-5.115304,1.7754149999999998e-26,0.02885,23.07441


In [87]:
# Adjust sequence id to see if you can find more wormbase ids
def adjust_id(row):
    """Return the next, less specific sequence ID to try for one row.

    Args:
        row: a mapping (e.g. a DataFrame row) with keys ``'ID'`` (the sequence
            ID tried last) and ``'Wormbase_Id_sav'`` (the WormBase ID found for
            it, or a missing value).

    Returns:
        * ``row['ID']`` unchanged when the row already has a hit, i.e. the
          saved WormBase ID is 14 characters long (like ``WBGene00011979``).
        * ``row['ID']`` without its last character when it ends in a lowercase
          letter that directly follows a digit (``F15E11.15b`` -> ``F15E11.15``).
        * otherwise ``row['ID']`` cut at its last ``'.'``
          (``F15E11.15`` -> ``F15E11``).
        * ``row['ID']`` unchanged when nothing is left to strip (no ``'.'``,
          empty, or not a string), so repeated application reaches a fixed
          point instead of eating the ID one character at a time.
    """
    seq_id = row['ID']

    # The ID was a hit, just copy it over.
    if len(str(row['Wormbase_Id_sav'])) == 14:
        return seq_id

    # Nothing sensible can be stripped from a missing or empty ID.
    if not isinstance(seq_id, str) or not seq_id:
        return seq_id

    # Suffix letter after the trailing number: drop just the letter. Any
    # lowercase letter is accepted, not only a-d.
    if len(seq_id) >= 2 and 'a' <= seq_id[-1] <= 'z' and seq_id[-2].isdigit():
        return seq_id[:-1]

    # Otherwise remove the trailing ".number" part. rfind() gives -1 when
    # there is no '.', which must NOT be used as a slice end (it would silently
    # chop the last character).
    last_dot = seq_id.rfind('.')
    if last_dot == -1:
        return seq_id
    return seq_id[:last_dot]
    

In [88]:
def process_ids(ids_df, lookup_df=None):
    """Look up a WormBase ID for every row of ``ids_df`` via its ``ID_adj`` column.

    Args:
        ids_df: DataFrame with at least the columns ``ID`` and ``ID_adj``.
            ``Sequence_id`` / ``Wormbase_Id_sav`` columns left over from an
            earlier call are discarded first, so the result of one call can be
            fed straight into the next.
        lookup_df: DataFrame with the columns ``Wormbase_Id`` and
            ``Sequence_id``. Defaults to the notebook-level ``wormbase_ids_df``.

    Returns:
        The left-merged DataFrame with ``ID`` overwritten by ``ID_adj`` and the
        matched WormBase ID stored in ``Wormbase_Id_sav`` (missing when there
        was no match), followed by the matched ``Sequence_id``. If the lookup
        lists the same ``Sequence_id`` more than once, each match produces its
        own output row, so the result can be longer than ``ids_df``.

    Side effects:
        Prints the number of matched and unmatched rows.
    """
    if lookup_df is None:
        lookup_df = wormbase_ids_df

    # Remove the results of a previous pass; errors='ignore' covers the first
    # pass, where these columns do not exist yet.
    ids_df = ids_df.drop(columns=['Sequence_id', 'Wormbase_Id_sav'], errors='ignore')

    # pandas matches missing keys to missing keys in a merge, so lookup rows
    # without a Sequence_id are removed to keep them from "matching" rows whose
    # ID_adj is missing.
    lookup_df = lookup_df[['Wormbase_Id', 'Sequence_id']].dropna(subset=['Sequence_id'])

    merged_df = ids_df.merge(lookup_df, left_on='ID_adj', right_on='Sequence_id', how='left')

    is_hit = merged_df['Wormbase_Id'].notna()
    hits_df = merged_df[is_hit]
    misses_df = merged_df[~is_hit]
    print(f"{len(hits_df)=} {len(misses_df)=}")

    # The adjusted ID becomes the ID that the next adjustment pass starts from.
    merged_df['ID'] = merged_df['ID_adj']
    return merged_df.rename(columns={'Wormbase_Id': 'Wormbase_Id_sav'})


In [89]:
# First pass: try every ID exactly as it appears in the sheet.
# assign() returns a new frame, so the sheet cached in genes_dfs (and shown
# above) is not modified and this cell gives the same result when re-run.
adjusted_df = process_ids(gene_set_df.assign(ID_adj=gene_set_df['ID']))
adjusted_df.head()

len(hits_df)=1490 len(misses_df)=831


Unnamed: 0,ID,Control (1),Control (2),sams-1 (1),sams-1 (2),padj,log2FoldChange,pvalue,foldChange,log10padj,ID_adj,Wormbase_Id_sav,Sequence_id
0,T24B8.5,1758.141792,2406.021086,71.492368,83.648968,2.33625e-43,-4.715853,1.013558e-46,0.038053,42.631481,T24B8.5,WBGene00011979,T24B8.5
1,Y19D10B.7,508.698761,583.643646,0.029845,0.087327,2.772048e-30,-7.809543,2.4052480000000003e-33,0.004458,29.557199,Y19D10B.7,WBGene00021236,Y19D10B.7
2,F15E11.15b,2240.374097,3453.791723,70.485389,15.380046,9.358898999999999e-30,-5.652186,1.044067e-32,0.019885,29.028775,F15E11.15b,,
3,F15E11.12a,443.924933,641.613937,13.11805,2.129393,2.28913e-24,-5.598138,4.256208e-27,0.020644,23.640329,F15E11.12a,,
4,F15E11.1,2534.332113,3795.820544,117.284369,26.07917,8.425388999999999e-24,-5.115304,1.7754149999999998e-26,0.02885,23.07441,F15E11.1,WBGene00017490,F15E11.1


In [92]:
# Repeatedly loosen the IDs that are still unmatched (adjust_id) and look them
# up again (process_ids) until the number of unmatched rows is the same on two
# consecutive passes. This replaces re-running the cell by hand, so the
# notebook gives the same result under "Restart & Run All".
MAX_PASSES = 10  # safety stop in case the miss count never settles
previous_misses = None
for _ in range(MAX_PASSES):
    adjusted_df['ID_adj'] = adjusted_df.apply(adjust_id, axis=1)
    adjusted_df = process_ids(adjusted_df)
    current_misses = int(adjusted_df['Wormbase_Id_sav'].isna().sum())
    if current_misses == previous_misses:
        break
    previous_misses = current_misses
else:
    # Reached only when the loop ran MAX_PASSES times without breaking.
    print(f"WARNING: miss count did not converge within {MAX_PASSES} passes")
adjusted_df.head()

len(hits_df)=2233 len(misses_df)=95


Unnamed: 0,ID,Control (1),Control (2),sams-1 (1),sams-1 (2),padj,log2FoldChange,pvalue,foldChange,log10padj,ID_adj,Wormbase_Id_sav,Sequence_id
0,T24B8.5,1758.141792,2406.021086,71.492368,83.648968,2.33625e-43,-4.715853,1.013558e-46,0.038053,42.631481,T24B8.5,WBGene00011979,T24B8.5
1,Y19D10B.7,508.698761,583.643646,0.029845,0.087327,2.772048e-30,-7.809543,2.4052480000000003e-33,0.004458,29.557199,Y19D10B.7,WBGene00021236,Y19D10B.7
2,F15E11.15,2240.374097,3453.791723,70.485389,15.380046,9.358898999999999e-30,-5.652186,1.044067e-32,0.019885,29.028775,F15E11.15,WBGene00017501,F15E11.15
3,F15E11.12,443.924933,641.613937,13.11805,2.129393,2.28913e-24,-5.598138,4.256208e-27,0.020644,23.640329,F15E11.12,WBGene00017498,F15E11.12
4,F15E11.1,2534.332113,3795.820544,117.284369,26.07917,8.425388999999999e-24,-5.115304,1.7754149999999998e-26,0.02885,23.07441,F15E11.1,WBGene00017490,F15E11.1


In [84]:
# # DID YOU CONVERGE THE DATA!!!
# from IPython.display import display, Javascript

# # Define the message to display
# message = "DID YOU CONVERGE THE DATA!!!"

# # Create and display the JavaScript to show the message
# display(Javascript(f"alert('{message}')"))

<IPython.core.display.Javascript object>

In [93]:
# Give the columns their final names and move the ID columns to the front:
#   ID              -> Sequence_id      (the sequence ID the lookup ended with)
#   ID_adj          -> Sequence_id_adj
#   Wormbase_Id_sav -> ID               (the WormBase ID)
# The rename is applied only while 'Wormbase_Id_sav' still exists, so running
# this cell again leaves an already-renamed frame untouched instead of dropping
# the sequence IDs and relabelling the WormBase IDs.
if 'Wormbase_Id_sav' in adjusted_df.columns:
    adjusted_df = (
        adjusted_df
        # The merge's own Sequence_id column is replaced by the renamed ID.
        .drop(columns=['Sequence_id'], errors='ignore')
        .rename(columns={
            'ID': 'Sequence_id',
            'ID_adj': 'Sequence_id_adj',
            'Wormbase_Id_sav': 'ID',
        })
    )

leading_columns = ['ID', 'Sequence_id', 'Sequence_id_adj']
columns = leading_columns + [col for col in adjusted_df.columns if col not in leading_columns]
adjusted_df = adjusted_df[columns]
adjusted_df

Unnamed: 0,ID,Sequence_id,Sequence_id_adj,Control (1),Control (2),sams-1 (1),sams-1 (2),padj,log2FoldChange,pvalue,foldChange,log10padj
0,WBGene00011979,T24B8.5,T24B8.5,1758.141792,2406.021086,71.492368,83.648968,2.336250e-43,-4.715853,1.013558e-46,0.038053,42.631481
1,WBGene00021236,Y19D10B.7,Y19D10B.7,508.698761,583.643646,0.029845,0.087327,2.772048e-30,-7.809543,2.405248e-33,0.004458,29.557199
2,WBGene00017501,F15E11.15,F15E11.15,2240.374097,3453.791723,70.485389,15.380046,9.358899e-30,-5.652186,1.044067e-32,0.019885,29.028775
3,WBGene00017498,F15E11.12,F15E11.12,443.924933,641.613937,13.118050,2.129393,2.289130e-24,-5.598138,4.256208e-27,0.020644,23.640329
4,WBGene00017490,F15E11.1,F15E11.1,2534.332113,3795.820544,117.284369,26.079170,8.425389e-24,-5.115304,1.775415e-26,0.028850,23.074410
...,...,...,...,...,...,...,...,...,...,...,...,...
2323,WBGene00011000,R03G8.6,R03G8.6,641.282227,407.404066,432.750932,118.278570,1.170874e-01,-0.980753,5.429885e-02,0.506715,0.931490
2324,WBGene00219575,D1046.17,D1046.17,194.799725,222.669824,147.646577,96.235964,1.173728e-01,-0.865055,5.445631e-02,0.549025,0.930433
2325,WBGene00008684,F11A10.3,F11A10.3,0.041291,40.317686,0.010830,0.031515,1.179829e-01,-1.567710,5.488562e-02,0.337343,0.928181
2326,WBGene00016346,C33E10.2,C33E10.2,159.749299,136.211451,103.663821,41.853206,1.180598e-01,-1.045454,5.495309e-02,0.484493,0.927898


In [94]:
# Write the rows that received a WormBase ID to
# ./output_data/<excel_root>_<gene_set_nm>.csv (spaces -> '_', lower-cased).
base_dir = './output_data'
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(base_dir, exist_ok=True)
out_file_nm = f"{excel_root}_{gene_set_nm}"
out_file_nm = out_file_nm.replace(' ', '_').lower()
out_path = f"{base_dir}/{out_file_nm}.csv"
# Rows without a WormBase ID are left out of the exported file.
adjusted_df = adjusted_df.dropna(subset=['ID'])
adjusted_df.to_csv(out_path, index=False)
print(out_path)

./output_data/amys_list_sams-1_down.csv
