# Read WormBase Gene IDs File
---

Read the gene IDs from the WormBase FTP repository and save a subset of the data for downstream testing.

In [None]:
import pandas as pd

In [None]:
# Set the output directory and create it if it does not exist.
# exist_ok=True makes the call a no-op when the directory is already there,
# so the cell can be re-run safely and there is no check-then-create race.
import os
output_dir = "./output_data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Pull down the gene ids from Wormbase and unzip
!wget -P ./input_data ftp://ftp.wormbase.org/pub/wormbase/releases/WS287/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS287.geneIDs.txt.gz
!gunzip ./input_data/c_elegans.PRJNA13758.WS287.geneIDs.txt.gz

In [None]:
# The raw geneIDs file carries no header row, so the column names are
# supplied explicitly when the file is parsed.
INPUT_DATA = './input_data'
gene_id_file = f'{INPUT_DATA}/c_elegans.PRJNA13758.WS287.geneIDs.txt'

# "junk" is a placeholder name for the first field; it is dropped again
# before the results are saved.
columns = [
    "junk",
    "Wormbase_Id",
    "Gene_name",
    "Sequence_id",
    "Status",
    "Gene_Type",
]
gene_id_df = pd.read_csv(
    gene_id_file,
    names=columns,
    header=None,
    low_memory=False,
)

In [None]:
# Report how many records were loaded, then show the number of records per
# Status value (the last expression is what the notebook displays).
print(f'There are {len(gene_id_df):,} Genes in the file')
gene_id_df['Status'].value_counts()

In [None]:
# Keep only the rows whose Status is "Live", i.e. the actively used gene IDs,
# and show how many remain.
gene_id_live_df = gene_id_df.loc[gene_id_df['Status'] == 'Live']
len(gene_id_live_df)

In [None]:
# Show how many of the "Live" genes fall under each Gene_Type value.
gene_id_live_df.loc[:, 'Gene_Type'].value_counts()

In [None]:
# This experiment only needs the protein-coding genes, so narrow the "Live"
# set down to Gene_Type == 'protein_coding_gene' and show the row count.
gene_id_live__pc_df = gene_id_live_df.loc[
    gene_id_live_df['Gene_Type'] == 'protein_coding_gene'
]
len(gene_id_live__pc_df)

In [None]:
# Remove the columns that are no longer needed: the placeholder first field
# and the two columns that were only used for filtering.
gene_id_live__pc_df = gene_id_live__pc_df.drop(
    columns=['junk', 'Gene_Type', 'Status'],
)
gene_id_live__pc_df

In [None]:
# Write the WormBase protein-coding gene IDs to a CSV file in the output
# directory; index=False leaves the DataFrame index out of the file.
gene_id_live__pc_df.to_csv(
    f'{output_dir}/WB_protein_coding_genes.csv',
    index=False,
)