
### Analyzing some RNA-seq data
We want to compare results across 3 genotypes (wt, unc-17, eat-4). 
We have 3 replicates for each. 

* unc-17 animals are deficient in acetylcholine release.
* eat-4 animals are deficient in glutamate release. 

We are hoping to see whether genes involved in __redox homeostasis__ are impacted in the mutant genotypes.

__Note:__ RNA-seq results for these animals aren’t currently available.

In [50]:
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns


In [51]:
%%bash
# Download and decompress the WormBase gene ID table for one release into ./input_data.
# Fail fast: stop on the first failing command, unset variable or failed pipeline
# stage, so a failed mkdir/cd/wget cannot be followed by work in the wrong directory.
set -euo pipefail

# Select Version from https://wormbase.org/ 
WORMBASE_VERSION="WS289"

base_dir="./input_data"

base_url="ftp://ftp.wormbase.org/pub/wormbase/releases/${WORMBASE_VERSION}/species/c_elegans/PRJNA13758/annotation"
gene_ids_txt="c_elegans.PRJNA13758.${WORMBASE_VERSION}.geneIDs.txt"
gene_ids_gz="${gene_ids_txt}.gz"

# Quote every expansion so paths containing spaces or glob characters stay intact.
mkdir -p "${base_dir}"
cd "${base_dir}"

# -nv prints a one-line summary; -O pins the output name so a leftover archive from
# an earlier run is overwritten instead of the download being saved as "<name>.1".
wget -nv -O "${gene_ids_gz}" "${base_url}/${gene_ids_gz}"
# -f overwrites a previously decompressed copy of the table.
gunzip -f "${gene_ids_gz}"


2023-09-25 07:51:31 URL: ftp://ftp.wormbase.org/pub/wormbase/releases/WS289/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS289.geneIDs.txt.gz [416537] -> "c_elegans.PRJNA13758.WS289.geneIDs.txt.gz" [1]


In [52]:
# Load the WormBase geneIDs table from the input directory into a DataFrame
WORMBASE_VERSION = "WS289"
base_dir = "./input_data"
gene_ids_txt = f"c_elegans.PRJNA13758.{WORMBASE_VERSION}.geneIDs.txt"

# The file is read without a header row, so the column names are supplied here
columns = [
    "Run_Code",
    "Wormbase_ID",
    "Gene_Name",
    "Sequence_ID",
    "Status",
    "Type",
]
gene_ids_df = pd.read_csv(
    f"{base_dir}/{gene_ids_txt}",
    header=None,
    names=columns,
    low_memory=False,
)
print(f"{gene_ids_df.shape[0]:,} rows of data")

52,109 rows of data


In [53]:
# Keep only the live genes for further evaluation.
# .copy() makes the filtered frame independent of the full table, so the in-place
# edits made to it in later cells do not raise SettingWithCopyWarning.
gene_ids_df = gene_ids_df[gene_ids_df['Status']=='Live'].copy()
print(f"{len(gene_ids_df):,} rows of data")
gene_ids_df

49,176 rows of data


Unnamed: 0,Run_Code,Wormbase_ID,Gene_Name,Sequence_ID,Status,Type
0,6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
1,6239,WBGene00000002,aat-1,F27C8.1,Live,protein_coding_gene
2,6239,WBGene00000003,aat-2,F07C3.7,Live,protein_coding_gene
3,6239,WBGene00000004,aat-3,F52H2.2,Live,protein_coding_gene
4,6239,WBGene00000005,aat-4,T13A10.10,Live,protein_coding_gene
...,...,...,...,...,...,...
52104,6239,WBGene00306126,cone-1,Y54F10AM.16,Live,protein_coding_gene
52105,6239,WBGene00306131,,Y34B4A.20,Live,protein_coding_gene
52106,6239,WBGene00306132,,F54D10.10,Live,protein_coding_gene
52107,6239,WBGene00306133,azyx-1,F42G4.11,Live,protein_coding_gene


In [54]:
# Replace NaN values in "Gene_Name" with values from "Sequence_ID".
# The filled column is built with assign() rather than by calling
# fillna(inplace=True) on gene_ids_df["Gene_Name"]: an in-place fill on a selected
# column is chained assignment, which warns in pandas 2.x and leaves the frame
# unchanged when Copy-on-Write is enabled.
gene_ids_df = gene_ids_df.assign(
    Gene_Name=gene_ids_df["Gene_Name"].fillna(gene_ids_df["Sequence_ID"])
)
print(f"{len(gene_ids_df):,} rows of data")
gene_ids_df

49,176 rows of data


Unnamed: 0,Run_Code,Wormbase_ID,Gene_Name,Sequence_ID,Status,Type
0,6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
1,6239,WBGene00000002,aat-1,F27C8.1,Live,protein_coding_gene
2,6239,WBGene00000003,aat-2,F07C3.7,Live,protein_coding_gene
3,6239,WBGene00000004,aat-3,F52H2.2,Live,protein_coding_gene
4,6239,WBGene00000005,aat-4,T13A10.10,Live,protein_coding_gene
...,...,...,...,...,...,...
52104,6239,WBGene00306126,cone-1,Y54F10AM.16,Live,protein_coding_gene
52105,6239,WBGene00306131,Y34B4A.20,Y34B4A.20,Live,protein_coding_gene
52106,6239,WBGene00306132,F54D10.10,F54D10.10,Live,protein_coding_gene
52107,6239,WBGene00306133,azyx-1,F42G4.11,Live,protein_coding_gene


In [55]:
# Remove "Run_Code" and "Status" from the gene table; the row count is unchanged
columns_to_drop = ["Run_Code", "Status"]
gene_ids_df = gene_ids_df.drop(columns_to_drop, axis="columns")
print(f"{gene_ids_df.shape[0]:,} rows of data")

49,176 rows of data


In [56]:
# Save the cleaned gene ID table as CSV in the output directory.
# The output directory is defined and created here because this is the first cell
# that writes to it; without this the cell fails with NameError on a fresh kernel.
output_data = "./output_data"
os.makedirs(output_data, exist_ok=True)

gene_ids_csv = f"c_elegans.PRJNA13758.{WORMBASE_VERSION}.geneIDs.csv"
gene_ids_df.to_csv(f"{output_data}/{gene_ids_csv}", index=False)

In [57]:
# Load the selected WormBase IDs (one unnamed column, no header row) into a frame
columns = ["Wormbase_ID"]
working_gene_ids_df = pd.read_csv(
    f"{base_dir}/wormbase_ids.txt",
    header=None,
    names=columns,
    low_memory=False,
)
print(f"{working_gene_ids_df.shape[0]:,} rows of data")

364 rows of data


In [58]:
# Left-join the gene annotations onto the selected IDs so every selected ID is kept
working_gene_ids_df = working_gene_ids_df.merge(
    gene_ids_df,
    how="left",
    on="Wormbase_ID",
)
print(f"{working_gene_ids_df.shape[0]:,} rows of data")

364 rows of data


In [59]:
# Display the merged table: the selected WormBase IDs with the columns joined from gene_ids_df
working_gene_ids_df

Unnamed: 0,Wormbase_ID,Gene_Name,Sequence_ID,Type
0,WBGene00019449,K06H6.1,K06H6.1,protein_coding_gene
1,WBGene00008477,clec-17,E03H4.10,protein_coding_gene
2,WBGene00015760,C14C6.6,C14C6.6,protein_coding_gene
3,WBGene00000749,col-176,ZC373.7,protein_coding_gene
4,WBGene00019623,K10C9.1,K10C9.1,protein_coding_gene
...,...,...,...,...
359,WBGene00010001,F53F8.4,F53F8.4,protein_coding_gene
360,WBGene00009340,best-14,F32G8.4,protein_coding_gene
361,WBGene00012545,Y37D8A.3,Y37D8A.3,protein_coding_gene
362,WBGene00007530,C11H1.5,C11H1.5,protein_coding_gene


In [60]:
# Ensure the output directory exists, then export the Sequence_ID column of the
# merged table as a single-column text file with a header line
output_data = "./output_data"
os.makedirs(output_data, exist_ok=True)
working_gene_ids_df["Sequence_ID"].to_csv(
    f"{output_data}/sequence_ids.txt",
    header=["Sequence_ID"],
    index=False,
)