## This notebook takes the entire Saccharomyces cerevisiae genome as input and subtracts the essential genes, leaving the non-essential ones. It outputs degannotation-ne.dat, degaa-ne.dat, and degseq-ne.dat.

In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Constant values that the annotation cell writes into the #Organism,
# #DEG_AC and #Refseq columns of every output row.
organism = "Saccharomyces cerevisiae"
deg_ac = "DNEGSCER"
refseq = "559292"

# Column labels applied to the essential-gene annotation table.
names = [
    "#DEG_AC",
    "#Gene_Name",
    "#Gene_Ref",
    "#COG",
    "#Class",
    "#Function",
    "#Organism",
    "#Refseq",
    "#Condition",
]

# header=0 together with names= replaces the file's own header row with the
# labels above.
df = pd.read_csv(
    'processed_ess/degannotation-e.dat',
    sep="\t",
    names=names,
    header=0,
)
essential_gene_names = [*df["#Gene_Name"]]

In [2]:
# Helper functions
def create_new_fasta_record(seq, name, desc=""):
    """Wrap a sequence in a SeqRecord whose id and name are both ``name``.

    ``desc`` becomes the record description and defaults to an empty string.
    """
    record = SeqRecord(seq, id=name, name=name, description=desc)
    return record

def extract_substring_between(string_before, string_after, whole_string, strip_version=True):
    """Return the text found between two markers in ``whole_string``.

    The result starts right after the first occurrence of ``string_before``
    and ends just before the next occurrence of ``string_after``. When
    ``string_after`` does not occur, everything up to the end of the string
    is returned.

    Args:
        string_before: Marker that precedes the wanted text.
        string_after: Marker that terminates the wanted text.
        whole_string: Text to search.
        strip_version: When True (the default, matching the historical
            behaviour) everything from the first "." onwards is dropped, so
            a versioned identifier such as "AC011476.11" becomes "AC011476".
            Pass False for free-text values that may legitimately contain
            periods.

    Returns:
        The extracted text, or "-" when ``string_before`` is absent or the
        extracted text is empty.
    """
    _, marker, tail = whole_string.partition(string_before)
    if not marker:
        return "-"

    # The wanted value may be the last field of the string, in which case
    # there is no closing marker and the whole tail is the value.
    end_index = tail.find(string_after)
    value = tail if end_index == -1 else tail[:end_index]

    if strip_version:
        # Keep only the part before the version suffix (e.g. ".11").
        value = value.split(".", 1)[0]

    return value or "-"
    
def get_gene_name(gene_name, number, nzeros):
    """Append ``number``, zero-padded to ``nzeros`` characters, to ``gene_name``."""
    padded_number = str(number).zfill(nzeros)
    return gene_name + padded_number

# Generate the degseq-ne.dat file: subtract the essential dataset from the genome sequences, leaving the non-essential dataset

In [3]:
def subtract_and_write_fasta(minuend_path, subtrahend_path, difference_path, organism):
    """Write the FASTA records of the full dataset that are not essential genes.

    A record from ``minuend_path`` is written to ``difference_path`` when all
    of the following hold:

    * its description contains both "gene_biotype:protein_coding" and
      "transcript_biotype:protein_coding";
    * a gene symbol can be extracted from the "gene_symbol:" field;
    * that symbol is not listed in the "#Gene_Ref" column of the annotation
      rows whose "#Organism" equals ``organism``;
    * no earlier record with the same symbol has been written;
    * the sequence length is a multiple of 3 (a whole number of codons).

    The number of records written is printed when the pass is finished.

    Args:
        minuend_path: FASTA file holding the complete dataset.
        subtrahend_path: Tab-separated annotation table of essential genes.
        difference_path: Destination FASTA file (overwritten).
        organism: Value of "#Organism" selecting the annotation rows to use.
    """
    # Smaller dataset to subtract from the complete dataset.
    essential = pd.read_table(subtrahend_path)
    essential = essential.loc[essential["#Organism"] == organism]
    # Sets give constant-time membership tests inside the per-record loop.
    essential_gene_refs = set(essential["#Gene_Ref"])
    already_added = set()
    count = 0

    # Both handles are closed on exit, including when an error is raised.
    with open(minuend_path) as fasta_in, open(difference_path, "w") as f:
        for record in SeqIO.parse(fasta_in, "fasta"):
            desc = record.description

            if ("gene_biotype:protein_coding" not in desc
                    or "transcript_biotype:protein_coding" not in desc):
                continue

            # e.g. "gene_symbol:ABC1 description:..." -> "ABC1"
            gene_ref = extract_substring_between("gene_symbol:", " ", desc)
            if gene_ref == "-" or gene_ref in essential_gene_refs or gene_ref in already_added:
                continue

            # Only keep sequences made of complete codons.
            if len(record.seq) % 3 != 0:
                continue

            name = record.name.replace("_mRNA", "")
            already_added.add(gene_ref)
            count += 1
            SeqIO.write(create_new_fasta_record(record.seq, name, desc), f, "fasta")

    print(count)

In [4]:
# Build the non-essential DNA FASTA from the full CDS set minus the essential genes.
subtract_and_write_fasta(
    minuend_path="raw_data/scer.cds.all.sorted.fa",
    subtrahend_path="processed_ess/degannotation-e.dat",
    difference_path="processed_ness/degseq-ne.dat",
    organism=organism,
)

4020


# Generate degaa-ne.dat file

In [5]:
def translate_dna_seq(fasta_input_dna_file, fasta_output_path):
    """Translate every record of a DNA FASTA file and write the results as FASTA.

    Each output record keeps the name and description of its input record.
    The translation is requested with ``stop_symbol=""``, so stop codons
    contribute no character to the written sequence.

    Args:
        fasta_input_dna_file: Path of the DNA FASTA file to read.
        fasta_output_path: Path of the translated FASTA file (overwritten).
    """
    # Both handles are closed on exit, including when an error is raised.
    with open(fasta_input_dna_file) as dna_in, open(fasta_output_path, "w") as out:
        for record in SeqIO.parse(dna_in, "fasta"):
            aa_seq = record.seq.translate(stop_symbol="")
            translated = create_new_fasta_record(aa_seq, record.name, record.description)
            SeqIO.write(translated, out, "fasta")
            
# Translate the non-essential DNA sequences into the amino-acid FASTA.
translate_dna_seq(
    fasta_input_dna_file="processed_ness/degseq-ne.dat",
    fasta_output_path="processed_ness/degaa-ne.dat",
)

# Generate degannotation-ne.dat file

In [6]:
# Build the annotation table for the non-essential genes: one row per record
# of the non-essential DNA FASTA file.
gene_names = []
gene_refs = []
functions = []

# The handle is closed on exit, including when an error is raised.
with open("processed_ness/degseq-ne.dat") as fasta_in:
    for record in SeqIO.parse(fasta_in, 'fasta'):
        desc = record.description

        gene_names.append(record.name)
        gene_refs.append(extract_substring_between("gene_symbol:", " ", desc))

        # NOTE(review): extract_substring_between drops everything from the
        # first "." onwards, so a description containing a period is cut at
        # that point - confirm this truncation is intended for #Function.
        functions.append(extract_substring_between("description:", "[", desc))

# Columns holding the same value on every row.
num_rows = len(gene_names)
deg_acs = [deg_ac] * num_rows
organisms = [organism] * num_rows
ref_seqs = [refseq] * num_rows
dashes = ["-"] * num_rows  # placeholder for #COG, #Class and #Condition

cols = list(zip(deg_acs, gene_names, gene_refs, dashes, dashes, functions, organisms, ref_seqs, dashes))
new_df = pd.DataFrame(cols, columns=["#DEG_AC", "#Gene_Name", "#Gene_Ref", "#COG", "#Class",
                                     "#Function", "#Organism", "#Refseq", "#Condition"])

new_df.to_csv("processed_ness/degannotation-ne.dat", sep="\t", index=False)