# Removing MTRNR2Ls from Human Reference

We are going to remove the following transcripts from the human reference (from the fasta, transcript annotation and transcript length files).

  * MTRNR2L1-201  
  * MTRNR2L10-201   
  * MTRNR2L11-201  
  * MTRNR2L12-201 
  * MTRNR2L13-201
  * MTRNR2L3-201    
  * MTRNR2L4-201  
  * MTRNR2L5-201   
  * MTRNR2L6-201  
  * MTRNR2L7-201  
  * MTRNR2L8-201

In [1]:
import gzip

In [2]:
class FastaEntry:
    '''
    A single fasta record: a header and its nucleotide sequence.
    The string representation of the fasta entry consists of the
    header prefixed with ">" followed by the sequence written
    50 nts per line.
    Do not change the chunk size (= 50)
    TODO : Consider making header and sequence properties
    Also consider putting more checks here for the input
    '''
    def __init__(self , header , sequence ):
        '''
        header   : header text, stored without the leading ">"
        sequence : the sequence as one string (no line breaks)
        '''
        self.header   = header
        self.sequence = sequence

    def reverse_complement(self):
        '''
        Replaces self.sequence with its reverse complement, in place.
        Letter case is preserved and N / n are mapped to themselves.
        Returns None.

        Raises IOError if the sequence contains a character other than
        A, C, G, T, N (upper or lower case). In that case self.sequence
        is left unchanged.
        '''
        complements = {"A" : "T" , "a" : "t" ,
                       "C" : "G" , "c" : "g" ,
                       "G" : "C" , "g" : "c" ,
                       "T" : "A" , "t" : "a" ,
                       "N" : "N" , "n" : "n"}
        result = list()

        for nucleotide in reversed(self.sequence):
            try:
                result.append(complements[nucleotide])
            except KeyError:
                # A failed dictionary lookup raises KeyError (not IndexError),
                # so this is the exception that marks an invalid character.
                error_message = "Invalid character (%s) in the fasta sequence with header \n" \
                                "%s"%(nucleotide , self.header)
                raise IOError(error_message) from None

        # Assigned only after every character was translated successfully.
        self.sequence = "".join(result)

    def __str__(self ):
        '''
        Returns ">" + header on the first line, followed by the sequence
        split into lines of at most 50 characters. An empty sequence
        yields the header line only.
        '''
        chunk_size = 50 # Do not change this!
        lines      = [ '>' +  self.header ]
        lines.extend( self.sequence[ start : start + chunk_size ]
                      for start in range(0 , len(self.sequence) , chunk_size) )

        return("\n".join( lines ) )

############################################################################################


class FastaFile:
    '''
    This object is used to read fasta files into FastaEntry objects.
    For writing fasta files, we only need FastaEntry objects and using
    their str function, we can convert them to string and write to files.

    Paths ending with ".gz" are opened with gzip, any other path is
    opened as plain text. A false-y path (e.g. "" or None) reads from
    standard input.

    Iterating over the object yields the entries in file order. Only the
    first whitespace-separated token of a header line is kept as the
    entry header. The file is read sequentially, so it can be iterated
    only once.

    Note that it can be used as a context manager as well; the file is
    closed when the "with" block is left.
    '''

    def __init__(self , file):
        # Imported locally so that the class does not depend on a
        # module-level "sys" import being present.
        import sys

        # Set before opening so that close() / __del__ stay safe
        # even if opening the file raises.
        self.f            = None
        self._owns_stream = False

        if file:
            myopen = gzip.open if file.endswith(".gz") else open
            self.f = myopen(file , "rt")
            self._owns_stream = True
        else:
            # Standard input is borrowed: it is never closed by this object.
            self.f = sys.stdin

        self.current_header = ""
        self.current_sequence = list()

    #####################################################

    def __enter__(self):
        return self

    #####################################################

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so exceptions raised inside the "with" block propagate.
        self.close()

    #####################################################

    def close(self):
        '''
        Closes the underlying file if it was opened by this object.
        Calling it more than once is harmless.
        '''
        stream = getattr(self , "f" , None)
        if stream is not None and getattr(self , "_owns_stream" , False):
            stream.close()

    #####################################################

    @staticmethod
    def _parse_header(line):
        '''
        Returns the first whitespace-separated token after the ">".
        Raises ValueError for a header line without any token. An
        IndexError here would silently end a "for" loop driven by
        __getitem__, hiding the rest of the file.
        '''
        tokens = line[1:].split()
        if not tokens:
            raise ValueError("Fasta header line without an identifier: %r" % line)
        return tokens[0]

    #####################################################

    def _read_entry(self):
        '''
        Reads up to the next header line (or the end of the file) and
        returns the completed FastaEntry. Returns None once no entry is left.
        '''
        for raw_line in self.f:
            line = raw_line.strip()
            if not line:
                continue

            if line[0] != ">":
                self.current_sequence.append(line)
                continue

            next_header = self._parse_header(line)

            if not self.current_header:
                # First header of the file: nothing to emit yet.
                # Sequence lines seen before it are discarded.
                self.current_header = next_header
                self.current_sequence = list()
                continue

            this_entry = FastaEntry(header = self.current_header , sequence = "".join(self.current_sequence) )
            self.current_header = next_header
            self.current_sequence = list()
            return(this_entry)

        # End of file: emit the pending entry, if any. The header (not the
        # sequence) marks a pending entry, so a last record with an empty
        # sequence is returned just like one in the middle of the file.
        if self.current_header:
            this_entry = FastaEntry(header = self.current_header , sequence = "".join(self.current_sequence) )
            self.current_header = ""
            self.current_sequence = list()
            return(this_entry)

        return None

    ######################################################

    def __iter__(self):
        return self

    def __next__(self):
        this_entry = self._read_entry()
        if this_entry is None:
            raise StopIteration
        return this_entry

    ######################################################

    def __getitem__(self, index):
        '''
        Kept for backward compatibility with index-driven iteration.
        The index is ignored: each call returns the next entry in the file.
        Raises IndexError when no entry is left.
        '''
        this_entry = self._read_entry()
        if this_entry is None:
            raise IndexError
        return(this_entry)

    #########################################################

    def __del__(self):
        self.close()

In [3]:
# Names of the MTRNR2L transcripts targeted for removal.
# NOTE(review): the filtering cells visible in this notebook match on the
# "MTRNR2L" substring rather than reading this list; confirm it is used elsewhere.
genes_to_be_removed = [
    "MTRNR2L1-201",
    "MTRNR2L10-201",
    "MTRNR2L11-201",
    "MTRNR2L12-201",
    "MTRNR2L13-201",
    "MTRNR2L3-201",
    "MTRNR2L4-201",
    "MTRNR2L5-201",
    "MTRNR2L6-201",
    "MTRNR2L7-201",
    "MTRNR2L8-201",
]

## Fasta File 

First we list the transcripts that we will exclude

In [4]:
# Original reference (input) and the filtered copy that is written later on.
input_fasta_path  = "../transcriptome_old_v2/appris_human_v2_selected.fa.gz"
output_fasta_path = "appris_human_v2_after_filtering.fa.gz"

input_fasta = FastaFile(input_fasta_path)

# Print the header of every entry whose fifth "|"-separated header field
# contains "MTRNR2L".
for entry in input_fasta:
    name_field = entry.header.split("|")[4]
    if "MTRNR2L" in name_field:
        print(entry.header)

ENST00000604646.1|ENSG00000270188.1|OTTHUMG00000184992.1|OTTHUMT00000469412.1|MTRNR2L11-201|MTRNR2L11|1552|UTR5:1-949|CDS:950-1024|UTR3:1025-1552|
ENST00000600213.3|ENSG00000269028.3|OTTHUMG00000175726.3|OTTHUMT00000430905.3|MTRNR2L12-201|MTRNR2L12|1049|UTR5:1-963|CDS:964-1038|UTR3:1039-1049|
ENST00000604093.2|ENSG00000270394.4|OTTHUMG00000184994.3|OTTHUMT00000469414.3|MTRNR2L13-201|MTRNR2L13|1445|UTR5:1-925|CDS:926-1000|UTR3:1001-1445|
ENST00000604952.1|ENSG00000270672.1|OTTHUMG00000184977.2|OTTHUMT00000469394.2|MTRNR2L6-201|MTRNR2L6|1447|UTR5:1-959|CDS:960-1034|UTR3:1035-1447|
ENST00000544824.2|ENSG00000256892.2|OTTHUMG00000184979.1|OTTHUMT00000469397.1|MTRNR2L7-201|MTRNR2L7|1535|UTR5:1-930|CDS:931-1005|UTR3:1006-1535|
ENST00000512524.4|ENSG00000249860.4|OTTHUMG00000184965.2|OTTHUMT00000469382.2|MTRNR2L5-201|MTRNR2L5|1687|UTR5:1-858|CDS:859-933|UTR3:934-1687|
ENST00000536684.3|ENSG00000255823.5|OTTHUMG00000184980.2|OTTHUMT00000469398.2|MTRNR2L8-201|MTRNR2L8|1293|UTR5:1-960|CDS:961-10

In [5]:
# Re-open the input: a FastaFile is read sequentially and was consumed above.
input_fasta = FastaFile(input_fasta_path)

# Copy every entry to the gzip-compressed output, except those whose fifth
# "|"-separated header field contains "MTRNR2L".
with gzip.open(output_fasta_path, "wt") as output_stream:
    for entry in input_fasta:
        if "MTRNR2L" in entry.header.split("|")[4]:
            continue
        output_stream.write(str(entry) + "\n")

Now let's make sure that the output doesn't contain MTRNR2L transcripts:

In [6]:
output_fasta = FastaFile(output_fasta_path)

# Any header printed here belongs to an entry of the filtered file whose
# fifth "|"-separated header field still contains "MTRNR2L".
for entry in output_fasta:
    name_field = entry.header.split("|")[4]
    if "MTRNR2L" in name_field:
        print(entry.header)

## Annotation File

In [7]:
old_annotation_file = "../transcriptome_old_v2/appris_human_v2_actual_regions.bed"
new_annotation_file = "appris_human_v2_after_filtering_regions.bed"

# Copy the annotation line by line. A line is kept only if it has at least
# five whitespace-separated fields and its first field does not contain
# "MTRNR2L"; kept lines are written back unchanged.
with open(old_annotation_file, "rt") as input_stream,\
     open(new_annotation_file, "wt") as output_stream:
    for bed_line in input_stream:
        fields = bed_line.split()
        if len(fields) >= 5 and "MTRNR2L" not in fields[0]:
            output_stream.write(bed_line)

In [8]:
# Check: grep prints every line of the filtered annotation file that still contains "MTRNR2L".
! grep MTRNR2L appris_human_v2_after_filtering_regions.bed

## Lengths File

In [9]:
old_lengths_file = "../transcriptome_old_v2/appris_human_v2_transcript_lengths.tsv"
new_lengths_file = "appris_human_v2_after_filtering_t_lengths.tsv"


# Copy the lengths file line by line. A line is kept only if it has at least
# two whitespace-separated fields and its first field does not contain
# "MTRNR2L"; kept lines are written back unchanged.
with open(old_lengths_file, "rt") as input_stream,\
     open(new_lengths_file, "wt") as output_stream:
    for length_line in input_stream:
        fields = length_line.split()
        if len(fields) >= 2 and "MTRNR2L" not in fields[0]:
            output_stream.write(length_line)

In [10]:
# Check: grep prints every line of the filtered lengths file that still contains "MTRNR2L".
! grep MTRNR2L appris_human_v2_after_filtering_t_lengths.tsv