# Reverse Translation
Frederick Abi Chahine

In [1]:
#!/usr/bin/env python

#Final version of part 2

#Program Description:
#This program takes a peptide sequence as user input and displays every mRNA sequence that can encode that
#peptide, together with the %GC of each mRNA, ordered from the lowest % to the highest %. The program then finds
#the mRNAs whose %GC is closest to 50% and displays them as the most probable mRNAs.
#The program terminates after displaying the execution time.

import re                     #regex will be used to ensure that the user input a proper peptide sequence.
from datetime import datetime #will be used in order to display the run time

def generateAllPossibilities(pointer, matrix, mrna_string, possibilities_list):
    """Collect every mRNA string obtainable by picking one codon per matrix row.

    The rows of ``matrix`` are visited in order starting at index ``pointer``;
    ``mrna_string`` is the prefix built from the rows already consumed. Each
    completed string is appended to ``possibilities_list`` (which is mutated
    in place), and that same list is returned.
    """
    # Every row has contributed a codon: the prefix is a complete mRNA.
    if pointer == len(matrix):
        possibilities_list.append(mrna_string)
        return possibilities_list

    # Branch on each codon of the current row and recurse into the next row.
    for codon in matrix[pointer]:
        generateAllPossibilities(pointer + 1, matrix, mrna_string + codon, possibilities_list)

    return possibilities_list

def reverseTranscribe(pro_seq):
    """Return, for each residue of ``pro_seq``, the list of codons encoding it.

    ``pro_seq`` is a string of one-letter amino-acid codes. The result is a
    list with one sub-list per residue, in the same order as the input; each
    sub-list holds every codon from the table below whose value equals that
    residue, in table order. A residue that is not a value in the table gets
    an empty sub-list. Every sub-list is a fresh list object.
    """
    codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
                 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
                 "UAU":"Y", "UAC":"Y", "UAA":"STOP", "UAG":"STOP",
                 "UGU":"C", "UGC":"C", "UGA":"STOP", "UGG":"W",
                 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
                 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
                 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
                 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
                 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
                 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
                 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
                 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
                 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
                 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
                 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
                 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G"}

    # Invert the table once (amino acid -> codons, in table order) so each
    # residue is a single lookup instead of a scan over all 64 codons.
    codons_for = {}
    for codon, amino_acid in codon_map.items():
        codons_for.setdefault(amino_acid, []).append(codon)

    # Copy each codon list so repeated residues do not share one list object.
    return [list(codons_for.get(residue, ())) for residue in pro_seq]

def computePercentGC(all_mrnas):
    """Compute the %GC of every mRNA and return both lists sorted by %GC.

    For each string in ``all_mrnas`` the %GC is the share of 'G' and 'C'
    characters, as a percentage rounded to two decimals. An empty string has
    no bases and is reported as 0.0 rather than dividing by zero.

    Returns ``(mrnas, percents)``: two lists of equal length in which
    ``percents[i]`` is the %GC of ``mrnas[i]``. Pairs are ordered by
    ascending %GC, with ties ordered by the mRNA string itself. An empty
    input yields two empty lists. The input list is not modified.
    """
    # Nothing to score: return two empty lists (unpacking zip(*[]) would fail).
    if not all_mrnas:
        return [], []

    scored = []  # (%GC, mRNA) pairs, so one sort keeps both values aligned
    for mrna in all_mrnas:
        if mrna:
            gc_per = round(((mrna.count('G') + mrna.count('C')) / len(mrna)) * 100, 2)
        else:
            gc_per = 0.0  # empty sequence: avoid ZeroDivisionError
        scored.append((gc_per, mrna))

    # Tuples compare by %GC first, then by the mRNA string.
    scored.sort()

    sorted_mrnas = [mrna for _, mrna in scored]
    sorted_percents = [gc_per for gc_per, _ in scored]
    return sorted_mrnas, sorted_percents
    
def main():
    """Run the interactive reverse-translation session.

    Prompts for a peptide sequence until the input is non-empty and contains
    only the upper-case letters ACDEFGHIKLMNPQRSTVWY. It then prints every
    candidate mRNA with its %GC in the order returned by computePercentGC,
    the number of candidates, the candidate(s) whose %GC is closest to 50,
    and finally the elapsed run time.
    """
    pro_seq = input("Enter protein sequence: ")  # accepted letters: ACDEFGHIKLMNPQRSTVWY

    # Re-prompt on an empty sequence as well as on any character outside the
    # accepted upper-case letters: an empty peptide has no mRNA to score and
    # would otherwise crash the %GC computation further down.
    while not pro_seq or re.search(r'[^AC-IK-NP-TVWY]', pro_seq):
        print("\n**You entered an invalid letter/character in your sequence. Only input \"ACDEFGHIKLMNPQRSTVWYs\".**\n")
        pro_seq = input("Enter protein sequence:")

    start = datetime.now()                                    # reference point for the run time printed at the end
    matrix = reverseTranscribe(pro_seq)                       # one list of codons per residue
    all_mrnas = generateAllPossibilities(0, matrix, "", [])   # every codon combination, as strings
    all_mrnas, percent_GC = computePercentGC(all_mrnas)       # both lists aligned and sorted by %GC
    print()                                                   # blank line to improve display

    for mrna, gc_per in zip(all_mrnas, percent_GC):
        print(mrna, ": %GC =", gc_per)

    print("\n---> All possible combinations:", len(all_mrnas), "\n")
    print("---> The most probable mRNA sequence(s) is/are: (The ones with %GC closest to 50%)\n")

    # The smallest distance from 50 defines "closest"; every mRNA at exactly
    # that distance is reported, in the same order as the listing above.
    smallest_gap = min(abs(gc_per - 50) for gc_per in percent_GC)
    for mrna, gc_per in zip(all_mrnas, percent_GC):
        if abs(gc_per - 50) == smallest_gap:
            print(mrna, ": %GC =", gc_per)

    print('\n( Run Time: ', datetime.now()-start, ")")        # elapsed time since validation finished

# Run the interactive session only when executed as a script or notebook cell,
# not when this module is imported.
if __name__ == "__main__":
    main()

Enter protein sequence: sATTS

**You entered an invalid letter/character in your sequence. Only input "ACDEFGHIKLMNPQRSTVWYs".**

Enter protein sequence:SATTS

AGUGCAACAACAAGU : %GC = 40.0
AGUGCAACAACAUCA : %GC = 40.0
AGUGCAACAACAUCU : %GC = 40.0
AGUGCAACAACUAGU : %GC = 40.0
AGUGCAACAACUUCA : %GC = 40.0
AGUGCAACAACUUCU : %GC = 40.0
AGUGCAACUACAAGU : %GC = 40.0
AGUGCAACUACAUCA : %GC = 40.0
AGUGCAACUACAUCU : %GC = 40.0
AGUGCAACUACUAGU : %GC = 40.0
AGUGCAACUACUUCA : %GC = 40.0
AGUGCAACUACUUCU : %GC = 40.0
AGUGCUACAACAAGU : %GC = 40.0
AGUGCUACAACAUCA : %GC = 40.0
AGUGCUACAACAUCU : %GC = 40.0
AGUGCUACAACUAGU : %GC = 40.0
AGUGCUACAACUUCA : %GC = 40.0
AGUGCUACAACUUCU : %GC = 40.0
AGUGCUACUACAAGU : %GC = 40.0
AGUGCUACUACAUCA : %GC = 40.0
AGUGCUACUACAUCU : %GC = 40.0
AGUGCUACUACUAGU : %GC = 40.0
AGUGCUACUACUUCA : %GC = 40.0
AGUGCUACUACUUCU : %GC = 40.0
UCAGCAACAACAAGU : %GC = 40.0
UCAGCAACAACAUCA : %GC = 40.0
UCAGCAACAACAUCU : %GC = 40.0
UCAGCAACAACUAGU : %GC = 40.0
UCAGCAACAACUUCA : %GC = 40.0

AGUGCGACUACAUCG : %GC = 53.33
AGUGCGACUACCAGU : %GC = 53.33
AGUGCGACUACCUCA : %GC = 53.33
AGUGCGACUACCUCU : %GC = 53.33
AGUGCGACUACGAGU : %GC = 53.33
AGUGCGACUACGUCA : %GC = 53.33
AGUGCGACUACGUCU : %GC = 53.33
AGUGCGACUACUAGC : %GC = 53.33
AGUGCGACUACUUCC : %GC = 53.33
AGUGCGACUACUUCG : %GC = 53.33
AGUGCUACAACCAGC : %GC = 53.33
AGUGCUACAACCUCC : %GC = 53.33
AGUGCUACAACCUCG : %GC = 53.33
AGUGCUACAACGAGC : %GC = 53.33
AGUGCUACAACGUCC : %GC = 53.33
AGUGCUACAACGUCG : %GC = 53.33
AGUGCUACCACAAGC : %GC = 53.33
AGUGCUACCACAUCC : %GC = 53.33
AGUGCUACCACAUCG : %GC = 53.33
AGUGCUACCACCAGU : %GC = 53.33
AGUGCUACCACCUCA : %GC = 53.33
AGUGCUACCACCUCU : %GC = 53.33
AGUGCUACCACGAGU : %GC = 53.33
AGUGCUACCACGUCA : %GC = 53.33
AGUGCUACCACGUCU : %GC = 53.33
AGUGCUACCACUAGC : %GC = 53.33
AGUGCUACCACUUCC : %GC = 53.33
AGUGCUACCACUUCG : %GC = 53.33
AGUGCUACGACAAGC : %GC = 53.33
AGUGCUACGACAUCC : %GC = 53.33
AGUGCUACGACAUCG : %GC = 53.33
AGUGCUACGACCAGU : %GC = 53.33
AGUGCUACGACCUCA : %GC = 53.33
AGUGCUACGA

AGCGCAACCACAUCC : %GC = 60.0
AGCGCAACCACAUCG : %GC = 60.0
AGCGCAACCACCAGU : %GC = 60.0
AGCGCAACCACCUCA : %GC = 60.0
AGCGCAACCACCUCU : %GC = 60.0
AGCGCAACCACGAGU : %GC = 60.0
AGCGCAACCACGUCA : %GC = 60.0
AGCGCAACCACGUCU : %GC = 60.0
AGCGCAACCACUAGC : %GC = 60.0
AGCGCAACCACUUCC : %GC = 60.0
AGCGCAACCACUUCG : %GC = 60.0
AGCGCAACGACAAGC : %GC = 60.0
AGCGCAACGACAUCC : %GC = 60.0
AGCGCAACGACAUCG : %GC = 60.0
AGCGCAACGACCAGU : %GC = 60.0
AGCGCAACGACCUCA : %GC = 60.0
AGCGCAACGACCUCU : %GC = 60.0
AGCGCAACGACGAGU : %GC = 60.0
AGCGCAACGACGUCA : %GC = 60.0
AGCGCAACGACGUCU : %GC = 60.0
AGCGCAACGACUAGC : %GC = 60.0
AGCGCAACGACUUCC : %GC = 60.0
AGCGCAACGACUUCG : %GC = 60.0
AGCGCAACUACCAGC : %GC = 60.0
AGCGCAACUACCUCC : %GC = 60.0
AGCGCAACUACCUCG : %GC = 60.0
AGCGCAACUACGAGC : %GC = 60.0
AGCGCAACUACGUCC : %GC = 60.0
AGCGCAACUACGUCG : %GC = 60.0
AGCGCCACAACAAGC : %GC = 60.0
AGCGCCACAACAUCC : %GC = 60.0
AGCGCCACAACAUCG : %GC = 60.0
AGCGCCACAACCAGU : %GC = 60.0
AGCGCCACAACCUCA : %GC = 60.0
AGCGCCACAACCUC

UCCGCGACAACGUCA : %GC = 60.0
UCCGCGACAACGUCU : %GC = 60.0
UCCGCGACAACUAGC : %GC = 60.0
UCCGCGACAACUUCC : %GC = 60.0
UCCGCGACAACUUCG : %GC = 60.0
UCCGCGACCACAAGU : %GC = 60.0
UCCGCGACCACAUCA : %GC = 60.0
UCCGCGACCACAUCU : %GC = 60.0
UCCGCGACCACUAGU : %GC = 60.0
UCCGCGACCACUUCA : %GC = 60.0
UCCGCGACCACUUCU : %GC = 60.0
UCCGCGACGACAAGU : %GC = 60.0
UCCGCGACGACAUCA : %GC = 60.0
UCCGCGACGACAUCU : %GC = 60.0
UCCGCGACGACUAGU : %GC = 60.0
UCCGCGACGACUUCA : %GC = 60.0
UCCGCGACGACUUCU : %GC = 60.0
UCCGCGACUACAAGC : %GC = 60.0
UCCGCGACUACAUCC : %GC = 60.0
UCCGCGACUACAUCG : %GC = 60.0
UCCGCGACUACCAGU : %GC = 60.0
UCCGCGACUACCUCA : %GC = 60.0
UCCGCGACUACCUCU : %GC = 60.0
UCCGCGACUACGAGU : %GC = 60.0
UCCGCGACUACGUCA : %GC = 60.0
UCCGCGACUACGUCU : %GC = 60.0
UCCGCGACUACUAGC : %GC = 60.0
UCCGCGACUACUUCC : %GC = 60.0
UCCGCGACUACUUCG : %GC = 60.0
UCCGCUACAACCAGC : %GC = 60.0
UCCGCUACAACCUCC : %GC = 60.0
UCCGCUACAACCUCG : %GC = 60.0
UCCGCUACAACGAGC : %GC = 60.0
UCCGCUACAACGUCC : %GC = 60.0
UCCGCUACAACGUC

UCGGCAACGACGUCG : %GC = 66.67
UCGGCCACAACCAGC : %GC = 66.67
UCGGCCACAACCUCC : %GC = 66.67
UCGGCCACAACCUCG : %GC = 66.67
UCGGCCACAACGAGC : %GC = 66.67
UCGGCCACAACGUCC : %GC = 66.67
UCGGCCACAACGUCG : %GC = 66.67
UCGGCCACCACAAGC : %GC = 66.67
UCGGCCACCACAUCC : %GC = 66.67
UCGGCCACCACAUCG : %GC = 66.67
UCGGCCACCACCAGU : %GC = 66.67
UCGGCCACCACCUCA : %GC = 66.67
UCGGCCACCACCUCU : %GC = 66.67
UCGGCCACCACGAGU : %GC = 66.67
UCGGCCACCACGUCA : %GC = 66.67
UCGGCCACCACGUCU : %GC = 66.67
UCGGCCACCACUAGC : %GC = 66.67
UCGGCCACCACUUCC : %GC = 66.67
UCGGCCACCACUUCG : %GC = 66.67
UCGGCCACGACAAGC : %GC = 66.67
UCGGCCACGACAUCC : %GC = 66.67
UCGGCCACGACAUCG : %GC = 66.67
UCGGCCACGACCAGU : %GC = 66.67
UCGGCCACGACCUCA : %GC = 66.67
UCGGCCACGACCUCU : %GC = 66.67
UCGGCCACGACGAGU : %GC = 66.67
UCGGCCACGACGUCA : %GC = 66.67
UCGGCCACGACGUCU : %GC = 66.67
UCGGCCACGACUAGC : %GC = 66.67
UCGGCCACGACUUCC : %GC = 66.67
UCGGCCACGACUUCG : %GC = 66.67
UCGGCCACUACCAGC : %GC = 66.67
UCGGCCACUACCUCC : %GC = 66.67
UCGGCCACUA

UCUGCAACCACUUCA : %GC = 46.67
UCUGCAACCACUUCU : %GC = 46.67
UCUGCAACGACAAGU : %GC = 46.67
UCUGCAACGACAUCA : %GC = 46.67
UCUGCAACGACAUCU : %GC = 46.67
UCUGCAACGACUAGU : %GC = 46.67
UCUGCAACGACUUCA : %GC = 46.67
UCUGCAACGACUUCU : %GC = 46.67
UCUGCAACUACAAGC : %GC = 46.67
UCUGCAACUACAUCC : %GC = 46.67
UCUGCAACUACAUCG : %GC = 46.67
UCUGCAACUACCAGU : %GC = 46.67
UCUGCAACUACCUCA : %GC = 46.67
UCUGCAACUACCUCU : %GC = 46.67
UCUGCAACUACGAGU : %GC = 46.67
UCUGCAACUACGUCA : %GC = 46.67
UCUGCAACUACGUCU : %GC = 46.67
UCUGCAACUACUAGC : %GC = 46.67
UCUGCAACUACUUCC : %GC = 46.67
UCUGCAACUACUUCG : %GC = 46.67
UCUGCCACAACAAGU : %GC = 46.67
UCUGCCACAACAUCA : %GC = 46.67
UCUGCCACAACAUCU : %GC = 46.67
UCUGCCACAACUAGU : %GC = 46.67
UCUGCCACAACUUCA : %GC = 46.67
UCUGCCACAACUUCU : %GC = 46.67
UCUGCCACUACAAGU : %GC = 46.67
UCUGCCACUACAUCA : %GC = 46.67
UCUGCCACUACAUCU : %GC = 46.67
UCUGCCACUACUAGU : %GC = 46.67
UCUGCCACUACUUCA : %GC = 46.67
UCUGCCACUACUUCU : %GC = 46.67
UCUGCGACAACAAGU : %GC = 46.67
UCUGCGACAA

UCGGCAACCACAUCU : %GC = 53.33
UCGGCAACCACUAGU : %GC = 53.33
UCGGCAACCACUUCA : %GC = 53.33
UCGGCAACCACUUCU : %GC = 53.33
UCGGCAACGACAAGU : %GC = 53.33
UCGGCAACGACAUCA : %GC = 53.33
UCGGCAACGACAUCU : %GC = 53.33
UCGGCAACGACUAGU : %GC = 53.33
UCGGCAACGACUUCA : %GC = 53.33
UCGGCAACGACUUCU : %GC = 53.33
UCGGCAACUACAAGC : %GC = 53.33
UCGGCAACUACAUCC : %GC = 53.33
UCGGCAACUACAUCG : %GC = 53.33
UCGGCAACUACCAGU : %GC = 53.33
UCGGCAACUACCUCA : %GC = 53.33
UCGGCAACUACCUCU : %GC = 53.33
UCGGCAACUACGAGU : %GC = 53.33
UCGGCAACUACGUCA : %GC = 53.33
UCGGCAACUACGUCU : %GC = 53.33
UCGGCAACUACUAGC : %GC = 53.33
UCGGCAACUACUUCC : %GC = 53.33
UCGGCAACUACUUCG : %GC = 53.33
UCGGCCACAACAAGU : %GC = 53.33
UCGGCCACAACAUCA : %GC = 53.33
UCGGCCACAACAUCU : %GC = 53.33
UCGGCCACAACUAGU : %GC = 53.33
UCGGCCACAACUUCA : %GC = 53.33
UCGGCCACAACUUCU : %GC = 53.33
UCGGCCACUACAAGU : %GC = 53.33
UCGGCCACUACAUCA : %GC = 53.33
UCGGCCACUACAUCU : %GC = 53.33
UCGGCCACUACUAGU : %GC = 53.33
UCGGCCACUACUUCA : %GC = 53.33
UCGGCCACUA