In [41]:
from lookup import rna_codon_to_amino_acid
from tools import read_fasta_file, reverse_complement, find_indices_of_matches, transcribe_DNA_to_RNA, translate_rna_into_protein

In [34]:
def get_open_reading_frame(s):
    """
    Return the open reading frame (ORF) found in the DNA string ``s``.

    The frame begins at the first occurrence of the start codon 'ATG' and is
    read in steps of 3 nucleotides until the first in-frame stop codon
    ('TAA', 'TAG' or 'TGA'). Stop codons that are not aligned with the frame
    set by the start codon are ignored.

    Parameters
    ----------
    s : str
        DNA string to scan. Codons are matched case-sensitively against
        upper-case letters.

    Returns
    -------
    str
        The substring from the start codon (inclusive) up to the first
        in-frame stop codon (exclusive). An empty string is returned when
        ``s`` contains no start codon (a notice is printed in that case), or
        when no in-frame stop codon follows the start codon.
    """
    STOP_CODONS = ('TAA', 'TAG', 'TGA')

    start_index = s.find('ATG')
    if start_index == -1:
        print('Start codon not found')
        return ''

    # Walk codon by codon in the frame fixed by the start codon. A trailing
    # fragment shorter than 3 nucleotides can never equal a stop codon, so it
    # simply falls through to the final "no ORF" return.
    for i in range(start_index, len(s), 3):
        if s[i:i + 3] in STOP_CODONS:
            # Everything between the start and position i is made of whole
            # codons, so one slice yields the ORF without accumulating a list.
            return s[start_index:i]
    return ''

In [52]:
def translate_all_orfs_into_proteins(dna_string):
    """
    Return every distinct candidate protein string that can be translated from ORFs of the given DNA string.

    Both the given DNA string and its reverse complement are scanned. For every
    occurrence of the start codon 'ATG', the suffix starting at that occurrence
    is searched for an open reading frame; each ORF found is transcribed to RNA
    and translated into a protein string.

    Parameters
    ----------
    dna_string : str
        The DNA string to analyse.

    Returns
    -------
    list
        The distinct protein strings, in the order they were first found
        (given strand first, then the reverse complement).
    """
    START_CODON = 'ATG'

    proteins = list()
    # Companion set for O(1) duplicate checks; the list keeps first-seen order.
    # Assumes proteins are hashable strings (they are joined with '\n' by solve).
    seen = set()

    for strand in (dna_string, reverse_complement(dna_string)):
        for start in find_indices_of_matches(strand, START_CODON, base=0):
            # Handle each candidate suffix right away rather than first
            # collecting all suffixes of both strands in memory.
            orf = get_open_reading_frame(strand[start:])
            if not orf:
                # No in-frame stop codon after this start codon
                continue
            protein = translate_rna_into_protein(transcribe_DNA_to_RNA(orf))
            if protein not in seen:
                seen.add(protein)
                proteins.append(protein)
    return proteins
    

In [53]:
def solve(path):
    """
    Print the candidate proteins of the first sequence in a FASTA file, one per line.

    Parameters
    ----------
    path : str
        Location of the FASTA file passed to ``read_fasta_file``.

    Notes
    -----
    Only the first value of the mapping returned by ``read_fasta_file`` is
    used (presumably record id -> DNA string; confirm against ``tools``).
    Nothing is returned; the result is written to standard output.
    """
    records = read_fasta_file(path)
    sequences = list(records.values())
    candidate_proteins = translate_all_orfs_into_proteins(sequences[0])
    print('\n'.join(candidate_proteins))

In [54]:
# Test: run the solver on the sample input file; each distinct protein is printed on its own line
solve('./txt/rosalind_orf_test.txt')

M
MGMTPRLGLESLLE
MTPRLGLESLLE
MLLGSFRLIPKETLIQVAGSSPCNLS


In [55]:
# Submission: run the solver on the dataset whose printed output is to be submitted
solve('./txt/rosalind_orf.txt')

MSNKEHHSPIN
MCVPNVIGLELKSLLQ
MSSV
MVQPHAPDKN
MPRTKTSYSGV
MSGFIWHCSTSPTVLRVGVACKRAETVLHMIAIFGIS
MALFNQPNSFKGWSGLQESGDGSPHDRNLRYLLIGVATPIRRYRRLRSWEWFCSIVKLCRGRYTRDFDFSMAV
MIAIFGIS
MGMVLFYR
MVLFYR
M
MAV
MDRPVLEVLVAARSLISAPEPLIDQAAQ
MHAGTNMSGRSNHYGKLHYMSNYSDQRTRMGNEN
MSGRSNHYGKLHYMSNYSDQRTRMGNEN
MSNYSDQRTRMGNEN
MGNEN
MLMLGLEK
MLGLEK
MNIYISSDRKWGSSGPYLVQPL
MG
MFI
MVYALLKSVLVSHARPLIRVVGHVMQFTVVIRPAAHVCPGVHNLLCCLIDQWFRC
MRVR
MQFTVVIRPAAHVCPGVHNLLCCLIDQWFRC
MFVPACIIYCAA
MWRTVSALLQATPTLKTVGLVEQCHIKPDISSRTARHTPE
MRLNHY
MTLGTHIRRY
MVLFVRHRTSFFLTTCRCLSS
MSFKLASRGL
