In [None]:
# Codon table: maps each of the 64 three-letter DNA codons (alphabet A/C/G/T)
# to a one-letter amino-acid code. '_' marks the stop codons (TAA, TAG, TGA);
# 'ATG' is the only codon for 'M', the symbol the protein helpers in this
# notebook treat as the start of a protein.
gencode = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'}

In [None]:
# Inverse codon table: amino-acid symbol -> list of every codon in `gencode`
# that encodes it (codons keep the order they have in `gencode`).
# NOTE: the name shadows the builtin `reversed`; it is kept because the other
# cells of this notebook look the table up under this name.
reversed = {
    amino: [codon for codon in gencode if gencode[codon] == amino]
    for amino in set(gencode.values())
}
reversed

In [None]:
def my_amino2dna(amino):
  """Back-translate an amino-acid string into DNA, one random codon per residue.

  Every residue of `amino` is looked up in the module-level `reversed` table
  and one of the codons listed there is drawn with `random.choice`; the drawn
  codons are concatenated in order.

  Raises KeyError if a residue is not a key of `reversed`.
  """
  return ''.join(random.choice(reversed[residue]) for residue in amino)

In [None]:
def comp_inv(dna):
  """Return the reverse complement of `dna`.

  The input is lower-cased first, so both 'acgt' and 'ACGT' are accepted; the
  complemented bases come out in upper case. Characters other than a/c/g/t
  are not complemented: they stay lower-cased, in their reversed position.
  """
  complement = str.maketrans('acgt', 'TGCA')
  return dna.lower().translate(complement)[::-1]

In [None]:
import random
def gen_prot(avg_sz):
  """Generate a random protein string of the form 'M' + body + '_'.

  The body comes from `get_random_str(avg_sz, alphabet)`, where the alphabet
  is every key of the module-level `reversed` table except the stop symbol
  '_' (so 'M' is not excluded from the body).
  """
  stop_symbol = '_'
  body_alphabet = [amino for amino in reversed if amino != stop_symbol]
  return 'M' + get_random_str(avg_sz, body_alphabet) + stop_symbol

def random_intron(avg_sz):
  """Generate a random A/C/G/T spacer that contains neither 'ATG' nor 'CAT'.

  'ATG' is the start codon and 'CAT' is its reverse complement, so a spacer
  free of both cannot by itself open a protein on either strand.

  The raw string comes from `get_random_str(avg_sz, "ACGT")`; `avg_sz` is
  passed through unchanged.
  """
  intron = get_random_str(avg_sz, "ACGT")
  # str.replace returns a new string, so the result has to be assigned back.
  # One pass is not enough: rewriting 'CAT' can create a fresh 'ATG'
  # ('CATTG' -> 'CAATG'), hence the loop. Each replacement lowers
  # 2*count('G') + count('T'), so the loop always terminates.
  while 'ATG' in intron or 'CAT' in intron:
    intron = intron.replace('ATG', 'ATT').replace('CAT', 'CAA')
  # Boundary guards: presumably meant to stop a neighbouring sequence from
  # completing 'A'+'TG' or 'CA'+'T' across the junction. Neither padding
  # character can introduce 'ATG' or 'CAT' inside the spacer itself.
  if intron.startswith('TG'): intron = 'T' + intron
  if intron.endswith('CA'): intron += 'G'
  return intron

def get_random_str(avg_sz, chars):
  """Build a random string over `chars` whose length is itself random.

  Before each character a uniform draw decides whether to continue: the
  string ends as soon as `random.random()` is not greater than `1 / avg_sz`;
  otherwise one more element of `chars` is added via `random.choice`.

  Raises ZeroDivisionError when `avg_sz` is 0.
  """
  picked = []
  while random.random() > 1 / avg_sz:
    picked.append(random.choice(chars))
  return ''.join(picked)

def amino2dna(amino):
  """Encode an amino-acid string as DNA, drawing a random codon per residue.

  For each residue the candidate codons are read from the module-level
  `reversed` table and one is selected with `random.choice`.

  Raises KeyError if a residue is not a key of `reversed`.
  """
  codons = []
  for residue in amino:
    codons.append(random.choice(reversed[residue]))
  return ''.join(codons)

In [None]:
# Quick manual check: display one randomly generated intron (avg_sz = 99).
random_intron(99)

In [None]:
def get_dna1_3(num_prots = 25, avg_intron_size = 30, avg_prot_size = 50):
  """Generate a single DNA strand made of random proteins separated by introns.

  The strand is: intron, protein, intron, protein, ..., intron. Each protein
  comes from `gen_prot(avg_prot_size)` and is encoded with `amino2dna`; each
  intron comes from `random_intron(avg_intron_size)`.

  Returns a tuple `(dna, prots)`: the concatenated DNA string and the list of
  generated protein strings, in the order they appear in `dna`.
  """
  pieces = []
  prots = []
  for _ in range(num_prots):
    pieces.append(random_intron(avg_intron_size))
    protein = gen_prot(avg_prot_size)
    prots.append(protein)
    pieces.append(amino2dna(protein))
  # Close the strand with a trailing intron.
  pieces.append(random_intron(avg_intron_size))
  return ''.join(pieces), prots

def get_dna(num_prots = 25, avg_intron_size = 30, avg_prot_size = 50):
  """Generate DNA carrying random proteins on both strands.

  Two strands are produced with `get_dna1_3` (same settings for both). The
  result is the first strand followed by the reverse complement (`comp_inv`)
  of the second, so the second batch of proteins is encoded on the opposite
  strand.

  Returns a tuple `(dna, prots)` where `prots` is the set of all protein
  strings generated for either strand.
  """
  settings = dict(num_prots = num_prots, avg_intron_size = avg_intron_size, avg_prot_size = avg_prot_size)
  forward_dna, forward_prots = get_dna1_3(**settings)
  reverse_src, reverse_prots = get_dna1_3(**settings)
  return forward_dna + comp_inv(reverse_src), set(forward_prots).union(reverse_prots)

In [None]:
def dna_valido(dna):
  """Return True when every character of `dna` is one of A, C, G, T.

  The check is case-sensitive (lower-case letters are rejected) and the empty
  string counts as valid.
  """
  return all(base in "ACGT" for base in dna)

def dna2amino(dna):
  """Translate `dna` into an amino-acid string, reading codons from index 0.

  Consecutive 3-character slices are looked up in the module-level `gencode`
  table; a slice with no entry (e.g. a trailing fragment shorter than three
  bases) contributes nothing to the result.

  Raises AssertionError if `dna_valido(dna)` is false.
  """
  assert dna_valido(dna), f"{dna} não é um pedaço de DNA válido!"
  residues = []
  for start in range(0, len(dna), 3):
    codon = dna[start : start + 3]
    residues.append(gencode.get(codon, ''))
  return ''.join(residues)

def get_orfs(dna):
  """Translate `dna` in all six reading frames.

  Returns a list of six amino-acid strings: offsets 0, 1, 2 of `dna`,
  followed by offsets 0, 1, 2 of its reverse complement (`comp_inv(dna)`).
  Each entry is the translation of the whole frame as produced by
  `dna2amino`, not a single protein.
  """
  frames = []
  for offset in range(3):
    frames.append(dna2amino(dna[offset:]))
  reverse_strand = comp_inv(dna)
  for offset in range(3):
    frames.append(dna2amino(reverse_strand[offset:]))
  return frames

def get_orf_prot(orf):
  """Extract the proteins found in a translated reading frame.

  Scanning left to right, a protein opens at the first 'M' seen while no
  protein is open and closes at the next '_' (inclusive). An 'M' inside an
  open protein is just another residue, and a protein still open at the end
  of `orf` is discarded.

  Returns the set of protein strings, each starting with 'M' and ending
  with '_'.
  """
  found = set()
  current = None  # residues of the protein being read; None while outside one
  for residue in orf:
    if current is None:
      if residue == 'M':
        current = ['M']
      continue
    current.append(residue)
    if residue == '_':
      found.add(''.join(current))
      current = None
  return found

def get_prots(dna):
  """Return the set of all proteins found in any of the six frames of `dna`.

  Each frame produced by `get_orfs(dna)` is scanned with `get_orf_prot` and
  the per-frame sets are merged into one.
  """
  return set().union(*(get_orf_prot(frame) for frame in get_orfs(dna)))

Acontece que a sequência gerada contém todas as proteínas mas, em alguns casos, contidas em proteínas maiores. Logo, os testes foram escritos de outra forma:
- Ou as proteínas foram encontradas, ou
- As proteínas estão contidas em proteínas maiores (eram estas as que não tinham sido encontradas)

In [None]:
# Sanity check of the scanner on a hand-written frame: only M..._ stretches count.
assert sorted(get_orf_prot('MBJ_AMTH_TYMGPW_THMAMSA_TMY_')) == sorted(['MBJ_', 'MTH_', 'MGPW_', 'MAMSA_', 'MY_'])
# Randomized round trip: generate DNA for 100 proteins per strand, then
# recover every protein the six-frame scan can find.
dna, prot = get_dna(num_prots = 100)
all_prots = get_prots(dna)

# For every generated protein that was not recovered verbatim, check that it
# is contained in one of the recovered proteins; the printed list shows the
# extra prefix of each recovered protein that contains it.
for p in [p for p in prot if p not in all_prots]:
  print(p, [P[:P.find(p)] for P in all_prots if p in P])
  assert any(p in P for P in all_prots)


# Hand-built example: dna1 carries two proteins on the forward strand; dna2
# carries three and is placed on the reverse strand via comp_inv. The literal
# pieces in between and around are fixed filler sequences.
dna1 = my_amino2dna("MATFH_") + "TGAT" + my_amino2dna('M_')
dna2 = my_amino2dna("MQRSSFG_") + 'CG' + my_amino2dna("MLHYTAFG_") + 'GTAT' + my_amino2dna('MWYT_')
dna = 'TGGGT' + dna1 + comp_inv(dna2) + 'TGTGGTAAACC'

# All five proteins must be recovered exactly, whichever codons were drawn.
assert all(p in get_prots(dna) for  p in "MATFH_ M_ MQRSSFG_ MLHYTAFG_ MWYT_".split())

In [None]:
import unittest

class TestBioInf(unittest.TestCase):

    def test_valido(self):
      self.assertTrue(dna_valido(""))
      validas = "AAA AGGC GGGG TTT CCC GT TGC ATGA TCGA".split()
      invalidas = "AXA TGCHA YRT GITA AZ".split()
      for s in validas:
        self.assertTrue(dna_valido(s), f'valido {s}')
      for s in invalidas:
        self.assertFalse(dna_valido(s), f'invalido {s}')
    
    def test_get_orf_prot(self):
        testes = [
            ("YTMAMOP_NKENDRWNSKSSHQVFNKMENDRWNSMYIKWRSLFCWYPHKNVLIILQWLALDFWGCEFI_", {"MAMOP_", "MENDRWNSMYIKWRSLFCWYPHKNVLIILQWLALDFWGCEFI_"}),
            ("NKM_POPMENDMWYP_YTMOP", {"M_","MENDMWYP_"}),
            ("YMTYOP", set()),
            ("YMTYOP_I",{"MTYOP_"})
        ]
        for s, p in testes:
          self.assertEqual(get_orf_prot(s), p)

    def test_get_orfs(self):
      self.assertEqual(get_orfs("ACGTACTGCACGTA"),['TYCT', 'RTAR', 'VLHV', 'YVQY', 'TCST', 'RAVR'])

    def test_get_prots(self):
      dna = "TTATGGGTATGGCAACTTTCCATTAATGATATGTAACTAAGTGTACCACATATACCTAGCCAAAGGCTGTGTAGTGGAGCATCGTTATCCGAAGCTTGATCGTTGCATTGTGGTAAACC"
      self.assertEqual(get_prots(dna), set("MGMATFH_ M_ MQRSSFG_ MLHYTAFG_ MWYT_ MICN_".split()), get_orfs(dna))


# Run the tests inside the notebook: argv=[''] makes unittest ignore the
# arguments in sys.argv, and exit=False keeps it from calling sys.exit()
# when the run finishes.
unittest.main(argv=[''], exit=False)