In [None]:
>>> query = "AATATAT"
>>> seq = "AATATGTTATATAATAATATTT"
>>> w = 3
>>> qm = query_map(query, w)
>>> qm
{'AAT': [0], 'ATA': [1, 3], 'TAT': [2, 4]}
>>> hits(qm, seq)
[(0, 0), (0, 12), (0, 15), (1, 1), (1, 8), (1, 10), (1, 13), (1, 16),
(3, 1), (3, 8), (3, 10), (3, 13), (3, 16), (2, 2), (2, 7), (2, 9),
(2, 17), (4, 2), (4, 7), (4, 9), (4, 17)]
>>> extend_hit(query, seq, (1, 16), 3)
(0, 15, 7, 6)
>>> best_hit(query, seq, 3)
(0, 0, 7, 6)

In [2]:
query = "AATATAT"
seq = "AATATGTTATATAATAATATTT"
w = 3

In [5]:
def query_map(query, w):
  """
  query: query string
  w: window size

  return dict with keys as subseqs of w size and values as the offsets
  """
  tam = len(query)
  res = {}
  for chave, offset in [(query[p: p + w], p) for p in range(0, tam - w + 1)]:
    if chave not in res: res[chave] = []
    res[chave].append(offset)
  return res

def get_all_offsets(s1, s2):
  """
  return all offsets of s1 in s2
  """
  w = len(s1)
  #return [p for p in range(0, len(s2) - w + 1) if s2[p : p + w] == s1]
  res = []
  for p in range(0, len(s2) - w + 1):
    if s2[p : p + w] == s1:
      res.append(p)
  return res

def hits(qm, seq):
  """
  qm: <- query_map(query, w)
  seq: is the sequence where to look

  returns List[hits] where hit = (o1, o2)
  o1 is offset in query
  o2 is offset in seq
  """
  res = []
  for chave, offsets in qm.items():
    for o_query in offsets:
      for o_seq in get_all_offsets(chave, seq):
        res.append((o_query, o_seq))
  return res

def extend_hit_dir(query, seq, o1, o2, direction):
  """
  query: query string
  seq: sequence
  hit: (offset in query, offset in seq)
  w: window size
  direction: -1 or +1

  returns (o1, o2, matches, count)
  """
  matches = 0
  count = 0
  while o1 >= 0 and o2 >= 0 and o1 < len(query) and o2 < len(seq):
    matches += 1 if query[o1] == seq[o2] else 0
    count += 1
    if 2 * matches < count:
      return o1, o2, matches, count
    o1 += direction
    o2 += direction
  return o1 - direction, o2 - direction, matches, count

def extend_hit(query, seq, hit, w):
  """
  query: query string
  seq: sequence
  hit: (offset in query, offset in seq)
  w: window size

  returns (o1, o2, size, matches)

  size is the total size of the expansion
   
  """
  o1, o2 = hit
  left  = extend_hit_dir(query, seq, o1 - 1, o2 - 1, -1)
  right = extend_hit_dir(query, seq, o1 + w, o2 + w, +1)

  """
  O for offset
  M for match
  S for size
  """
  O1, O2, ML, SL = left
  _,   _, MR, SR = right

  print({'ML':ML, 'SR':SR, 'SL':SL,'MR':MR})
  return O1, O2, w + SL + SR, ML + w + MR


qm = query_map(query, 3)
hits(qm, seq)
extend_hit(query, seq, (1, 16), 3)

{'ML': 1, 'SR': 3, 'SL': 1, 'MR': 2}


(0, 15, 7, 6)