In [6]:
import time
import tracemalloc
import csv

In [7]:
UP = (-1,0)
LEFT = (0, -1)
TOPLEFT = (-1, -1)
ORIGIN = (0, 0)

def traceback_global(v, w, pointers):
  i,j = len(v), len(w)
  new_v = []
  new_w = []
  while True:
      di, dj = pointers[i][j]
      if (di,dj) == LEFT:
          new_v.append('-')
          new_w.append(w[j-1])
      elif (di,dj) == UP:
          new_v.append(v[i-1])
          new_w.append('-')
      elif (di,dj) == TOPLEFT:
          new_v.append(v[i-1])
          new_w.append(w[j-1])
      i, j = i + di, j + dj
      if (i <= 0 and j <= 0):
          break
  return ''.join(new_v[::-1])+'\n'+''.join(new_w[::-1])



def global_align(v, w, delta):
  """
  Returns the score of the maximum scoring alignment of the strings v and w, as well as the actual alignment as
  computed by traceback_global.

  :param: v
  :param: w
  :param: delta
  """
  start_time = time.time()
  tracemalloc.start()
  
  M = [[0 for j in range(len(w)+1)] for i in range(len(v)+1)]
  pointers = [[ORIGIN for j in range(len(w)+1)] for i in range(len(v)+1)]
  score, alignment = None, None

  # YOUR CODE HERE
  M[0][0] = 0;
  for i in range(1, len(v) + 1):
    cur_v = v[i - 1]
    M[i][0] = M[i - 1][0] + delta[cur_v]["-"]
    pointers[i][0] = UP

  for i in range(1, len(w) + 1):
    cur_w = w[i - 1]
    M[0][i] = M[0][i - 1] + delta["-"][cur_w]
    pointers[0][i] = LEFT

  for i in range(1, len(v) + 1):
    for j in range(1, len(w) + 1):
      cur_v = v[i - 1]
      cur_w = w[j - 1]

      left_score = M[i][j - 1] + delta["-"][cur_w]
      top_score = M[i - 1][j] + delta[cur_v]["-"]

      M[i][j] = max(left_score, M[i - 1][j - 1] + delta[cur_v][cur_w], top_score)

      if M[i][j] == left_score:
        pointers[i][j] = LEFT
      elif M[i][j] == top_score:
        pointers[i][j] = UP
      else:
        pointers[i][j] = TOPLEFT

  score = M[len(v)][len(w)]

  alignment = traceback_global(v,w, pointers)

  # Stop memory tracking and calculate time elapsed
  end_time = time.time()
  current, peak = tracemalloc.get_traced_memory()
  tracemalloc.stop()
  elapsed_time = end_time - start_time

  print(f"Time taken: {elapsed_time:.4f} seconds")
  print(f"Current memory usage: {current / 10**6:.2f} MB")
  print(f"Peak memory usage: {peak / 10**6:.2f} MB")

  return alignment, score, elapsed_time, current, peak

In [8]:
keys = ['A', 'C', 'T', 'G', '-']
delta = {}
for i in range(len(keys)):
    delta[keys[i]] = {k : v for (k,v) in zip(keys, [1 if keys[i] == keys[j]  else -1 for j in range(len(keys))])}

global_align("TAGATA", "GTAGGCTTAAGGTTA", delta)
print("keys")

Time taken: 0.0000 seconds
Current memory usage: 0.00 MB
Peak memory usage: 0.00 MB
keys


In [10]:
def test_global_align(csv_file, delta):
    # Load the sequences from the CSV file
    sequences = []
    with open(csv_file, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            sequences.append((row['sequence1'], row['sequence2']))
    
    # Test on the first 10 rows
    for idx, (seq1, seq2) in enumerate(sequences[:10]):
        writer = csv.writer('alignment_results.csv')
        # Write header row
        writer.writerow(['Sequences', 'Score', 'Elapsed Time (s)', 'Current Memory (MB)', 'Peak Memory (MB)'])
        
        print(f"Test {idx + 1}:")
        aligned_seq, score, elapsed_time, current_mem, peak_mem = global_align(seq1, seq2, delta)

        current_memory = current_mem / 10**6  # Convert bytes to MB
        peak_memory = peak_mem / 10**6  # Convert bytes to MB

        print(f"Aligned Seq: {aligned_seq}")
        print(f"Score: {score}")
        print()

        # Write to CSV
        writer.writerow([aligned_seq, score, elapsed_time, current_memory, peak_memory])


In [11]:
if __name__ == "__main__":
    # Define the scoring matrix (delta)
    keys = ['A', 'C', 'T', 'G', '-']
    delta = {}
    for i in range(len(keys)):
        delta[keys[i]] = {k : v for (k,v) in zip(keys, [1 if keys[i] == keys[j] else -1 for j in range(len(keys))])}
    
    # Replace with the path to your CSV file
    csv_file = "sequence_pairs.csv"
    test_global_align(csv_file, delta)

FileNotFoundError: [Errno 2] No such file or directory: 'sequence_pairs.csv'

In [None]:
#forward function to get prefix
def get_prefix(v, w, delta):
    #same process as global but different output
    M = [[0 for j in range(len(w)+1)] for i in range(len(v)+1)]
    M[0][0] = 0;
    for i in range(1, len(v) + 1):
        cur_v = v[i - 1]
        M[i][0] = M[i - 1][0] + delta[cur_v]["-"]

    for j in range(0, len(w) + 1):
        for i in range(0, len(v) + 1):
            if (i > 0 and j > 0):
                cur_v = v[i - 1]
                cur_w = w[j - 1]
                
                left_score = M[i][j - 1] + delta["-"][cur_w]
                top_score = M[i - 1][j] + delta[cur_v]["-"]
                
                M[i][j] = max(left_score, M[i - 1][j - 1] + delta[cur_v][cur_w], top_score)
            #clear value from column no longer in use
            if (j >= 2):
                M[i][j - 2] = None
        #clear column from memory so only two stored at a time 
    columns = list(zip(*M))
    #print(M)
    return columns[len(w)]


#backward function to get suffix 
def get_suffix(v, w, delta): 
    #same process but reversing edge direction 
    M = [[0 for j in range(len(w)+1)] for i in range(len(v)+1)]
    M[-1][-1] = 0;
    for i in range(len(v) - 1, -1, -1):
        cur_v = v[i - 1]
        M[i][len(w)] = M[i + 1][0] + delta[cur_v]["-"]

    for j in range(len(w), -1, -1):
        for i in range(len(v), -1, -1):
            if (j < len(w) and i < len(v)):
                cur_v = v[i - 1]
                cur_w = w[j - 1]
                
                left_score = M[i][j + 1] + delta["-"][cur_w]
                top_score = M[i + 1][j] + delta[cur_v]["-"]
                
                M[i][j] = max(left_score, M[i + 1][j + 1] + delta[cur_v][cur_w], top_score)
            #clear value from column no longer in use
            if (j + 2 <= len(w)):
                M[i][j + 2] = None
    
    columns = list(zip(*M))
    #print(M)
    return columns[0]

#put into other function and have it return a new node in tree 
#hischberg recurance
def hirschberg(v, w, delta, i, j, i_prime, j_prime):
    if (j_prime - j <= 1):
        #base case, just get aligment with global alignment? 
        return []
    else:
        v_split = (int)(i_prime/2)
        w_split = (int)(j_prime/2)
        #print(v_split, w_split)
        weights = get_prefix(v[i : v_split], w[j : w_split], delta) + get_suffix(v[i : v_split], w[j : w_split], delta)
        i_star = weights.index(max(weights))
        j_split = (int)(j+((j_prime - j)/2))
        left = hirschberg(v, w, delta, i, j, i_star, j_split)
        right = hirschberg(v, w, delta, i_star, j_split, i_prime, j_prime)
        return [(i_star, j_split), left, right]
        

In [210]:
keys = ['A', 'C', 'T', 'G', '-']
delta = {}
for i in range(len(keys)):
    delta[keys[i]] = {k : v for (k,v) in zip(keys, [1 if keys[i] == keys[j]  else -1 for j in range(len(keys))])}
    
hirschberg("TAGATA", "GTAGGCTTAAGGTTA", delta, 0, 0, 6, 15)

3 7
[[None, None, None, None, None, None, 0, 0], [None, None, None, None, None, None, -1, 1], [None, None, None, None, None, None, -1, 0], [None, None, None, None, None, None, 1, 0]]
[[2, 3, None, None, None, None, None, None], [0, 1, None, None, None, None, None, None], [-1, -1, None, None, None, None, None, None], [0, 0, None, None, None, None, None, None]]
2 3
[[None, None, 0, 0], [None, None, 1, 0], [None, None, 0, 2]]
[[1, 0, None, None], [-1, 0, None, None], [0, 0, None, None]]
1 1
[[0, 0], [-1, -1]]
[[-1, -1], [0, 0]]
1 1
[[0], [-1]]
[[-1], [0]]
2 3
[[0]]
[[0]]
0 2
[[0]]
[[0]]
2 3
[[0], [-1], [-2]]
[[-2], [-1], [0]]
3 7
[[0]]
[[0]]
0 5
[[0]]
[[0]]
0 4
[[0]]
[[0]]
0 5
[[0]]
[[0]]
3 7
[[0], [-1], [-2], [-3]]
[[-3], [-2], [-1], [0]]
0 6
[[0]]
[[0]]
3 7
[[0], [-1], [-2], [-3]]
[[-3], [-2], [-1], [0]]
