In [2]:
import time
import tracemalloc
import csv

In [7]:
# Traceback moves, stored as (row offset, column offset) pairs that are added
# to the current (i, j) cell of the pointer table.
ORIGIN = (0, 0)      # no move; default value of every pointer cell
TOPLEFT = (-1, -1)   # diagonal step
LEFT = (0, -1)       # step one column back
UP = (-1, 0)         # step one row back

def traceback_global(v, w, pointers):
  """
  Rebuild an alignment of v and w by walking a pointer table backwards.

  The walk starts at pointers[len(v)][len(w)] and repeatedly adds the stored
  (row offset, column offset) to the current position until both indices are
  <= 0. Each step emits one alignment column:

    TOPLEFT -> v[i-1] over w[j-1]
    UP      -> v[i-1] over '-'
    LEFT    -> '-'    over w[j-1]

  Any other pointer value emits nothing but still moves by its offset, so a
  (0, 0) offset stored away from the origin would never terminate.

  :param: v         first sequence
  :param: w         second sequence
  :param: pointers  2-D table indexed [i][j] holding offset pairs
  :returns: the gapped form of v, a newline, then the gapped form of w
  """
  row, col = len(v), len(w)
  top_line, bottom_line = [], []

  while True:
    d_row, d_col = pointers[row][col]
    move = (d_row, d_col)

    if move == TOPLEFT:
      top_line.append(v[row - 1])
      bottom_line.append(w[col - 1])
    elif move == UP:
      top_line.append(v[row - 1])
      bottom_line.append('-')
    elif move == LEFT:
      top_line.append('-')
      bottom_line.append(w[col - 1])

    row += d_row
    col += d_col
    if not (row > 0 or col > 0):
      break

  # Columns were collected from the end of the alignment towards its start.
  top_line.reverse()
  bottom_line.reverse()
  return '\n'.join((''.join(top_line), ''.join(bottom_line)))



def global_align(v, w, delta):
  """
  Computes a maximum scoring global alignment of the strings v and w and
  profiles the computation (wall-clock time and tracemalloc memory usage).

  A (len(v)+1) x (len(w)+1) score table is filled with the recurrence
  max(left + gap, diagonal + match/mismatch, up + gap); the chosen move of
  every cell is recorded and handed to traceback_global to rebuild the
  alignment. On ties the move is picked in the order LEFT, UP, TOPLEFT.

  The timing and memory figures are also printed to stdout.

  :param: v      first sequence; every character must be a key of delta
  :param: w      second sequence; every character must be a key of delta["-"]
  :param: delta  nested scoring dict, delta[a][b] is the score of aligning
                 a with b, where "-" denotes a gap
  :returns: tuple (alignment, score, elapsed_time, current, peak) where
            alignment is the string built by traceback_global, score is the
            value of the bottom-right table cell, elapsed_time is in seconds
            and current / peak are the byte counts reported by
            tracemalloc.get_traced_memory()
  :raises KeyError: if a character of v or w is missing from delta
  """
  n_rows, n_cols = len(v) + 1, len(w) + 1

  # perf_counter is monotonic, unlike time.time, so the difference cannot be
  # distorted by system clock adjustments.
  start_time = time.perf_counter()
  tracemalloc.start()
  try:
    M = [[0] * n_cols for _ in range(n_rows)]
    pointers = [[ORIGIN] * n_cols for _ in range(n_rows)]

    # First column: v[:i] aligned against gaps only.
    for i in range(1, n_rows):
      M[i][0] = M[i - 1][0] + delta[v[i - 1]]["-"]
      pointers[i][0] = UP

    # First row: w[:j] aligned against gaps only.
    for j in range(1, n_cols):
      M[0][j] = M[0][j - 1] + delta["-"][w[j - 1]]
      pointers[0][j] = LEFT

    for i in range(1, n_rows):
      cur_v = v[i - 1]
      for j in range(1, n_cols):
        cur_w = w[j - 1]

        left_score = M[i][j - 1] + delta["-"][cur_w]
        top_score = M[i - 1][j] + delta[cur_v]["-"]
        diag_score = M[i - 1][j - 1] + delta[cur_v][cur_w]

        best = max(left_score, diag_score, top_score)
        M[i][j] = best

        # Tie-break order LEFT > UP > TOPLEFT.
        if best == left_score:
          pointers[i][j] = LEFT
        elif best == top_score:
          pointers[i][j] = UP
        else:
          pointers[i][j] = TOPLEFT

    score = M[len(v)][len(w)]
    alignment = traceback_global(v, w, pointers)

    end_time = time.perf_counter()
    current, peak = tracemalloc.get_traced_memory()
  finally:
    # Always switch tracing off again, even if scoring raised (for example a
    # KeyError for a symbol missing from delta); otherwise tracing would stay
    # enabled for the rest of the session.
    tracemalloc.stop()

  elapsed_time = end_time - start_time

  print(f"Time taken: {elapsed_time:.4f} seconds")
  print(f"Current memory usage: {current / 10**6:.2f} MB")
  print(f"Peak memory usage: {peak / 10**6:.2f} MB")

  return alignment, score, elapsed_time, current, peak

In [8]:
# Unit scoring over the four nucleotides plus the gap symbol:
# +1 when both symbols are identical, -1 for every other pair.
keys = ['A', 'C', 'T', 'G', '-']
delta = {
    row_symbol: {col_symbol: (1 if row_symbol == col_symbol else -1) for col_symbol in keys}
    for row_symbol in keys
}

global_align("TAGATA", "GTAGGCTTAAGGTTA", delta)
print("keys")

Time taken: 0.0003 seconds
Current memory usage: 0.01 MB
Peak memory usage: 0.01 MB
keys


In [12]:
def test_global_align(csv_file, delta):
    # Load the sequences from the CSV file
    sequences = []
    with open(csv_file, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            sequences.append((row['sequence1'], row['sequence2']))
    
    # Test on the first 10 rows
    for idx, (seq1, seq2) in enumerate(sequences[:10]):
        writer = csv.writer('alignment_results.csv')
        # Write header row
        writer.writerow(['Sequences', 'Score', 'Elapsed Time (s)', 'Current Memory (MB)', 'Peak Memory (MB)'])
        
        print(f"Test {idx + 1}:")
        aligned_seq, score, elapsed_time, current_mem, peak_mem = global_align(seq1, seq2, delta)

        current_memory = current_mem / 10**6  # Convert bytes to MB
        peak_memory = peak_mem / 10**6  # Convert bytes to MB

        print(f"Aligned Seq: {aligned_seq}")
        print(f"Score: {score}")
        print()

        # Write to CSV
        writer.writerow([aligned_seq, score, elapsed_time, current_memory, peak_memory])


In [13]:
if __name__ == "__main__":
    # Scoring matrix (delta): +1 for two identical symbols, -1 for any other
    # pair, over the four nucleotides and the gap symbol.
    keys = ['A', 'C', 'T', 'G', '-']
    delta = {
        row_symbol: {col_symbol: (1 if row_symbol == col_symbol else -1) for col_symbol in keys}
        for row_symbol in keys
    }

    # Replace with the path to your CSV file
    csv_file = "sequence_pairs.csv"
    test_global_align(csv_file, delta)

Test 1:
Time taken: 0.0099 seconds
Current memory usage: 0.17 MB
Peak memory usage: 0.18 MB
Aligned Seq: AAGAAAG-TTAGAT-ATCAG-ATGTGCTG-TT-AA-AGTGTTTGGTACGGGAAAATGTATAGGCGAGCTAGCCT--GCATAA
AA-AAAGGTTAGATGAGAAGCA-GTAATCCTTGAACA-TGATAGGAAA---AAAA-GAA-ATGA-A-CT-GACTCAGAAAAA
Score: 14

Test 2:
Time taken: 0.0104 seconds
Current memory usage: 0.22 MB
Peak memory usage: 0.22 MB
Aligned Seq: AA-GAAAGTTAG--ATATCAGATGTGCTG---TTA-AAGTG-TT-T-GGTACGGGAA----AATGTATAGGCGA-GCTAGCCT-GCAT-AA
CGTGAAA-TTCGGGATA-CAGA-GT-CAGGCATTCGAAG-GATTATTGGTAGGGGGGCGCTAA-G-ATAAGCGATGAT-G--TAGTTTTAA
Score: 13

Test 3:
Time taken: 0.0083 seconds
Current memory usage: 0.20 MB
Peak memory usage: 0.20 MB
Aligned Seq: AAGAAAGTTA-GATA-TCA-G-ATGTG--C-TGTTAA--AG-TGTTTG-GTACGGGA-AAATG--TA--TA---G-GCGAGCTAGCCTGCATAA
--GAAA-TTATG-TGGTAAAGTAAGCGAACCTGTCACGGAGATGAT-GAGAA-GG-ATAAA-GAATAAATAAAAGTG-G-G--AGC--G-AGA-
Score: 4

Test 4:
Time taken: 0.0097 seconds
Current memory usage: 0.25 MB
Peak memory usage: 0.25 MB
Aligned Seq: --AA----