In [1]:
#%load_ext memory_profiler #not required, just to check how much memory the program uses

In [2]:
def hamming_distance(str1, str2) -> int:
    if len(str1) != len(str2):
        raise ValueError("Input strings must have the same length")

    distance = 0
    str1 = str1.upper()
    str2 = str2.upper()
    for i in range(len(str1)):
        if str1[i] == 'N': # character N does not match with anything
            distance += 1
        elif str1[i] != str2[i]:
            distance += 1
    return distance


In [3]:
from Bio import SeqIO
import logging
import math
import os
import random
from typing import List, Tuple, Union

def synthesize_sequence(
    s_length: int,
    n_repeats: int,
    repeat_length: Union[int, Tuple[int, int]],
    repeat_coverage: float,
    repeat_noise: int,
    log: str = None
) -> List[str]:
    '''
    Generate a randomized nucleotide sequence with the given parameters.

    Arguments:
        s_length (int): Length of the sequence to be generated.
        n_repeats (int): Number of unique repeats. These repeats will be randomly
            scattered around the generated sequence.
        repeat_length (int or tuple[int, int]): Value, or min and max values for the length of the repeats.
        If a tuple if provided, the repeats will be randomized within this range.
        repeat_coverage ([0, 1]): The proportion of the finals sequence that should be populated
            with repeats. Note that this is a lower bound: The randomized regions may have repeats
            by chance.
        repeat_noise (int): Maximum Hamming distance to randomize repeats with. Once a repeat is
            decided, the sequence will be populated with its variants that are at most this distance
            away from that repeat.
        log (str): If provided, will output a log of the process to the provided path.
    '''
    print("Creating string",s_length,n_repeats,repeat_length,repeat_coverage,repeat_noise,log)
    verbose = False
    if log is not None:
        verbose = True
        f = open(log, 'w')
        f.write('Generating baits with following arguments:\n')
        f.write('L = {}\n'.format(s_length))
        f.write('RN (repeat number) = {}\n'.format(n_repeats))
        f.write('RL (repeat length) = {}\n'.format(repeat_length))
        f.write('RC (repeat coverage) = {}\n'.format(repeat_coverage))
        f.write('RE (repeat error) = {}\n'.format(repeat_noise))

    BASES = ['A', 'G', 'T', 'C']
    vec = [None] * s_length
    total_populated = 0
    repeats_pools = {}
    if verbose:
        repeats_locations = {}
    for i in range(n_repeats):
        l = repeat_length
        if type(repeat_length) is tuple:
            l = random.randint(repeat_length[0], repeat_length[1])
        repeat = ''.join(random.choices(BASES, k = l))
        while repeat in repeats_pools.keys():
            repeat = ''.join(random.choices(BASES, k = l))
        repeats_pools[repeat] = set(range(s_length - l + 1))
        if verbose:
            repeats_locations[repeat] = []
    repeat_coverage = repeat_coverage * s_length
    repeats = list(repeats_pools.keys())
    repeats_ctr = 0
    while total_populated < repeat_coverage:
        repeat = repeats[repeats_ctr]
        i = -1
        if repeat_coverage == s_length:
            i = total_populated # if entire sequence is repeats, fill the next index
        elif len(repeats_pools[repeat]) == 0:
            if verbose:
                f.write('WARNING: {} has no more indices it can be planted in.\n'.format(repeat))
            repeats.remove(repeat)
            n_repeats -= 1
            repeats_ctr = repeats_ctr % n_repeats
            continue
        else:
            i = random.choice(list(repeats_pools[repeat]))
        if i + len(repeat) > s_length:
            if repeat_coverage == s_length:
                vec.extend([None] * (i + len(repeat) - s_length)) # extend vector to accomodate this plant
                if verbose:
                    f.write('Extended sequence to {} base pairs to add final repeat.'.format(len(vec)))
            else:
                continue
        for j in range(len(repeat)):
            if vec[i + j] is not None:
                raise Exception('this shouldnt happen')
        if verbose:
            repeats_locations[repeat].append(i)
        n_modifs = random.randint(0, repeat_noise)
        modif_locs = random.choices(range(len(repeat)), k = n_modifs)
        for j in range(len(repeat)):
            if j in modif_locs:
                vec[i + j] = random.choice([x for x in BASES if x != repeat[j]])
            else:
                vec[i + j] = repeat[j]
        for other in repeats:
            for j in range(i - len(other) + 1, i + len(repeat)):
                if j in repeats_pools[other]:
                    repeats_pools[other].discard(j)
        repeats_ctr = (repeats_ctr + 1) % n_repeats
        total_populated = len(vec) - vec.count(None)
    if verbose:
        f.write('Total covered base pairs: {}\n'.format(total_populated))
        for k, v in repeats_locations.items():
            if len(v) == 0 or len(v) == 1:
                print('WARNING: Some repeats were planted < 2 times')
                f.write('WARNING: Some repeats were planted < 2 times\n')
                break
        f.write('Repeats and the locations they were planted in:\n')
        for k, v in repeats_locations.items():
            f.write(str(k) +  ': ' + str(v) + '\n')
    for i in range(s_length):
        if vec[i] is None:
            vec[i] = random.choice(BASES)
    return ''.join(vec)

In [4]:
import subprocess
import time

In [5]:
from typing import List, Union, Tuple
import numpy as np
def initialize_ignore_vector(
  seqlens: List[int],
  l: int,
) -> List[bool]:
  '''
  Given a list of sequence lengths, create an ignore vector where the indices that would
  cover concatenation spots are marked. Concatenation spots cannot be aligned to with baits,
  so they never need to be checked for alignments and can be ignored.
  '''
  ignore = [False] * sum(seqlens)
  seqlen_sum = 0
  for seqlen in seqlens:
    seqlen_sum += seqlen
    for i in range(seqlen_sum - l + 1, seqlen_sum):
      ignore[i] = True
  return ignore

def update_ignore_vector(
  cov: List[bool],
  seqlens: List[int],
  ignore: List[bool],
  region: Tuple[int, int],
  l: int
) -> None:
  '''
  Given a coverage vector, an ignore vector, and a covered region, find the indices that
  no longer need to be checked for alignments and update the ignore vector accordingly. An
  index can be ignored if the index and the l - 1 indices following it are already covered,
  provided that none of those indices are concatenation spots.
  '''
  total_seqlen = len(cov)
  current_seqstart = 0
  current_seqend = 0
  seqlens_ctr = -1

  check_start = region[0] - l + 1
  if check_start < 0:
    check_start = 0
  check_end = region[1] + l
  if check_end > total_seqlen:
    check_end = total_seqlen

  streak = 0 # how many indices in a row are already covered.
  for i in range(check_start, check_end):
    while i >= current_seqend:
      current_seqstart = current_seqend
      seqlens_ctr += 1
      current_seqend += seqlens[seqlens_ctr]
      streak = 0 # if we cross a concatenation spot, streak is back to zero
    if cov[i]:
      streak += 1
    else:
      streak = 0
    if streak >= l:
      ignore[i - l + 1] = True

def calculate_seqlens(seqs: List[str]) -> List[int]:
  return [len(seq) for seq in seqs]

def calculate_coverage(subs, l) -> List[Tuple[int, int]]:
  '''
  Given a list of substring starting indices and substring length, return an actual list of starting and ending indices for the coverage of these substrings.
  Example: suppose we have subs = [5, 10, 15, 45], l = 10. These cover from [5, 25] and [45, 55]
  '''
  if len(subs) == 0:
    return []
  subs = sorted(subs)
  results = []
  curr_start = subs[0]
  curr_end = subs[0] + l
  for sub in subs[1:]:
    if sub <= curr_end: # if this index is covered by the last started interval, extend the interval
      curr_end = sub + l
    else: # otherwise, end last one and start new one
      results.append((curr_start, curr_end))
      curr_start = sub
      curr_end = sub + l
  results.append((curr_start, curr_end)) # end last interval
  return results

def naive_alignment(
  bait: str,
  s_storage: np.ndarray,
  d: int,
  ignore: List[bool]
) -> List[int]:
  bait = np.array(list(bait))
  l = len(bait)
  result = []
  for i in range(len(s_storage)):
    if ignore[i]:
      continue
    distance = l - (s_storage[i, :] == bait).sum()
    if distance <= d:
      result.append(i)
  return result

def verify_baits(
  baits: List[str],
  s: Union[str, List[str]],
  d: int,
  log: str = None,
):
  if log is not None:
    verbose = True
    f = open(log, 'w')
    f.write('Verifying baits with provided arguments:\n')
    f.write('d (mismatch allowance) = {}\n'.format(d))
    f.write('--------\n')
  else:
    verbose = False

  if isinstance(s, list):
    seqlens = calculate_seqlens(s)
    s = ''.join(s)
  else:
    seqlens = [len(s)]

  length = len(s)
  cov = [False] * length
  l = len(baits[0])
  ignore = initialize_ignore_vector(seqlens, l)
  if verbose:
    f.write('Initialized integer array and ignore vector with length {}\n'.format(length))

  ids = ['bait#{}'.format(str(i)) for i in range(len(baits))]

  s_storage = np.empty((length - l + 1, l), dtype = str)
  s = np.array(list(s))
  for i in range(length - l + 1):
    s_storage[i, :] = s[i: i + l]
  for id, bait in zip(ids, baits):
    if verbose:
      f.write('Aligning bait {}.\n'.format(id))
    matches = naive_alignment(bait, s_storage, d, ignore)
    coverages = calculate_coverage(matches, l)
    if verbose:
      f.write('Bait covers between:\n {}\n'.format(coverages))
    for c in coverages:
      for j in range(c[0], c[1]):
        cov[j] = True
  if verbose:
    f.write('--------\n')
    f.write('Remaining uncovered indices: {}.\n'.format(cov.count(False)))
    f.write(str([i for i in range(len(cov)) if not cov[i]]))
    f.close()
  return [i for i in range(len(cov)) if not cov[i]]
import random
def similarity_index(
  s: Union[str, List[str]],
  d: int,
  trials: int,
):
  if isinstance(s, list):
    seqlens = calculate_seqlens(s)
    s = ''.join(s)
  else:
    seqlens = [len(s)]

  length = len(s)
  cov = [False] * length
  l=120
  ignore = initialize_ignore_vector(seqlens, l)

  s_storage = np.empty((length - l + 1, l), dtype = str)
  s = np.array(list(s))
  for i in range(length - l + 1):
    s_storage[i, :] = s[i: i + l]
  trial_counter=trials
  similarities=0
  while trial_counter > 0:
    trial_counter-=1;
    random_number1=random.randint(0,len(ignore)-1)#inclusive for both ends
    random_number2=random.randint(0,len(ignore)-1)
    if random_number1==random_number2:
      trial_counter+=1
      continue
    if ignore[random_number1]==True or ignore[random_number2]==True:
      trial_counter+=1
      continue
    distance = l - (s_storage[random_number1, :] == s_storage[random_number2, :]).sum()
    if distance <= d:
      similarities+=1
  return similarities/trials

In [6]:
def diagnosis(maxRadius,trials,inputFile):
    start_time=time.time()
    with open(inputFile, 'r') as file:
        original = [line.strip() for line in file.readlines()]
    print("Similarity Index:",similarity_index(original,maxRadius,trials))
    end_time = time.time()
    duration = end_time - start_time
    print(f"Diagnosis time for {inputFile}: {duration} seconds")
    return

def run(windowLength,maxRadius,lenientRadius,lenientRadius2,overlapCount,bypassHyperparameter,searchBreadths,inputFile,outputFile,verification):
    #NOTE: FILE PATHS MUST NOT HAVE SPACES
    inp=str(windowLength)+"\n"+str(maxRadius)+"\n"+str(lenientRadius)+"\n"+str(lenientRadius2)+"\n"+str(overlapCount)+"\n"+str(bypassHyperparameter)+"\n"
    inp+=str(len(searchBreadths))+"\n"
    for sb in searchBreadths:
        inp+=str(sb)+"\n"
    inp+=inputFile+"\n"
    inp+=outputFile
    cleaned_string = inp.replace("\n", " ")
    print(cleaned_string)

    start_time = time.time()
    os.chmod(inputFile, 0o777)
    process=subprocess.Popen("./MultithreadedGenerativeSearchV4WithInput.exe",
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         text=True)
    stdout, stderr = process.communicate(input=inp)

    end_time = time.time()
    duration = end_time - start_time
    print(f"Execution time for {inputFile}: {duration} seconds")
    rt=duration
    print("Errors (should be blank):", stderr)
    #print("inputFile: ",inputFile)
    with open(inputFile, 'r') as file:
        original = [line.strip() for line in file.readlines()]
    with open(outputFile, 'r') as file:
        baits = [line.strip() for line in file.readlines()]
    #baits=baits[0:-1] to test true negative aka a bait list that doesnt cover
    print("Number of baits:", len(baits))
    if verification:
        start_time = time.time()
        missing_spots=verify_baits(baits,original,40)
        if len(missing_spots)==0:
            print("\033[92m", len(missing_spots), "spots uncovered\033[0m")          
        else:
            print("\033[91m", len(missing_spots), "spots uncovered\033[0m") 
        end_time = time.time()
        duration = end_time - start_time
        print(f"Verification time for {inputFile}: {duration} seconds")
    return [len(baits),rt]

In [7]:
def tune_params(time,windowLength,maxRadius,inputFile,outputFile,verification):
    #verifcation only applies to final run, not test
    lenientRadius=80
    lenientRadius2=80
    #overlapCount=[1,2,3,4,5,6,8,10,12,15,20,24,30,40,60,90,120]
    #overlapCount=[1,3,6,10,15,24,40,60]
    overlapCount=[1,6,15,40]
    #overlapCount=[1,2,3,4,5,6,8,10,12,15,20,24,30,40,60,90,120]
    bypassHyperparameter=40
    searchBreadths=[[-1,2,3,4],[-1,3,4,5]] #[3,4,5,2] (best for actual clustering) vs [10,10] (best for the overlap count)
    if time<0:
        best_oc=0
        oc_bait=999999999999999999999999999999999999999999999999999
        for oc in range(len(overlapCount)):
            results=run(windowLength,maxRadius,lenientRadius,lenientRadius2,overlapCount[oc],bypassHyperparameter,searchBreadths[0],inputFile,outputFile,False)
            if results[0]<oc_bait:
                best_oc=oc
                oc_bait=results[0]
            else:
                break
        print("best params",windowLength,maxRadius,lenientRadius,lenientRadius2,overlapCount[best_oc],bypassHyperparameter,searchBreadths[1],inputFile,outputFile,verification)
        print("final run")
        return [run(windowLength,maxRadius,lenientRadius,lenientRadius2,overlapCount[best_oc],bypassHyperparameter,searchBreadths[1],inputFile,outputFile,verification),[windowLength,maxRadius,lenientRadius,lenientRadius2,overlapCount[oc],bypassHyperparameter,searchBreadths[1],inputFile,outputFile,verification]]

In [8]:
synthetic100="synthetic_120_240_nospace\RL=120_240\L=250000\RN=62\RC=1\L250000_RL(120,240)_RC100_RE40_no0.fasta"
synthetic0="synthetic_120_240_nospace\RL=120_240\L=250000\RN=62\RC=0\L250000_RL(120,240)_RC0_RE40_no0.fasta"

synthetic100

In [10]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
#100% seed planted
diagnosis(40,1000000,synthetic100)

Similarity Index: 9.3e-05
Diagnosis time for synthetic_120_240_nospace\RL=120_240\L=250000\RN=62\RC=1\L250000_RL(120,240)_RC100_RE40_no0.fasta: 6.425724506378174 seconds


In [98]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
#100% seed planted
diagnosis(60,1000000,synthetic100)

Similarity Index: 8.6e-05
Diagnosis time for synthetic_120_240_nospace\RL=120_240\L=250000\RN=62\RC=1\L250000_RL(120,240)_RC100_RE40_no0.fasta: 13.936027526855469 seconds


In [93]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
#100% seed planted
diagnosis(70,1000000,synthetic100)

Similarity Index: 0.000131
Diagnosis time for synthetic_120_240_nospace\RL=120_240\L=250000\RN=62\RC=1\L250000_RL(120,240)_RC100_RE40_no0.fasta: 14.02512526512146 seconds


In [None]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
#100% seed planted
diagnosis(80,1000000,synthetic100)

synthetic0

In [None]:
with open(synthetic0, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
#all random 
diagnosis(40,1000000,synthetic0)

In [None]:
with open(synthetic0, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
#all random 
diagnosis(60,1000000,synthetic0)

In [None]:
with open(synthetic0, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
#100% seed planted
diagnosis(70,1000000,synthetic0)

In [10]:
with open(synthetic0, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
#all random 
run(120, 40, 80, 80, 120, 40, [5, 5, 5, 2], "input.txt", "output.txt", True)

120 40 80 80 120 40 4 5 5 5 2 input.txt output.txt


KeyboardInterrupt: 

In [9]:
import psutil
print("{} GB".format(psutil.virtual_memory().total / (1024.0 ** 3)))

13.855525970458984 GB


In [20]:
with open("synthetic_120_240_nospace\RL=120_240\L=250000\RN=62\RC=1\L250000_RL(120,240)_RC100_RE40_no4.fasta", 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 1, 40, [5, 5, 5, 2], "input.txt", "output.txt", False)

120 40 80 80 1 40 4 5 5 5 2 input.txt output.txt
Execution time for input.txt: 8.683635711669922 seconds
Errors (should be blank): 
Number of baits: 964


[964, 8.683635711669922]

In [None]:
%memit

In [11]:
with open("synthetic_120_240_nospace\RL=120_240\L=250000\RN=62\RC=1\L250000_RL(120,240)_RC100_RE40_no4.fasta", 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 10, 40, [5, 5, 5, 2], "input.txt", "output.txt", False)

120 40 80 80 10 40 4 5 5 5 2 input.txt output.txt


KeyboardInterrupt: 

In [25]:
%memit

peak memory: 224.59 MiB, increment: 0.00 MiB


In [2]:
#test
with open("synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=166/RC=0.5/L500000_RL(120,240)_RC50_RE40_no1.fasta", 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 1, 40, [5, 5, 5, 2], "input.txt", "output.txt", False)

NameError: name 'run' is not defined

In [28]:
%memit

peak memory: 104.73 MiB, increment: 0.01 MiB


In [17]:
path='synthetic_120_240_nospace_tuned'
for item in sorted(os.listdir(path)):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    for item2 in sorted(os.listdir(path2)):
        path3=path2+'/'+item2
        print(path3)
        if(item2!="L=500000"):
            continue
        for item3 in sorted(os.listdir(path3)):
            path4=path3+'/'+item3
            print(path4)
            for item4 in sorted(os.listdir(path4)):
                path5=path4+'/'+item4
                print(path5)
                bait_total=0
                time_total=0
                iter=0
                for item5 in sorted(os.listdir(path5)):
                    path6=path5+'/'+item5
                    print(path6)
                    base, extension=os.path.splitext(item5)
                    if extension=='.fasta':
                        print("File:",item5)
                        with open(path6, 'r') as file:
                            original = [line.strip() for line in file.readlines()]
                        with open("input.txt", 'w') as file:
                            file.write(original[1])
                        #op=tune_params(-1,120,40,"input.txt","output.txt",False)
                        #bait_total+=op[0][0]
                        #time_total+=op[0][1]
                        if item2=='L=500000' and iter==0:
                            op=run(120, 40, 80, 80, 1, 40, [10, 10, 10, 2], "input.txt", "output.txt", True)                       
                            bait_total+=op[0]
                            time_total+=op[1]
                        else:
                            op=run(120, 40, 80, 80, 1, 40, [10, 10, 10, 2], "input.txt", "output.txt", False)                       
                            bait_total+=op[0]
                            time_total+=op[1]                            
                        iter+=1
                        #run(120, 40, 80, 80, 1, 40, [5, 5, 5, 2], "input.txt", "output.txt", True)
                print("avg bait",bait_total/iter)
                print("avg time",time_total/iter)

synthetic_120_240_nospace_tuned/RL=120_240
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000
synthetic_120_240_nospace_tuned/RL=120_240/L=2000000
synthetic_120_240_nospace_tuned/RL=120_240/L=250000
synthetic_120_240_nospace_tuned/RL=120_240/L=500000
synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=125
synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=125/RC=0.5
synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=125/RC=0.5/L500000_RL(120,240)_RC50_RE40_no0.fasta
File: L500000_RL(120,240)_RC50_RE40_no0.fasta
120 40 80 80 1 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 13.437566995620728 seconds
Errors (should be blank): 
Number of baits: 2057
[92m 0 spots uncovered[0m
Verification time for input.txt: 3918.012942314148 seconds
synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=125/RC=0.5/L500000_RL(120,240)_RC50_RE40_no0.log
synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=125/RC=0.5/L500000_RL(120,240)_RC50_RE40_no1.fasta
File: L500000_R

In [11]:
path='synthetic_120_240_nospace_tuned'
for item in sorted(os.listdir(path)):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    for item2 in sorted(os.listdir(path2)):
        path3=path2+'/'+item2
        print(path3)
        # if(item2!="L=500000"):
        #     continue
        for item3 in sorted(os.listdir(path3)):
            path4=path3+'/'+item3
            print(path4)
            for item4 in sorted(os.listdir(path4)):
                path5=path4+'/'+item4
                print(path5)
                bait_total=0
                time_total=0
                iter=0
                for item5 in sorted(os.listdir(path5)):
                    path6=path5+'/'+item5
                    print(path6)
                    base, extension=os.path.splitext(item5)
                    if extension=='.fasta':
                        print("File:",item5)
                        with open(path6, 'r') as file:
                            original = [line.strip() for line in file.readlines()]
                        with open("input.txt", 'w') as file:
                            file.write(original[1])
                        #op=tune_params(-1,120,40,"input.txt","output.txt",False)
                        #bait_total+=op[0][0]
                        #time_total+=op[0][1]
                        op=run(120, 40, 80, 80, 5, 40, [10, 10, 10, 2], "input.txt", "output.txt", (path6=='synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=166/RC=0.5/L500000_RL(120,240)_RC50_RE40_no1.fasta'))                       
                        bait_total+=op[0]
                        time_total+=op[1]
                     
                        iter+=1
                        if op[1]>1000:
                            break
                        #run(120, 40, 80, 80, 1, 40, [5, 5, 5, 2], "input.txt", "output.txt", True)
                print("avg bait",bait_total/iter)
                print("avg time",time_total/iter)

synthetic_120_240_nospace_tuned/RL=120_240
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.fasta
File: L1000000_RL(120,240)_RC50_RE40_no0.fasta
120 40 80 80 5 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 209.38311195373535 seconds
Errors (should be blank): 
Number of baits: 3491
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.log
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no1.fasta
File: L1000000_RL(120,240)_RC50_RE40_no1.fasta
120 40 80 80 5 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 209.486389875412 seconds
Errors (should be blank): 
Number of baits: 3496
synthetic_120_240_nospace_tuned/RL=120_240/L=10

In [12]:
path='synthetic_120_240_nospace_tuned'
for item in sorted(os.listdir(path)):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    for item2 in sorted(os.listdir(path2)):
        path3=path2+'/'+item2
        print(path3)
        # if(item2!="L=500000"):
        #     continue
        for item3 in sorted(os.listdir(path3)):
            path4=path3+'/'+item3
            print(path4)
            for item4 in sorted(os.listdir(path4)):
                path5=path4+'/'+item4
                print(path5)
                bait_total=0
                time_total=0
                iter=0
                for item5 in sorted(os.listdir(path5)):
                    path6=path5+'/'+item5
                    print(path6)
                    base, extension=os.path.splitext(item5)
                    if extension=='.fasta':
                        print("File:",item5)
                        with open(path6, 'r') as file:
                            original = [line.strip() for line in file.readlines()]
                        with open("input.txt", 'w') as file:
                            file.write(original[1])
                        #op=tune_params(-1,120,40,"input.txt","output.txt",False)
                        #bait_total+=op[0][0]
                        #time_total+=op[0][1]
                        op=run(120, 40, 80, 80, 10, 40, [10, 10, 10, 2], "input.txt", "output.txt", (path6=='synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=166/RC=0.5/L500000_RL(120,240)_RC50_RE40_no1.fasta'))                       
                        bait_total+=op[0]
                        time_total+=op[1]
                     
                        iter+=1
                        if op[1]>1000:
                            break
                        #run(120, 40, 80, 80, 1, 40, [5, 5, 5, 2], "input.txt", "output.txt", True)
                print("avg bait",bait_total/iter)
                print("avg time",time_total/iter)

synthetic_120_240_nospace_tuned/RL=120_240
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.fasta
File: L1000000_RL(120,240)_RC50_RE40_no0.fasta
120 40 80 80 10 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 798.1234881877899 seconds
Errors (should be blank): 
Number of baits: 3219
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.log
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no1.fasta
File: L1000000_RL(120,240)_RC50_RE40_no1.fasta
120 40 80 80 10 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 819.4956059455872 seconds
Errors (should be blank): 
Number of baits: 3227
synthetic_120_240_nospace_tuned/RL=120_240/L=

In [13]:
path='synthetic_120_240_nospace_tuned'
for item in sorted(os.listdir(path)):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    for item2 in sorted(os.listdir(path2)):
        path3=path2+'/'+item2
        print(path3)
        # if(item2!="L=500000"):
        #     continue
        for item3 in sorted(os.listdir(path3)):
            path4=path3+'/'+item3
            print(path4)
            for item4 in sorted(os.listdir(path4)):
                path5=path4+'/'+item4
                print(path5)
                bait_total=0
                time_total=0
                iter=0
                for item5 in sorted(os.listdir(path5)):
                    path6=path5+'/'+item5
                    print(path6)
                    base, extension=os.path.splitext(item5)
                    if extension=='.fasta':
                        print("File:",item5)
                        with open(path6, 'r') as file:
                            original = [line.strip() for line in file.readlines()]
                        with open("input.txt", 'w') as file:
                            file.write(original[1])
                        #op=tune_params(-1,120,40,"input.txt","output.txt",False)
                        #bait_total+=op[0][0]
                        #time_total+=op[0][1]
                        op=run(120, 40, 80, 80, 10, 40, [10, 10, 10, 2], "input.txt", "output.txt", (path6=='synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=166/RC=0.5/L500000_RL(120,240)_RC50_RE40_no0.fasta'))                       
                        bait_total+=op[0]
                        time_total+=op[1]
                     
                        iter+=1
                        if op[1]>1000:
                            break
                        #run(120, 40, 80, 80, 20, 40, [5, 5, 5, 2], "input.txt", "output.txt", True)
                print("avg bait",bait_total/iter)
                print("avg time",time_total/iter)

synthetic_120_240_nospace_tuned/RL=120_240
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.fasta
File: L1000000_RL(120,240)_RC50_RE40_no0.fasta
120 40 80 80 10 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 677.7933855056763 seconds
Errors (should be blank): 
Number of baits: 3216
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.log
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no1.fasta
File: L1000000_RL(120,240)_RC50_RE40_no1.fasta
120 40 80 80 10 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 709.6273775100708 seconds
Errors (should be blank): 
Number of baits: 3233
synthetic_120_240_nospace_tuned/RL=120_240/L=

In [9]:
path='synthetic_120_240_nospace_tuned'
for item in sorted(os.listdir(path)):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    for item2 in sorted(os.listdir(path2)):
        path3=path2+'/'+item2
        print(path3)
        # if(item2!="L=500000"):
        #     continue
        for item3 in sorted(os.listdir(path3)):
            path4=path3+'/'+item3
            print(path4)
            for item4 in sorted(os.listdir(path4)):
                path5=path4+'/'+item4
                print(path5)
                bait_total=0
                time_total=0
                iter=0
                for item5 in sorted(os.listdir(path5)):
                    path6=path5+'/'+item5
                    print(path6)
                    base, extension=os.path.splitext(item5)
                    if extension=='.fasta':
                        print("File:",item5)
                        with open(path6, 'r') as file:
                            original = [line.strip() for line in file.readlines()]
                        with open("input.txt", 'w') as file:
                            file.write(original[1])
                        #op=tune_params(-1,120,40,"input.txt","output.txt",False)
                        #bait_total+=op[0][0]
                        #time_total+=op[0][1]
                        op=run(120, 40, 80, 80, 20, 40, [10, 10, 10, 2], "input.txt", "output.txt", (path6=='synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=166/RC=0.5/L500000_RL(120,240)_RC50_RE40_no0.fasta'))                       
                        bait_total+=op[0]
                        time_total+=op[1]
                     
                        iter+=1
                        if op[1]>1000:
                            break
                        #run(120, 40, 80, 80, 20, 40, [5, 5, 5, 2], "input.txt", "output.txt", True)
                print("avg bait",bait_total/iter)
                print("avg time",time_total/iter)

synthetic_120_240_nospace_tuned/RL=120_240
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.fasta
File: L1000000_RL(120,240)_RC50_RE40_no0.fasta
120 40 80 80 20 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 1799.2306988239288 seconds
Errors (should be blank): 
Number of baits: 3154
avg bait 3154.0
avg time 1799.2306988239288
synthetic_120_240_nospace_tuned/RL=120_240/L=2000000
synthetic_120_240_nospace_tuned/RL=120_240/L=2000000/RN=666
synthetic_120_240_nospace_tuned/RL=120_240/L=2000000/RN=666/RC=0.5
synthetic_120_240_nospace_tuned/RL=120_240/L=2000000/RN=666/RC=0.5/L2000000_RL(120,240)_RC50_RE40_no0.fasta
File: L2000000_RL(120,240)_RC50_RE40_no0.fasta
120 40 80 80 20 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 6

In [11]:
path='synthetic_120_240_nospace_tuned'
for item in sorted(os.listdir(path)):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    for item2 in sorted(os.listdir(path2)):
        path3=path2+'/'+item2
        print(path3)
        # if(item2!="L=500000"):
        #     continue
        for item3 in sorted(os.listdir(path3)):
            path4=path3+'/'+item3
            print(path4)
            for item4 in sorted(os.listdir(path4)):
                path5=path4+'/'+item4
                print(path5)
                bait_total=0
                time_total=0
                iter=0
                for item5 in sorted(os.listdir(path5)):
                    path6=path5+'/'+item5
                    print(path6)
                    base, extension=os.path.splitext(item5)
                    if extension=='.fasta':
                        print("File:",item5)
                        with open(path6, 'r') as file:
                            original = [line.strip() for line in file.readlines()]
                        with open("input.txt", 'w') as file:
                            file.write(original[1])
                        #op=tune_params(-1,120,40,"input.txt","output.txt",False)
                        #bait_total+=op[0][0]
                        #time_total+=op[0][1]
                        op=run(120, 40, 80, 80, 30, 40, [10, 10, 10, 2], "input.txt", "output.txt", (path6=='synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=166/RC=0.5/L500000_RL(120,240)_RC50_RE40_no0.fasta'))                       
                        bait_total+=op[0]
                        time_total+=op[1]
                     
                        iter+=1
                        if op[1]>1000:
                            brea0
                print("avg bait",bait_total/iter)
                print("avg time",time_total/iter)

synthetic_120_240_nospace_tuned/RL=120_240
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.fasta
File: L1000000_RL(120,240)_RC50_RE40_no0.fasta
120 40 80 80 30 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 5352.105159044266 seconds
Errors (should be blank): 
Number of baits: 3209
avg bait 3209.0
avg time 5352.105159044266
synthetic_120_240_nospace_tuned/RL=120_240/L=2000000
synthetic_120_240_nospace_tuned/RL=120_240/L=2000000/RN=666
synthetic_120_240_nospace_tuned/RL=120_240/L=2000000/RN=666/RC=0.5
synthetic_120_240_nospace_tuned/RL=120_240/L=2000000/RN=666/RC=0.5/L2000000_RL(120,240)_RC50_RE40_no0.fasta
File: L2000000_RL(120,240)_RC50_RE40_no0.fasta
120 40 80 80 30 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 265

In [None]:
with open("BaitCoveringTesting\synthetic_120_240_nospace\RL=120_240\L=1000000\RN=250\RC=0\L1000000_RL(120,240)_RC0_RE40_no0.fasta", 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 1, 40, [-1, 5, 5, 2], "input.txt", "output.txt", False)

In [10]:
with open(synthetic0, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 1, 40, [-1, 5, 5, 2], "input.txt", "output.txt", False)

120 40 80 80 1 40 4 -1 5 5 2 input.txt output.txt
Execution time for input.txt: 10.432271003723145 seconds
Errors (should be blank): 
Number of baits: 1061


[1061, 10.432271003723145]

In [None]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 120, 40, [5, 5, 5, 2], "input.txt", "output.txt", False)

120 40 80 80 120 40 4 5 5 5 2 input.txt output.txt


In [None]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 120, 40, [5, 5, 5, 2], "input.txt", "output.txt", False)

In [None]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 120, 40, [5, 5, 5, 2], "input.txt", "output.txt", False)

In [9]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 10, 40, [4, 4, 4, 2], "input.txt", "output.txt", False)

120 40 80 80 10 40 4 4 4 4 2 input.txt output.txt
Execution time for input.txt: 98.37438774108887 seconds
Errors (should be blank): 
Number of baits: 677


[677, 98.37438774108887]

In [111]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 10, 40, [10, 10, 10, 2], "input.txt", "output.txt", False)

120 40 80 80 10 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 67.46561574935913 seconds
Errors (should be blank): 
Number of baits: 656


[656, 67.46561574935913]

In [112]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 10, 40, [8, 8, 8, 2], "input.txt", "output.txt", False)

120 40 80 80 10 40 4 8 8 8 2 input.txt output.txt
Execution time for input.txt: 61.728360652923584 seconds
Errors (should be blank): 
Number of baits: 659


[659, 61.728360652923584]

In [120]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 10, 40, [3, 4, 5, 2], "input.txt", "output.txt", False)

120 40 80 80 10 40 4 3 4 5 2 input.txt output.txt
Execution time for input.txt: 50.76568078994751 seconds
Errors (should be blank): 
Number of baits: 669


[669, 50.76568078994751]

In [114]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 10, 40, [5, 4, 3, 2], "input.txt", "output.txt", False)

120 40 80 80 10 40 4 5 4 3 2 input.txt output.txt
Execution time for input.txt: 45.285422563552856 seconds
Errors (should be blank): 
Number of baits: 690


[690, 45.285422563552856]

In [115]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 20, 40, [5, 5, 5, 2], "input.txt", "output.txt", False)

120 40 80 80 20 40 4 5 5 5 2 input.txt output.txt
Execution time for input.txt: 74.68547320365906 seconds
Errors (should be blank): 
Number of baits: 214


[214, 74.68547320365906]

In [118]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 10, 40, [4, 3, 2], "input.txt", "output.txt", False)

120 40 80 80 10 40 3 4 3 2 input.txt output.txt
Execution time for input.txt: 59.17605209350586 seconds
Errors (should be blank): 
Number of baits: 962


[962, 59.17605209350586]

In [119]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
run(120, 40, 80, 80, 10, 40, [2, 3, 4], "input.txt", "output.txt", False)

120 40 80 80 10 40 3 2 3 4 input.txt output.txt
Execution time for input.txt: 46.22722148895264 seconds
Errors (should be blank): 
Number of baits: 689


[689, 46.22722148895264]

In [108]:
with open(synthetic100, 'r') as file:
    original = [line.strip() for line in file.readlines()]
with open("input.txt", 'w') as file:
    file.write(original[1])
iterations=30
total_bait=0
total_time=0
for i in range(iterations):
    op=run(120, 40, 80, 80, 10, 40, [4, 4, 4, 2], "input.txt", "output.txt", False)
    print(op)
    total_bait+=op[0]
    total_time+=op[1]
print("avg bait:",total_bait/30)
print("avg time:",total_time/30)
total_bait=0
total_time=0
for i in range(iterations):
    op=run(120, 40, 80, 80, 10, 40, [3, 4, 5, 2], "input.txt", "output.txt", False)
    total_bait+=op[0]
    total_time+=op[1]
print("avg bait:",total_bait/30)
print("avg time:",total_time/30)
total_bait=0
total_time=0
for i in range(iterations):
    op=run(120, 40, 80, 80, 10, 40, [5, 4, 3, 2], "input.txt", "output.txt", False)
    total_bait+=op[0]
    total_time+=op[1]
print("avg bait:",total_bait/30)
print("avg time:",total_time/30)
total_bait=0
total_time=0
for i in range(iterations):
    op=run(120, 40, 80, 80, 10, 40, [5, 5, 5, 2], "input.txt", "output.txt", False)
    total_bait+=op[0]
    total_time+=op[1]
print("avg bait:",total_bait/30)
print("avg time:",total_time/30)
total_bait=0
total_time=0
for i in range(iterations):
    op=run(120, 40, 80, 80, 10, 40, [4, 4, 4], "input.txt", "output.txt", False)
    total_bait+=op[0]
    total_time+=op[1]
print("avg bait:",total_bait/30)
print("avg time:",total_time/30)
run(120, 40, 80, 80, 10, 40, [10, 10, 10], "input.txt", "output.txt", False)

120 40 80 80 10 40 4 4 4 4 2 input.txt output.txt
Execution time for input.txt: 49.92495250701904 seconds
Errors (should be blank): 
Number of baits: 678
[678, 49.92495250701904]
120 40 80 80 10 40 4 4 4 4 2 input.txt output.txt
Execution time for input.txt: 52.58348059654236 seconds
Errors (should be blank): 
Number of baits: 677
[677, 52.58348059654236]
120 40 80 80 10 40 4 4 4 4 2 input.txt output.txt
Execution time for input.txt: 54.97350549697876 seconds
Errors (should be blank): 
Number of baits: 679
[679, 54.97350549697876]
120 40 80 80 10 40 4 4 4 4 2 input.txt output.txt
Execution time for input.txt: 52.396759271621704 seconds
Errors (should be blank): 
Number of baits: 678
[678, 52.396759271621704]
120 40 80 80 10 40 4 4 4 4 2 input.txt output.txt
Execution time for input.txt: 49.482104778289795 seconds
Errors (should be blank): 
Number of baits: 678
[678, 49.482104778289795]
120 40 80 80 10 40 4 4 4 4 2 input.txt output.txt
Execution time for input.txt: 49.5895779132843 seco

KeyboardInterrupt: 

In [14]:
path='synthetic_120_240_nospace'
for item in sorted(os.listdir(path)):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    for item2 in sorted(os.listdir(path2)):
        path3=path2+'/'+item2gg
        for item3 in sorted(os.listdir(path3)):
            path4=path3+'/'+item3
            for item4 in sorted(os.listdir(path4)):
                path5=path4+'/'+item4you
                for item5 in sorted(os.listdir(path5)):
                    path6=path5+'/'+item5
                    base, extension=os.path.splitext(item5)
                    if extension=='.fasta': 
                        print("File:",item5)
                        with open(path6, 'r') as file:
                            original = [line.strip() for line in file.readlines()]
                        with open("input.txt", 'w') as file:
                            file.write(original[1])
                        run(120, 40, 80, 80, 50, 40, [5, 5, 5, 2], "input.txt", "output.txt", True)

File: L1000000_RL(120,240)_RC0_RE40_no0.fasta
120 40 80 80 50 40 4 5 5 5 2 input.txt output.txt
Execution time for input.txt: 75508.2121500969 seconds
Errors (should be blank): 
inputFile:  input.txt
Number of baits: 4740


KeyboardInterrupt: 

In [11]:
path='synthetic_120_240_nospace_tuned'
for item in sorted(os.listdir(path)):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    for item2 in sorted(os.listdir(path2)):
        path3=path2+'/'+item2
        print(path3)
        # if(item2!="L=500000"):
        #     continue
        for item3 in sorted(os.listdir(path3)):
            path4=path3+'/'+item3
            print(path4)
            for item4 in sorted(os.listdir(path4)):
                path5=path4+'/'+item4
                print(path5)
                bait_total=0
                time_total=0
                iter=0
                for item5 in sorted(os.listdir(path5)):
                    path6=path5+'/'+item5
                    print(path6)
                    base, extension=os.path.splitext(item5)
                    if extension=='.fasta':
                        print("File:",item5)
                        with open(path6, 'r') as file:
                            original = [line.strip() for line in file.readlines()]
                        with open("input.txt", 'w') as file:
                            file.write(original[1])
                        #op=tune_params(-1,120,40,"input.txt","output.txt",False)
                        #bait_total+=op[0][0]
                        #time_total+=op[0][1]
                        if path6!='synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=166/RC=0.5/L500000_RL(120,240)_RC50_RE40_no0.fasta':
                            iter+=1
                            continue
                        op=run(120, 40, 80, 80, 30, 40, [10, 10, 10, 2], "input.txt", "output.txt", (path6=='synthetic_120_240_nospace_tuned/RL=120_240/L=500000/RN=166/RC=0.5/L500000_RL(120,240)_RC50_RE40_no0.fasta'))                       
                        bait_total+=op[0]
                        time_total+=op[1]
                     
                        iter+=1
                        if op[1]>1000:
                            brea0
                print("avg bait",bait_total/iter)
                print("avg time",time_total/iter)

synthetic_120_240_nospace_tuned/RL=120_240
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.fasta
File: L1000000_RL(120,240)_RC50_RE40_no0.fasta
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no0.log
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no1.fasta
File: L1000000_RL(120,240)_RC50_RE40_no1.fasta
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no1.log
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no2.fasta
File: L1000000_RL(120,240)_RC50_RE40_no2.fasta
synthetic_120_240_nospace_tuned/RL=120_240/L=1000000/RN=333/RC=0.5/L1000000_RL(120,240)_RC50_RE40_no

NameError: name 'brea0' is not defined

In [11]:
run(120, 40, 80, 80, 1, 40, [5, 5, 5, 2], 'input.txt', "output.txt", True)

120 40 80 80 1 40 4 5 5 5 2 input.txt output.txt
Execution time for input.txt: 2.9669594764709473 seconds
Errors (should be blank):  i dont even know anymore
ERROR! some values not ACGTN
problem amount 1

inputFile:  input.txt
Number of baits: 365
[92m 0 spots uncovered[0m
Verification time for input.txt: 202.86331295967102 seconds


[365, <module 'time' (built-in)>]

In [None]:
for nL in L:
    for nNR in NR:
        for nP in P:
            start_time = time.time()
            synth=synthesize_sequence(nL,nNR,RepeatLength,nP,RepeatNoise)
            end_time = time.time()
            duration = end_time - start_time
            print(f"Time to create string: {duration} seconds")
            with open("synth.txt", 'w') as file:
                # Write the string to the file
                file.write(synth)
            run(120,40,80,80,1,40,[5,5,5,2],"synth.txt","output.txt",True)

In [None]:
for nL in L:
    for nNR in NR:
        for nP in P:
            start_time = time.time()
            synth=synthesize_sequence(nL,nNR,RepeatLength,nP,RepeatNoise)
            end_time = time.time()
            duration = end_time - start_time
            print(f"Time to create string: {duration} seconds")
            with open("synth.txt", 'w') as file:
                # Write the string to the file
                file.write(synth)
            run(120,40,80,80,5,40,[5,5,5,2],"synth.txt","output.txt")

In [None]:
for nL in L:
    for nNR in NR:
        for nP in P:
            start_time = time.time()
            synth=synthesize_sequence(nL,nNR,RepeatLength,nP,RepeatNoise)
            end_time = time.time()
            duration = end_time - start_time
            print(f"Time to create string: {duration} seconds")
            with open("synth.txt", 'w') as file:
                # Write the string to the file
                file.write(synth)
            run(120,40,80,80,20,40,[5,5,5,2],"synth.txt","output.txt")

In [None]:
for nL in L:
    for nNR in NR:
        for nP in P:
            start_time = time.time()
            synth=synthesize_sequence(nL,nNR,RepeatLength,nP,RepeatNoise)
            end_time = time.time()
            duration = end_time - start_time
            print(f"Time to create string: {duration} seconds")
            with open("synth.txt", 'w') as file:
                # Write the string to the file
                file.write(synth)
            run(120,40,80,80,60,40,[5,5,5,2],"synth.txt","output.txt")

In [None]:
lenientRadius=[60,70,80]
lenientRadius2=[60,70,80]
bypassHyperparameter=[30,40,50]
overlapCount=[1,2,3,4,5,8,10,12,15,20,25,30,40,50,60]
searchBreadths=[[5,7,10],[10,7,5],[7,7,7],[5,7,10,3],[10,7,5,3],[7,7,7,3],[7,7,7,3,3]]

In [None]:
baitResults = [[None for _ in range(len(overlapCount))] for _ in range(len(searchBreadths))]
timeResults = [[None for _ in range(len(overlapCount))] for _ in range(len(searchBreadths))]
its=5
for sb in range(len(searchBreadths)):
    for oc in range(len(overlapCount)):
        for i in range(its):
            ans=run(120,40,80,80,overlapCount[oc],40,searchBreadths[sb],"testInput.txt","asdf.txt")
            baitResults[sb][overlapCount]+=ans[0]
            timeResults[sb][overlapCount]+=ans[1]
        baitResults[sb][overlapCount]/=its
        timeResults[sb][overlapCount]/=its

In [None]:
run(120,40,80,80,60,40,[1,2,3,4],"testInput.txt","asdf.txt")

cluster size graph (what to do about big clusters?)

cluster overlap metric (graph with x axis from 1-120)

swiss cheese error metric

problem of getting syotti to run

do the hyperparameter tuning for you

In [None]:
path='megaresPartitions'
skip=0
for item in sorted(os.listdir(path), key=lambda x: os.path.getsize(os.path.join(path, x))):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    if skip!=0:
        skip-=1
        continue
    with open(path2, 'r') as file:
        original = [line.strip() for line in file.readlines()]
    with open("input.txt", 'w') as file:
        for lin in original:
            file.write(lin)
    op=run(120, 40, 80, 80, 1, 40, [10, 10, 10, 2], "input.txt", "output.txt", path2=='megaresClean5E5')                       
    print("bait",op[0])
    print("time",op[1])

In [None]:
path='megaresPartitions'
skip=0
for item in sorted(files, key=lambda x: os.path.getsize(os.path.join(path, x))):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    if skip!=0:
        skip-=1
        continue
    with open(path2, 'r') as file:
        original = [line.strip() for line in file.readlines()]
    with open("input.txt", 'w') as file:
        for lin in original:
            file.write(lin)
    op=run(120, 40, 80, 80, 5, 40, [10, 10, 10, 2], "input.txt", "output.txt", path2=='megaresClean5E5')                       
    print("bait",op[0])
    print("time",op[1])

megaresPartitions/megaresClean1E6.txt
120 40 80 80 5 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 183.10450792312622 seconds
Errors (should be blank): 
Number of baits: 2189
avg bait 2189
avg time 183.10450792312622
megaresPartitions/megaresClean25E4.txt
120 40 80 80 5 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 32.2203848361969 seconds
Errors (should be blank): 
Number of baits: 679
avg bait 679
avg time 32.2203848361969
megaresPartitions/megaresClean2E6.txt
120 40 80 80 5 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 536.5759718418121 seconds
Errors (should be blank): 
Number of baits: 4088
avg bait 4088
avg time 536.5759718418121
megaresPartitions/megaresClean4E6.txt
120 40 80 80 5 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 1840.8940541744232 seconds
Errors (should be blank): 
Number of baits: 8375
avg bait 8375
avg time 1840.8940541744232
megaresPartitions/megaresClean5E5.txt
120 40 80 80 5 40

In [11]:
path='megaresPartitions'
skip=0
for item in sorted(os.listdir(path), key=lambda x: os.path.getsize(os.path.join(path, x))):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    if skip!=0:
        skip-=1
        continue
    with open(path2, 'r') as file:
        original = [line.strip() for line in file.readlines()]
    with open("input.txt", 'w') as file:
        for lin in original:
            file.write(lin)
    op=run(120, 40, 80, 80, 10, 40, [10, 10, 10, 2], "input.txt", "output.txt", path2=='megaresClean5E5')                       
    print("bait",op[0])
    print("time",op[1])

megaresPartitions/megaresClean25E4.txt
120 40 80 80 10 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 65.76274681091309 seconds
Errors (should be blank): 
Number of baits: 581
bait 581
time 65.76274681091309
megaresPartitions/megaresClean5E5.txt
120 40 80 80 10 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 146.08677053451538 seconds
Errors (should be blank): 
Number of baits: 929
bait 929
time 146.08677053451538
megaresPartitions/megaresClean1E6.txt
120 40 80 80 10 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 409.69615268707275 seconds
Errors (should be blank): 
Number of baits: 1818
bait 1818
time 409.69615268707275
megaresPartitions/megaresClean2E6.txt
120 40 80 80 10 40 4 10 10 10 2 input.txt output.txt


KeyboardInterrupt: 

In [12]:
path='megaresPartitions'
skip=0
for item in sorted(os.listdir(path), key=lambda x: os.path.getsize(os.path.join(path, x))):#sorted and actual file explorer break sort ties differently
    path2=path+'/'+item
    print(path2)
    if skip!=0:
        skip-=1
        continue
    with open(path2, 'r') as file:
        original = [line.strip() for line in file.readlines()]
    with open("input.txt", 'w') as file:
        for lin in original:
            file.write(lin)
    op=run(120, 40, 80, 80, 20, 40, [10, 10, 10, 2], "input.txt", "output.txt", path2=='megaresClean5E5')                       
    print("bait",op[0])
    print("time",op[1])

megaresPartitions/megaresClean25E4.txt
120 40 80 80 20 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 172.5909698009491 seconds
Errors (should be blank): 
Number of baits: 517
bait 517
time 172.5909698009491
megaresPartitions/megaresClean5E5.txt
120 40 80 80 20 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 395.1159563064575 seconds
Errors (should be blank): 
Number of baits: 796
bait 796
time 395.1159563064575
megaresPartitions/megaresClean1E6.txt
120 40 80 80 20 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 1247.1374588012695 seconds
Errors (should be blank): 
Number of baits: 1582
bait 1582
time 1247.1374588012695
megaresPartitions/megaresClean2E6.txt
120 40 80 80 20 40 4 10 10 10 2 input.txt output.txt
Execution time for input.txt: 4013.2521896362305 seconds
Errors (should be blank): 
Number of baits: 3096
bait 3096
time 4013.2521896362305
megaresPartitions/megaresClean4E6.txt
120 40 80 80 20 40 4 10 10 10 2 input.txt out

KeyboardInterrupt: 