In [7]:
from math import isqrt
import random
from proseqteleporter.random_sample_generator.random_sample_generator import generate_random_amino_acid_sequence

AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'

def prime_factors(n):
    """Return the prime factors of ``n`` in non-decreasing order, with multiplicity.

    Args:
        n: Positive integer to factorise.

    Returns:
        A list of primes whose product is ``n``; the list is empty for ``n == 1``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this guard ``n == 0``
            would never leave the divide-by-two loop (0 % 2 == 0 and
            0 // 2 == 0).
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    factors = []
    # Strip every factor of 2 first so that only odd candidates remain.
    while n % 2 == 0:
        factors.append(2)
        n //= 2
    # Trial division by odd candidates. The bound is re-evaluated against the
    # *remaining* n, so it shrinks each time a factor is divided out.
    divisor = 3
    while divisor * divisor <= n:
        while n % divisor == 0:
            factors.append(divisor)
            n //= divisor
        divisor += 2
    # Anything left above 1 has no divisor <= its square root, so it is prime.
    if n > 1:
        factors.append(n)
    return factors


def distribute_factors(factors, m):
    """Spread ``factors`` over ``m`` running products in round-robin order.

    The factor at index ``i`` is multiplied into slot ``i % m``. Slots that
    receive no factor keep their initial value of 1.

    Args:
        factors: Iterable of numbers to distribute.
        m: Number of slots to produce.

    Returns:
        A list of ``m`` products.
    """
    products = [1] * m
    for index, value in enumerate(factors):
        slot = index % m
        products[slot] = products[slot] * value
    return products


def find_factors_list(n, m):
    """Split ``n`` into ``m`` integer factors whose product is ``n``.

    Args:
        n: Integer to split.
        m: Number of factors requested.

    Returns:
        A list of ``m`` integers when ``n`` has at least ``m`` prime factors
        (counted with multiplicity); otherwise a human-readable message string.
        Callers tell the two outcomes apart by checking the return type.
    """
    primes = prime_factors(n)
    has_enough_primes = len(primes) >= m
    if has_enough_primes:
        return distribute_factors(primes, m)
    return f"Cannot distribute {n} into {m} parts"


In [8]:
# Sanity check: 10_000_000_000 == 2**10 * 5**10, so splitting it into ten
# parts should give ten factors of 10.
find_factors_list(10000000000, 10)

[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]

In [9]:
def aa_mutation_generator_set_complexity(aa_seq: str, complexity: int,
                                         min_number_of_positions: int, max_number_of_positions: int,
                                         min_variations_per_position: int, max_variations_per_position: int,
                                         amino_acids: str) -> list:
    """Draw a random mutation set derived from a factorisation of ``complexity``.

    Candidate position counts between ``min_number_of_positions`` and
    ``max_number_of_positions`` (inclusive) are tried in random order. For each
    one, ``complexity`` is split into that many factors and every factor is
    reduced by one to obtain the number of variations at a position (the
    original variable name suggests each factor also counts the wild-type
    residue). The first candidate whose counts all lie within
    ``[min_variations_per_position, max_variations_per_position]`` is used.

    Args:
        aa_seq: Amino-acid sequence to mutate.
        complexity: Number that is factorised into per-position factors.
        min_number_of_positions: Smallest number of mutated positions to try.
        max_number_of_positions: Largest number of mutated positions to try.
        min_variations_per_position: Lower bound on variations at any position.
        max_variations_per_position: Upper bound on variations at any position.
        amino_acids: Alphabet from which replacement residues are sampled.

    Returns:
        On success, a list of ``{'position': int, 'aa': list}`` dicts sorted by
        1-indexed position. On failure a string is returned instead: ``''`` if
        no candidate count produced a factor list, or ``'not pass'`` if the
        last factor list examined violated the per-position bounds.
    """
    candidate_counts = list(range(min_number_of_positions, max_number_of_positions + 1))
    random.shuffle(candidate_counts)

    # Stays a string (failure sentinel) until a candidate satisfies the bounds.
    outcome = ''
    for count in candidate_counts:
        factorisation = find_factors_list(complexity, count)
        if not isinstance(factorisation, list):
            # complexity cannot be split into this many parts; try the next count.
            continue

        outcome = [factor - 1 for factor in factorisation]
        within_bounds = (max(outcome) <= max_variations_per_position
                         and min(outcome) >= min_variations_per_position)
        if within_bounds:
            random.shuffle(outcome)
            break
        outcome = 'not pass'

    if not isinstance(outcome, list):
        return outcome

    chosen_positions = random.sample(range(len(aa_seq)), k=len(outcome))
    mutations = []
    for index, variation_count in zip(chosen_positions, outcome):
        # Never offer the residue already present at this position as a variant.
        alternatives = amino_acids.replace(aa_seq[index], "")
        picked = random.sample(alternatives, k=variation_count)
        mutations.append({'position': index + 1,  # adjust to 1-indexing
                          'aa': picked})

    mutations.sort(key=lambda entry: entry['position'])
    return mutations

In [10]:
# Baseline settings written to every generated input file. SEQUENCE and
# MUTATIONS start as None and are filled in per replicate before writing.
inputs = {
    'SEQUENCE': None,
    'FIX_DNA_SEQUENCE': "",
    'MUTATIONS': None,
    'LINKED_MUTATIONS': None,
    'CUT_NUMBER_RANGE': (0, 5),
    'MIN_FRAGMENT_LENGTH': 6,
    'MAX_COST': 30000000,
    'MAX_LENGTH_UNEVENNESS': 49,
    'MIN_LIGATION_FIDELITY': 0.98,
    'SATISFACTION_LIGATION_FIDELITY': 0.98,
    'FIDELITY_DATA': "FileS01_T4_01h_25C.xlsx",
    'HOST': "c_griseus",
    'FUSION_SITES_USED_BY_BACKBONE': ('CTTG', 'TAAT'),
    'DNA_5_PRIME': "AATTTGGTCTCTCC",
    'DNA_3_PRIME': "TAATAGAGACCTTTAA",
    'ALLOWED_CUT_POSITIONS': [],
    'COST_PER_BP': 0.1,
    'MIN_DNA_LENGTH': 1,
    'MAX_DNA_LENGTH': 10000000,
}


In [11]:
from os import makedirs, mkdir, path

# NOTE: '__file__' is a string literal here, not the module attribute, so this
# resolves to the current working directory of the kernel.
SCRIPT_DIR = path.dirname(path.abspath('__file__'))

# One output folder per entry: the target complexity and the range of mutated
# position counts to try for it.
input_params = {
    'fix_complexity_diff_muts_10K':dict(complexity=10000,min_number_of_positions=6, max_number_of_positions=8),
    'fix_complexity_diff_muts_1M':dict(complexity=1000000,min_number_of_positions=6, max_number_of_positions=12),
    'fix_complexity_diff_muts_100M':dict(complexity=100000000,min_number_of_positions=8, max_number_of_positions=16),
    'fix_complexity_diff_muts_10B':dict(complexity=10000000000,min_number_of_positions=10, max_number_of_positions=20)
}

for config_name, config in input_params.items():
    inputs_dir = path.join(SCRIPT_DIR, f'{config_name}')
    complexity = config['complexity']
    min_number_of_positions = config['min_number_of_positions']
    max_number_of_positions = config['max_number_of_positions']
    # exist_ok avoids the check-then-create race of isdir() followed by mkdir().
    makedirs(inputs_dir, exist_ok=True)

    seen_muts = []
    for rep in range(0, 50):
        seq = generate_random_amino_acid_sequence(min_length=300, max_length=300, amino_acids=AMINO_ACIDS)
        muts = aa_mutation_generator_set_complexity(aa_seq=seq,
                                                    complexity=complexity,
                                                    min_number_of_positions=min_number_of_positions,
                                                    max_number_of_positions=max_number_of_positions,
                                                    min_variations_per_position=1,
                                                    max_variations_per_position=19,
                                                    amino_acids=AMINO_ACIDS)
        # The generator signals failure by returning a string ('' or
        # 'not pass'); writing that out would produce an unusable input file.
        if not isinstance(muts, list):
            continue
        # Skip replicates whose mutation set was already produced for this config.
        if muts in seen_muts:
            continue

        seen_muts.append(muts)
        inputs.update({'SEQUENCE': seq, 'MUTATIONS': muts})
        with open(path.join(inputs_dir, f'input_rep{rep}.txt'), 'w') as f:
            # Distinct names here: reusing the outer loop's variables would
            # shadow the current config name and parameters.
            for param_name, param_value in inputs.items():
                f.write(f'{param_name}={param_value}\n')

In [12]:
class Input:
    """Plain container bundling the run parameters passed to the constructor.

    Every constructor argument is stored unchanged on an attribute of the same
    name; no validation or conversion is performed.

    NOTE(review): the ``_1idx`` suffix presumably marks 1-indexed positions;
    confirm against the code that consumes these attributes.
    """

    def __init__(
        self,
        seq,
        mutations_1idx,
        linked_mutations_1idx,
        cut_number_range,
        fidelity_data_path,
        max_cost,
        max_unevenness,
        min_aa_length,
        min_ligation_fidelity,
        satisfaction_fidelity,
        codon_usage_tbl_dir,
        host,
        provider_max_dna_length,
        fusion_sites_used_by_backbone,
        allowed_cut_positions_1idx,
        five_prime_dna,
        three_prime_dna,
        fix_wt_dna_sequence,
        enzyme,
    ):
        """Store each argument on the instance under its own name."""
        # Sequence and mutation definitions
        self.seq = seq
        self.mutations_1idx = mutations_1idx
        self.linked_mutations_1idx = linked_mutations_1idx

        # Cut-number range, fidelity data and the limits/thresholds
        self.cut_number_range = cut_number_range
        self.fidelity_data_path = fidelity_data_path
        self.max_cost = max_cost
        self.max_unevenness = max_unevenness
        self.min_aa_length = min_aa_length
        self.min_ligation_fidelity = min_ligation_fidelity
        self.satisfaction_fidelity = satisfaction_fidelity

        # Codon-usage table location, host and provider length limit
        self.codon_usage_tbl_dir = codon_usage_tbl_dir
        self.host = host
        self.provider_max_dna_length = provider_max_dna_length

        # Fusion sites, allowed cut positions, flanking DNA and enzyme
        self.fusion_sites_used_by_backbone = fusion_sites_used_by_backbone
        self.allowed_cut_positions_1idx = allowed_cut_positions_1idx
        self.five_prime_dna = five_prime_dna
        self.three_prime_dna = three_prime_dna
        self.fix_wt_dna_sequence = fix_wt_dna_sequence
        self.enzyme = enzyme
