In [1]:
import concurrent.futures
import random

import polars as pl
import plotnine as p9

from tools.constants import (
    AMINO_ACID_TO_CODONS_MAP,
    CODON_TO_AMINO_ACID_MAP,
    SEQUENCE_EGFP,
    SEQUENCE_LUCIFERASE,
)
from tools.sequence.sequence import Sequence
from tools.types import Organism
from tools.data import load_codon_usage_table

In [12]:
PROTEINS = {
    "EGFP": SEQUENCE_EGFP,
    "LUCIFERASE": SEQUENCE_LUCIFERASE,
}
NUM_SEQUENCES_PER_CAI = 100
CAI_RANGE = [
    0.88,
    0.89,
    0.90,
    0.91,
    0.92,
    0.93,
    0.94,
    0.95,
    0.96,
    0.97,
    0.98,
    0.99,
    1.0,
]


def _min_cai_sequence(s: Sequence, organism: Organism = "homo-sapiens") -> Sequence:
    codon_usage_table = load_codon_usage_table(organism)
    new_codons = []
    for codon in s.codons:
        amino_acid = CODON_TO_AMINO_ACID_MAP[codon]
        codon_usage = codon_usage_table.least_frequent(amino_acid)
        new_codons.append(codon_usage.codon)
    return Sequence.from_string("".join(new_codons))


def _optimize(
    name: str,
    sequence: Sequence,
    target_cai: float,
    organism: Organism = "homo-sapiens",
    iterations: int = 20_000,
    mutations_per_iteration: int = 2,
) -> Sequence:
    best_sequence = sequence
    for _ in range(iterations):
        best_diff = abs(best_sequence.codon_adaptation_index(organism) - target_cai)
        if best_diff / cai_target < 0.005:
            # Quit early if very close to target CAI (within 0.5%)
            break
        next_sequence = Sequence.from_string(str(best_sequence))
        for _ in range(mutations_per_iteration):
            location = random.randrange(0, len(best_sequence), 3)
            codon = str(next_sequence[location : location + 3])
            amino_acid = CODON_TO_AMINO_ACID_MAP[codon]
            next_codon = random.choice(list(AMINO_ACID_TO_CODONS_MAP[amino_acid]))
            next_sequence = (
                next_sequence[:location]
                + Sequence.from_string(next_codon)
                + next_sequence[location + 3 :]
            )

        next_diff = abs(next_sequence.codon_adaptation_index(organism) - target_cai)
        if next_diff < best_diff:
            best_sequence = next_sequence
    return {
        "name": name,
        "target_cai": target_cai,
        "output_sequence": str(best_sequence),
    }


min_cai_sequences = {
    name: _min_cai_sequence(Sequence.from_string(s)) for name, s in PROTEINS.items()
}

with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = [
        executor.submit(_optimize, name, s, target_cai)
        for name, s in min_cai_sequences.items()
        for target_cai in CAI_RANGE
        for _ in range(NUM_SEQUENCES_PER_CAI)
    ]
    print(len(futures))
    done, not_done = concurrent.futures.wait(futures)
    assert not not_done
    results = [d.result() for d in done]

df = pl.DataFrame(results).with_columns(
    pl.col("output_sequence")
    .map_elements(
        lambda s: Sequence.from_string(s).codon_adaptation_index(),
        return_dtype=pl.Float64,
    )
    .alias("output_cai")
)
df

2600


name,target_cai,output_sequence,output_cai
str,f64,str,f64
"""EGFP""",0.9,"""ATGGTGAGCAAAGGCGAGGAGCTCTTCACC…",0.897586
"""EGFP""",0.95,"""ATGGTGTCCAAGGGAGAGGAGCTGTTCACC…",0.947128
"""EGFP""",0.99,"""ATGGTGAGCAAGGGCGAGGAGCTGTTCACC…",0.986214
"""LUCIFERASE""",1.0,"""ATGGAGGACGCCAAGAACATCAAGAAGGGC…",0.99615
"""EGFP""",0.9,"""ATGGTGAGCAAGGGCGAGGAGCTGTTCACA…",0.896019
…,…,…,…
"""LUCIFERASE""",0.95,"""ATGGAGGACGCCAAGAACATCAAGAAGGGA…",0.946896
"""LUCIFERASE""",0.95,"""ATGGAGGACGCCAAGAACATTAAGAAGGGC…",0.945794
"""LUCIFERASE""",0.95,"""ATGGAGGATGCTAAGAACATTAAGAAGGGC…",0.94678
"""LUCIFERASE""",0.95,"""ATGGAGGATGCTAAGAACATCAAGAAGGGC…",0.946634


In [14]:
df.write_csv("mRNArchitect.csv")