In [20]:
import concurrent.futures
import dataclasses
import os
import pathlib
import random

from dnachisel.builtin_specifications import EnforceGCContent, EnforceTranslation
from dnachisel.builtin_specifications.codon_optimization import CodonOptimize
from dnachisel.DnaOptimizationProblem import DnaOptimizationProblem, NoSolutionError
import plotnine as p9
import polars as pl

from tools.organism import KAZUSA_HOMO_SAPIENS, load_organism_from_web
from tools.sequence import Sequence
from tools.utils.fasta import parse_fasta_file

DATA_DIR = pathlib.Path(os.getcwd()) / ".." /"_analysis"
OUTPUT_DIR = DATA_DIR / "output"

In [21]:
SUBSAMPLE_LIMIT = 2000

@dataclasses.dataclass
class InputSequence:
    source: str
    name: str
    sequence: Sequence | None

input_files = [path for path in DATA_DIR.iterdir() if path.is_file()]
sequences: list[InputSequence] = []
for file in input_files:
    next_sequences = [
        InputSequence(source=file.name, name=it[0], sequence=it[1])
        for it in parse_fasta_file(file)
    ]
    if SUBSAMPLE_LIMIT and SUBSAMPLE_LIMIT < len(next_sequences):
        next_sequences = random.sample(next_sequences, SUBSAMPLE_LIMIT)
    sequences.extend(next_sequences)
print(len(sequences))

5536


In [22]:
GC_MIN = 0.0
GC_MAX = 0.70
GC_WINDOW = 100
MIN_CAI = 0.9
ORGANISM = "kazusa-9606"  # human

def _optimize(input_sequence: InputSequence, organism):
    if not input_sequence.sequence:
        return input_sequence, None, "Invalid sequence."

    if len(input_sequence.sequence) % 3 != 0:
        return input_sequence, None, "Sequence out of frame (length not multiple of 3)."
    
    problem = DnaOptimizationProblem(
        sequence=input_sequence.sequence.nucleic_acid_sequence,
        constraints=[
            EnforceGCContent(mini=GC_MIN, maxi=GC_MAX, window=GC_WINDOW),
            EnforceTranslation(),
        ],
        objectives=[
            CodonOptimize(
                codon_usage_table=organism.to_dnachisel_dict(),
                method="use_best_codon"
            )
        ],
        logger=None,
    )
    problem.max_random_iters = 20_000

    output_sequence = None
    failed_reason = None
    try:
        problem.resolve_constraints()
        problem.optimize()
        output_sequence = Sequence.from_string(problem.sequence, "nucleic-acid")
        if output_sequence.codon_adaptation_index(organism) < MIN_CAI:
            failed_reason = f"CAI < {MIN_CAI}"
    except NoSolutionError:
        failed_reason = "GC content constraint not resolvable."

    return input_sequence, output_sequence, failed_reason

organism = load_organism_from_web(ORGANISM)
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = list(
        executor.map(
            _optimize,
            sequences,
            [organism for _ in sequences],
        )
    )

In [23]:
df = pl.DataFrame([
    {
        "source": it[0].source,
        "name": it[0].name,
        "input_sequence": str(it[0].sequence) if it[0].sequence else "",
        "output_sequence": str(it[1]) if it[1] else "",
        "failed_reason": it[2] if it[2] else "",
    }
    for it in results
])
df.write_csv(OUTPUT_DIR / "1-optimise.csv")
df

source,name,input_sequence,output_sequence,failed_reason
str,str,str,str,str
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000100387|ENSG0000010038…","""ATGGCGGCAGCGATGGATGTGGATACCCCG…","""ATGGCCGCCGCCATGGACGTGGACACCCCC…",""""""
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000111639|ENSG0000011163…","""ATGGCAGGGAACCTCTTATCCGGGGCAGGT…","""ATGGCCGGCAACCTGCTGAGCGGCGCCGGC…",""""""
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000239789|ENSG0000023978…","""ATGTCCGTAGTTCGCTCATCCGTCCATGCC…","""ATGAGCGTGGTGAGAAGCAGCGTGCACGCC…",""""""
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000171148|ENSG0000017114…","""ATGAGTGAGTTGAAAGACTGCCCCTTGCAG…","""ATGAGCGAGCTGAAGGACTGCCCCCTGCAG…",""""""
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000169020|ENSG0000016902…","""ATGGTGCCACCGGTGCAGGTCTCTCCGCTC…","""ATGGTGCCCCCCGTGCAGGTGAGCCCCCTG…",""""""
…,…,…,…,…
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000198300|ENSG0000019830…","""ATGTACCAACCAGAAGACGACAACAACAGT…","""ATGTACCAGCCCGAGGACGACAACAACAGC…",""""""
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000162727|ENSG0000016272…","""ATGGCATGGGAGAATCAGACCTTCAACTCT…","""ATGGCCTGGGAGAACCAGACCTTCAACAGC…",""""""
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000187514|ENSG0000018751…","""ATGTCAGACGCAGCCGTAGACACCAGCTCC…","""ATGAGCGACGCCGCCGTGGACACCAGCAGC…",""""""
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000145016|ENSG0000014501…","""ATGCAGAGCATCCTCTATCACGGGCTTATC…","""ATGCAGAGCATCCTGTACCACGGCCTGATC…",""""""
