In [1]:
import concurrent.futures
import os

from dnachisel.builtin_specifications import EnforceGCContent, EnforceTranslation
from dnachisel.builtin_specifications.codon_optimization import CodonOptimize
from dnachisel.DnaOptimizationProblem import DnaOptimizationProblem, NoSolutionError
import plotnine as p9
import polars as pl

from tools.organism import KAZUSA_HOMO_SAPIENS, load_organism_from_web
from tools.sequence import Sequence
from tools.utils.fasta import parse_fasta_file

In [4]:
import dataclasses
import pathlib

@dataclasses.dataclass
class InputSequence:
    source: str
    name: str
    sequence: Sequence | None

data_dir = pathlib.Path(os.getcwd()) / ".." / "_analysis_data"
output_dir = data_dir / "output"
input_files = [path for path in data_dir.iterdir() if path.is_file()]
sequences: list[InputSequence] = []
for file in input_files:
    for name, sequence in parse_fasta_file(file):
        sequences.append(InputSequence(source=file.name, name=name, sequence=sequence))
print(len(sequences))

50713


In [8]:
GC_MIN = 0.0
GC_MAX = 0.65
GC_WINDOW = 50
MIN_CAI = 0.9
ORGANISM = "kazusa-9606"  # human

def _optimize(input_sequence: InputSequence, organism):
    if not input_sequence.sequence:
        return input_sequence, None, "Invalid sequence."

    if len(input_sequence.sequence) % 3 != 0:
        return input_sequence, None, "Sequence out of frame (length not multiple of 3)."
    
    problem = DnaOptimizationProblem(
        sequence=input_sequence.sequence.nucleic_acid_sequence,
        constraints=[
            EnforceGCContent(mini=GC_MIN, maxi=GC_MAX, window=GC_WINDOW),
            EnforceTranslation(),
        ],
        objectives=[
            CodonOptimize(
                codon_usage_table=organism.to_dnachisel_dict(),
                method="use_best_codon"
            )
        ],
        logger=None,
    )
    problem.max_random_iters = 20_000

    output_sequence = None
    failed_reason = None
    try:
        problem.resolve_constraints()
        problem.optimize()
        output_sequence = Sequence.from_string(problem.sequence, "nucleic-acid")
        if output_sequence.codon_adaptation_index(organism) < MIN_CAI:
            failed_reason = f"CAI < {MIN_CAI}"
    except NoSolutionError:
        failed_reason = "GC content constraint not resolvable."

    return input_sequence, output_sequence, failed_reason

organism = load_organism_from_web(ORGANISM)
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = list(
        executor.map(
            _optimize,
            sequences,
            [organism for _ in sequences],
        )
    )

In [13]:
df = pl.DataFrame([
    {
        "source": it[0].source,
        "name": it[0].name,
        "input_sequence": str(it[0].sequence) if it[0].sequence else "",
        "output_sequence": str(it[1]) if it[1] else "",
        "failed_reason": it[2] if it[2] else "",
    }
    for it in results
])
df.write_csv(output_dir / "1-optimised.csv")
df

source,name,input_sequence,output_sequence,failed_reason
str,str,str,str,str
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000004059|ENSG0000000405…","""ATGGGCCTCACCGTGTCCGCGCTCTTTTCG…","""ATGGGCCTGACCGTGAGCGCCCTGTTCAGC…",""""""
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000003056|ENSG0000000305…","""ATGTTCCCTTTCTACAGCTGCTGGAGGACT…","""ATGTTCCCCTTCTACAGCTGCTGGAGAACC…",""""""
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000006451|ENSG0000000645…","""ATGGCTGCAAATAAGCCCAAGGGTCAGAAT…","""ATGGCCGCCAACAAGCCCAAGGGCCAGAAC…",""""""
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000007520|ENSG0000000752…","""ATGGGCCGCAGGAGGGCAGCGCGCGGGCCG…","""ATGGGAAGAAGAAGAGCCGCCAGAGGCCCC…","""CAI < 0.9"""
"""hrt_hk_nt_seq_pulled_from_ense…","""ENSG00000004975|ENSG0000000497…","""ATGGCGGGTAGCAGCACTGGGGGCGGTGGG…","""ATGGCCGGATCTAGCACAGGCGGCGGAGGC…",""""""
…,…,…,…,…
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000173846|ENSG0000017384…","""ATGGAGCCTGCCGCCGGTTTCCTGTCTCCG…","""""","""GC content constraint not reso…"
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000142166|ENSG0000014216…","""ATGGATAATTGGATAAAATTGTCTGGGTGT…","""ATGGACAACTGGATCAAGCTGAGCGGCTGC…",""""""
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000180488|ENSG0000018048…","""ATGTCAGACTGCTGCTCAGCGCCAGGCATC…","""ATGAGCGACTGCTGCTCTGCCCCTGGCATT…",""""""
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000145216|ENSG0000014521…","""ATGTCGGCCGGCGAGGTCGAGCGCCTAGTG…","""ATGAGCGCTGGCGAAGTGGAAAGACTGGTG…",""""""
