In [5]:
import concurrent.futures
import dataclasses
import os
import pathlib
import random

from dnachisel.builtin_specifications import EnforceGCContent, EnforceTranslation
from dnachisel.builtin_specifications.codon_optimization import CodonOptimize
from dnachisel.DnaOptimizationProblem import DnaOptimizationProblem, NoSolutionError
import plotnine as p9
import polars as pl

from tools.organism import KAZUSA_HOMO_SAPIENS, load_organism
from tools.sequence import Sequence
from tools.utils.fasta import parse_fasta_file

# Analysis inputs live in "_analysis", two directories above the kernel's
# current working directory; generated artifacts go in its "output" subfolder.
DATA_DIR = pathlib.Path(os.getcwd()).joinpath("..", "..", "_analysis")
OUTPUT_DIR = DATA_DIR.joinpath("output")

In [15]:
# Per-file cap on how many sequences are kept when loading the input files.
# Any falsy value (None or 0) disables subsampling, so every record is kept.
SUBSAMPLE_LIMIT = None

@dataclasses.dataclass
class InputSequence:
    """One record yielded by ``parse_fasta_file``, tagged with the file it came from."""
    # File name (not the full path) of the input file the record was read from.
    source: str
    # First element of the parsed record.
    name: str
    # Second element of the parsed record; the annotation allows it to be absent.
    sequence: Sequence | None

# Fixed seed so that a subsampled run picks the same records every time the
# cell is executed (the generator is rebuilt on each run of this cell).
SUBSAMPLE_SEED = 0
_subsample_rng = random.Random(SUBSAMPLE_SEED)

# iterdir() yields directory entries in arbitrary order; sorting keeps the row
# order of `sequences` (and therefore the subsample) stable across machines.
input_files = sorted(path for path in DATA_DIR.iterdir() if path.is_file())
sequences: list[InputSequence] = []
for file in input_files:
    # Each parsed record is indexed positionally: [0] is the name, [1] the sequence.
    next_sequences = [
        InputSequence(source=file.name, name=record[0], sequence=record[1])
        for record in parse_fasta_file(file)
    ]
    # A falsy limit (None or 0) keeps everything; files that are already at or
    # below the limit are kept whole rather than sampled.
    if SUBSAMPLE_LIMIT and SUBSAMPLE_LIMIT < len(next_sequences):
        next_sequences = _subsample_rng.sample(next_sequences, SUBSAMPLE_LIMIT)
    sequences.extend(next_sequences)
print(len(sequences))

50713


In [16]:
import functools

from tools.constants import CODON_TO_AMINO_ACID_MAP

@functools.cache
def _max_codon(codon: str) -> str:
    """Map ``codon`` to the codon ``organism.max_codon`` picks for its amino acid.

    The amino acid is looked up in ``CODON_TO_AMINO_ACID_MAP`` and the choice
    is delegated to the ``KAZUSA_HOMO_SAPIENS`` organism (presumably its most
    frequently used synonymous codon -- confirm against ``max_codon``).
    Results are memoized per input codon by ``functools.cache``.
    """
    human = load_organism(KAZUSA_HOMO_SAPIENS)
    return human.max_codon(CODON_TO_AMINO_ACID_MAP[codon])

@functools.cache
def _min_codon(codon: str) -> str:
    """Map ``codon`` to the synonymous codon with the smallest ``number``.

    Candidates are the entries of the ``KAZUSA_HOMO_SAPIENS`` codon usage table
    whose ``amino_acid`` equals the amino acid encoded by ``codon``; the entry
    with the lowest ``number`` (presumably the usage count -- confirm) wins,
    with ties resolved in table order. Results are memoized per input codon.
    """
    human = load_organism(KAZUSA_HOMO_SAPIENS)
    target = CODON_TO_AMINO_ACID_MAP[codon]
    synonymous = (
        entry
        for entry in human.codon_usage_table.values()
        if entry.amino_acid == target
    )
    rarest = min(synonymous, key=lambda entry: entry.number)
    return rarest.codon

def _max_cai(s: str) -> str | None:
    if len(s) % 3 != 0:
        return None
    organism = load_organism(KAZUSA_HOMO_SAPIENS)
    sequence = Sequence.from_string(s)
    output_sequence: str = "".join([
        _max_codon(codon)
        for codon in sequence.codons
    ])
    return output_sequence

def _min_cai(s: str) -> str | None:
    if len(s) % 3 != 0:
        return None
    organism = load_organism(KAZUSA_HOMO_SAPIENS)
    sequence = Sequence.from_string(s)
    output_sequence: str = "".join([
        _min_codon(codon)
        for codon in sequence.codons
    ])
    return output_sequence

# One row per loaded record, plus the two recoded variants of its sequence.
# The recoded columns are null wherever _max_cai / _min_cai return None
# (sequence length not a multiple of three).
df = pl.DataFrame({
    "source": [it.source for it in sequences],
    "name": [it.name for it in sequences],
    # InputSequence.sequence may be None; keep that as a real null instead of
    # the literal string "None" that str(None) would put in the column.
    "sequence": [
        None if it.sequence is None else str(it.sequence) for it in sequences
    ],
}).with_columns(
    # NOTE(review): map_elements is expected to skip null inputs by default, so
    # the helpers only ever see real strings -- confirm for the pinned polars.
    max_cai_sequence=pl.col("sequence").map_elements(_max_cai, return_dtype=str),
    min_cai_sequence=pl.col("sequence").map_elements(_min_cai, return_dtype=str),
)
df

source,name,sequence,max_cai_sequence,min_cai_sequence
str,str,str,str,str
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349577|DD349577.1""","""ATGGACTGGACCTGGAGGGTCTTCTTCTTG…",,
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349576|DD349576.1""","""ATGGGATGGAGCTGTATCATCCTCTCCTTG…",,
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349575|DD349575.1""","""TACCCATACGACGTCCCAGACTACGCTGGT…","""TACCCCTACGACGTGCCCGACTACGCCGGC…","""TATCCGTATGATGTACCGGATTATGCGGGT…"
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349578|DD349578.1""","""ATGGACTGGACCTGGAGGGTCTTCTTCTTG…",,
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349569|DD349569.1""","""AACAGCGAGGCCTGCCGGGACGGCCTTCGG…","""AACAGCGAGGCCTGCAGAGACGGCCTGAGA…","""AATTCGGAAGCGTGTCGTGATGGTCTACGT…"
…,…,…,…,…
"""iedb_antigen_aa_seqs.fa""","""sp|Q8IX19|MCEM1_HUMAN Mast cel…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…","""ATGGAAGTAGAAGAAATATATAAACATCAA…"
"""iedb_antigen_aa_seqs.fa""","""sp|Q9D287|SPF27_MOUSE Pre-mRNA…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…","""ATGGCGGGTACGGGTCTAGTAGCGGGTGAA…"
"""iedb_antigen_aa_seqs.fa""","""tr|Q4CN05|Q4CN05_TRYCC Trans-s…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…","""ATGTCGCGTCATCTATTTTATTCGGCGGTA…"
"""iedb_antigen_aa_seqs.fa""","""tr|Q5NG75|Q5NG75_FRATT Amino-a…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…","""ATGGATAATAATCAAGATAAACTAAAACGT…"


In [19]:
def _sequence_cai(nucleotides: str) -> float:
    """Return ``Sequence(nucleotides).codon_adaptation_index()``."""
    # NOTE(review): the recoding helpers build sequences with
    # Sequence.from_string, while this cell calls the constructor directly --
    # confirm the two are interchangeable for plain strings.
    return Sequence(nucleotides).codon_adaptation_index()


def _sequence_gc(nucleotides: str) -> float:
    """Return the ``gc_ratio`` attribute of ``Sequence(nucleotides)``."""
    return Sequence(nucleotides).gc_ratio


# Attach CAI and GC ratio for both recoded variants; the new columns are added
# in the order: max CAI, min CAI, max GC, min GC.
df = df.with_columns(
    pl.col("max_cai_sequence")
    .map_elements(_sequence_cai, return_dtype=pl.Float64)
    .alias("max_cai_cai"),
    pl.col("min_cai_sequence")
    .map_elements(_sequence_cai, return_dtype=pl.Float64)
    .alias("min_cai_cai"),
    pl.col("max_cai_sequence")
    .map_elements(_sequence_gc, return_dtype=pl.Float64)
    .alias("max_cai_gc"),
    pl.col("min_cai_sequence")
    .map_elements(_sequence_gc, return_dtype=pl.Float64)
    .alias("min_cai_gc"),
)
df.sort("max_cai_gc")

source,name,sequence,max_cai_sequence,min_cai_sequence,max_cai_gc,min_cai_gc,max_cai_cai,min_cai_cai
str,str,str,str,str,f64,f64,f64,f64
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349577|DD349577.1""","""ATGGACTGGACCTGGAGGGTCTTCTTCTTG…",,,,,,
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349576|DD349576.1""","""ATGGGATGGAGCTGTATCATCCTCTCCTTG…",,,,,,
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349578|DD349578.1""","""ATGGACTGGACCTGGAGGGTCTTCTTCTTG…",,,,,,
"""ena_cancer_vaccine_seq.fa""","""ENA|HV541251|HV541251.1""","""GGCACGGCCTAGCGAGTGGTTCTTCTGCGC…",,,,,,
"""ena_cancer_vaccine_seq.fa""","""ENA|HW290778|HW290778.1""","""AAGCTTCAGGACCTCACCATGGGATGGAGC…",,,,,,
…,…,…,…,…,…,…,…,…
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000049540|ENSG0000004954…","""ATGGCGGGTCTGACGGCGGCGGCCCCGCGG…","""ATGGCCGGCCTGACCGCCGCCGCCCCCAGA…","""ATGGCGGGTCTAACGGCGGCGGCGCCGCGT…",0.836166,0.6322,1.0,0.360165
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000049540|ENSG0000004954…","""ATGGCGGGTCTGACGGCGGCGGCCCCGCGG…","""ATGGCCGGCCTGACCGCCGCCGCCCCCAGA…","""ATGGCGGGTCTAACGGCGGCGGCGCCGCGT…",0.836322,0.634483,1.0,0.359878
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000049540|ENSG0000004954…","""ATGGCGGGTCTGACGGCGGCGGCCCCGCGG…","""ATGGCCGGCCTGACCGCCGCCGCCCCCAGA…","""ATGGCGGGTCTAACGGCGGCGGCGCCGCGT…",0.836957,0.636646,1.0,0.358703
"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000049540|ENSG0000004954…","""ATGGCGGGTCTGACGGCGGCGGCCCCGCGG…","""ATGGCCGGCCTGACCGCCGCCGCCCCCAGA…","""ATGGCGGGTCTAACGGCGGCGGCGCCGCGT…",0.840162,0.635812,1.0,0.359856
