In [1]:
import concurrent.futures
import pathlib
import os

import polars as pl
import plotnine as p9

from tools.sequence.sequence import Sequence

# Analysis artefacts are resolved relative to the current working directory:
# `_analysis` sits two levels up, and generated files go in its `output` folder.
DATA_DIR = pathlib.Path(os.getcwd(), "..", "..", "_analysis")
OUTPUT_DIR = DATA_DIR.joinpath("output")

In [2]:
# Read the CSV from the output directory (presumably written by the preceding
# "1-optimise" step) and show it as the cell's output.
df = pl.read_csv(OUTPUT_DIR.joinpath("1-optimise.csv"))
df

source,name,input_sequence,output_sequence,failed_reason
str,str,str,str,str
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349577|DD349577.1""","""ATGGACTGGACCTGGAGGGTCTTCTTCTTG…","""""","""Sequence out of frame (length …"
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349576|DD349576.1""","""ATGGGATGGAGCTGTATCATCCTCTCCTTG…","""""","""Sequence out of frame (length …"
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349575|DD349575.1""","""TACCCATACGACGTCCCAGACTACGCTGGT…","""TACCCCTACGACGTGCCTGACTACGCTGGA…",""""""
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349578|DD349578.1""","""ATGGACTGGACCTGGAGGGTCTTCTTCTTG…","""""","""Sequence out of frame (length …"
"""ena_cancer_vaccine_seq.fa""","""ENA|DD349569|DD349569.1""","""AACAGCGAGGCCTGCCGGGACGGCCTTCGG…","""AACAGCGAGGCCTGCAGAGACGGCCTGAGA…",""""""
…,…,…,…,…
"""iedb_antigen_aa_seqs.fa""","""sp|Q8IX19|MCEM1_HUMAN Mast cel…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",""""""
"""iedb_antigen_aa_seqs.fa""","""sp|Q9D287|SPF27_MOUSE Pre-mRNA…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…","""ATGGCCGGAACAGGCCTGGTGGCTGGCGAG…",""""""
"""iedb_antigen_aa_seqs.fa""","""tr|Q4CN05|Q4CN05_TRYCC Trans-s…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",""""""
"""iedb_antigen_aa_seqs.fa""","""tr|Q5NG75|Q5NG75_FRATT Amino-a…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…",""""""


In [None]:
def _mfe(sequence: str) -> float | None:
    if not sequence:
        return None

    return Sequence.from_string(sequence).minimum_free_energy.energy

# Compute the minimum free energy of every input and output sequence in
# worker processes, then attach the results as two new columns.
with concurrent.futures.ProcessPoolExecutor() as executor:
    # `Executor.map` submits its tasks immediately, so queue both columns
    # before collecting either result; this keeps the pool busy instead of
    # draining it between the two columns. Result order matches row order.
    input_mfe_results = executor.map(_mfe, df["input_sequence"].to_list())
    output_mfe_results = executor.map(_mfe, df["output_sequence"].to_list())

    df = df.with_columns(
        input_mfe=pl.Series(list(input_mfe_results)),
        output_mfe=pl.Series(list(output_mfe_results)),
    )

# Fixed: the original referenced the undefined name `output_dir`, which raised
# NameError only after the whole computation had finished. The constant
# defined at the top of the notebook is `OUTPUT_DIR`.
df.write_csv(OUTPUT_DIR / "2.1-hydrate-mfe.csv")