In [5]:
import concurrent.futures
import csv
import os
import pathlib

import polars as pl
import plotnine as p9

from tools.sequence import Sequence

In [8]:
def _mfe(s: str) -> float | None:
    """Return the minimum free energy for the sequence string ``s``.

    The string is parsed with ``Sequence.from_string`` and the ``energy``
    attribute of the resulting object's ``minimum_free_energy`` is returned.

    Args:
        s: Sequence text accepted by ``Sequence.from_string``.

    Returns:
        The energy value, or ``None`` if either step raised an exception.
    """
    try:
        seq = Sequence.from_string(s)
        return seq.minimum_free_energy.energy
    except Exception:
        # Best-effort: one bad input must not abort the whole batch.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, so a long run can still be interrupted.
        return None
    

# Load every row of sequences.csv (resolved against the current working
# directory) into a DataFrame. csv.DictReader does no type inference, so the
# values arrive as text.
# newline="" is what the csv module requires of file objects so that line
# breaks embedded in quoted fields are parsed correctly.
with open(pathlib.Path.cwd() / "sequences.csv", "r", newline="") as f:
    sequences = pl.DataFrame(list(csv.DictReader(f)))

# Name of each derived MFE column -> the source column holding the sequences
# it is computed from. Insertion order fixes the order of the new columns.
mfe_sources = {
    "natural_mfe": "Natural",
    "gemorna_mfe": "GEMORNA",
    "cai_mfe": "CAI-optimized",
    "lineardesign_mfe": "LinearDesign (lambda=1)",
    "random_mfe": "Random",
}

# Run _mfe over every source column using a pool of worker processes. Each
# column is mapped and fully collected before the next one is started.
# NOTE(review): ProcessPoolExecutor needs the worker processes to be able to
# import the mapped function; with a notebook-defined function this may depend
# on the platform's process start method — confirm on the target machine.
with concurrent.futures.ProcessPoolExecutor() as pool:
    sequences = sequences.with_columns(
        **{
            mfe_name: pl.Series(list(pool.map(_mfe, sequences[source_name].to_list())))
            for mfe_name, source_name in mfe_sources.items()
        }
    )


In [18]:
# Add a length-normalized version of every MFE column: each energy is divided
# by the length of the sequence it was computed from.
#
# The earlier `gemorna_mfe=pl.col("genorma_mfe")` step was removed: no visible
# cell creates a "genorma_mfe" column (the MFE cell above creates
# "gemorna_mfe" directly), so on a fresh kernel that lookup fails.
#
# str.len_bytes() counts bytes; this equals the character count only for
# single-byte text — assumes the sequences are plain ASCII, TODO confirm.
normalized_sequences = sequences.with_columns(
    natural_mfe_normalized=pl.col("natural_mfe") / pl.col("Natural").str.len_bytes(),
    gemorna_mfe_normalized=pl.col("gemorna_mfe") / pl.col("GEMORNA").str.len_bytes(),
    cai_mfe_normalized=pl.col("cai_mfe") / pl.col("CAI-optimized").str.len_bytes(),
    lineardesign_mfe_normalized=pl.col("lineardesign_mfe") / pl.col("LinearDesign (lambda=1)").str.len_bytes(),
    random_mfe_normalized=pl.col("random_mfe") / pl.col("Random").str.len_bytes(),
)
# Persist the original columns plus raw and normalized MFE values.
normalized_sequences.write_csv("mfe.csv")