In [1]:
import concurrent.futures
import polars as pl

from tools.sequence.sequence import Sequence

In [8]:
# Load the raw sequence table from the working directory. The rendered preview
# shows three string columns: `source`, `name` and `sequence`.
df = pl.read_csv("raw-dataset.csv")
# Bare trailing expression: displays the frame as this cell's output.
df


source,name,sequence
str,str,str
"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_heav…","""CAGGTGCAGCTGCAGCAGCCCGGCGCCGAG…"
"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_ligh…","""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…"
"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Lig…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…"
"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Hea…","""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…"
"""antibody_monoclonal_aa_seqs.fa…","""Bevacizumab_light_chain""","""GACATCCAGATGACCCAGAGCCCCAGCAGC…"
…,…,…
"""iedb_antigen_aa_seqs.fa""","""sp|Q8IX19|MCEM1_HUMAN Mast cel…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…"
"""iedb_antigen_aa_seqs.fa""","""sp|Q9D287|SPF27_MOUSE Pre-mRNA…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…"
"""iedb_antigen_aa_seqs.fa""","""tr|Q4CN05|Q4CN05_TRYCC Trans-s…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…"
"""iedb_antigen_aa_seqs.fa""","""tr|Q5NG75|Q5NG75_FRATT Amino-a…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…"


In [None]:
def _mfe(s: str):
    if len(s) > 7500:
        return None
    try:
        return Sequence.create(s).minimum_free_energy
    except:
        return None

def _pseudo_mfe(s: str):
    try:
        return Sequence.create(s).pseudo_minimum_free_energy
    except:
        return None

def _windowed_mfe(s: str):
    try:
        return Sequence.create(s).windowed_minimum_free_energy()
    except Exception as e:
        return None

# Materialise the sequence column once instead of rebuilding the list for
# every batch.
sequences = df["sequence"].to_list()

# NOTE(review): worker functions defined in a notebook cell may not be
# importable by child processes under the "spawn" start method -- confirm this
# cell runs on the target platform.
with concurrent.futures.ProcessPoolExecutor() as executor:
    # Executor.map schedules its calls as soon as it is invoked, so queue all
    # three batches before collecting any of them. Draining one batch before
    # submitting the next would leave workers idle at the tail of each phase.
    pending = [
        executor.map(worker, sequences)
        for worker in (_mfe, _pseudo_mfe, _windowed_mfe)
    ]
    # Each map iterator yields results in input order, so every list lines up
    # row-for-row with the frame.
    mfes, pseudo_mfes, windowed_mfes = (list(results) for results in pending)

# Attach the computed results to the frame as new columns. Entries that are
# falsy (such as the None returned when a computation was skipped or failed)
# contribute None to every column derived from them.
df = df.with_columns(
    pl.Series(
        "MFE",
        [None if not result else result.energy for result in mfes],
    ),
    pl.Series(
        "Average MFE",
        [None if not result else result.average_energy for result in mfes],
    ),
    pl.Series(
        "MFE Structure",
        [None if not result else result.structure for result in mfes],
    ),
    # Pseudo-MFE values are used as returned by the worker, without unpacking.
    pl.Series("Pseudo-MFE", pseudo_mfes),
    pl.Series(
        "Windowed MFE - Mean",
        [None if not window else window.mean_energy for window in windowed_mfes],
    ),
    pl.Series(
        "Windowed MFE - Standard Deviation",
        [None if not window else window.standard_deviation for window in windowed_mfes],
    ),
)
# Bare trailing expression: displays the augmented frame as the cell output.
df

In [4]:
# Persist the current frame to CSV in the working directory.
# NOTE(review): the cell execution counters are out of order (In [8], In [None],
# In [4]); restart the kernel and run all cells to confirm the written file
# contains the MFE columns.
df.write_csv("sequences-with-mfe.csv")