In [1]:
import concurrent.futures

import polars as pl

from tools.sequence.sequence import Sequence

In [2]:
# Load the "Sequences" sheet of the supplementary workbook; the trailing bare
# name renders the frame as the cell output.
workbook_path = "./adr8470_Suppl. Excel_seq2_v1.xlsx"
sequences = pl.read_excel(workbook_path, sheet_name="Sequences")
sequences

Protein,Natural,GEMORNA,CAI-optimized,LinearDesign (lambda=1),Random
str,str,str,str,str,str
"""EVVMTQTPVSLPVTLGEPASISCKASQSLL…","""GAGGUUGUGAUGACGCAGACCCCAGUGUCC…","""GAGGUGGUGAUGACCCAGACCCCCGUGUCC…","""GAAGUGGUGAUGACCCAGACCCCAGUGUCU…","""GAGGUGGUGAUGACCCAGACCCCGGUGAGC…","""GAAGUGGUCAUGACGCAGACUCCAGUCUCA…"
"""MRLSCPRAPGHGWMGLFLPHLPPSHNSSSG…","""AUGAGGCUAAGUUGCCCCAGAGCCCCAGGA…","""AUGAGGCUGAGCUGCCCCAGGGCCCCCGGC…","""AUGAGGCUGUCCUGCCCCCGGGCCCCCGGG…","""AUGCGGCUGAGCUGCCCCCGGGCACCCGGG…","""AUGCGUCUAUCGUGCCCCCGAGCACCAGGA…"
"""MSNFYEERATMIAAGDLQEFVPFGRDHCKH…","""AUGUCGAACUUCUAUGAAGAAAGGGCAACG…","""AUGAGCAACUUCUACGAGGAGCGGGCCACC…","""AUGAGUAACUUCUACGAGGAAAGGGCCACC…","""AUGUCCAACUUUUAUGAGGAGCGCGCCACA…","""AUGAGCAAUUUUUACGAAGAGCGCGCCACG…"
"""MPLAGLHLLAAKQAQFLSGPLPLVCSMGDG…","""AUGCCGCUGGCUGGACUGCACCUUCUGGCA…","""AUGCCCCUGGCCGGCCUGCACCUGCUGGCC…","""AUGCCUCUGGCCGGCCUGCACCUGCUGGCC…","""AUGCCCCUGGCAGGACUGCACCUGCUUGCC…","""AUGCCAUUGGCCGGACUACAUCUCCUAGCA…"
"""MSSKMAISSDIGQARRAVEQLRMEAGIDRV…","""AUGUCCAGCAAGAUGGCCAUCAGCAGCGAC…","""AUGUCCAGCAAGAUGGCCAUCAGCAGCGAC…","""AUGAGCUCCAAGAUGGCCAUCUCCUCAGAC…","""AUGAGCAGCAAGAUGGCGAUCAGCUCCGAC…","""AUGUCCAGUAAAAUGGCAAUUUCGAGCGAC…"
…,…,…,…,…,…
"""MRVLRVSQPLVRSFSSTGRSRFENRVAEKQ…","""AUGAGGGUCCUGCGGGUCUCCCAGCCCUUG…","""AUGAGGGUGCUGAGGGUGAGCCAGCCCCUG…","""AUGAGAGUGCUGAGGGUUUCCCAGCCCCUG…","""AUGCGGGUGCUGAGGGUGUCCCAGCCACUG…","""AUGCGAGUUUUACGAGUGUCCCAACCGCUG…"
"""MARTKQTARKSTGGKAPRKQLATKAARKSA…","""AUGGCUCGUACUAAGCAGACCGCCCGCAAG…","""AUGGCGCGCACCAAGCAGACCGCCCGCAAG…","""AUGGCCCGGACCAAGCAGACAGCCAGAAAG…","""AUGGCGCGGACUAAGCAGACGGCCCGCAAG…","""AUGGCGAGAACUAAGCAGACAGCUAGAAAA…"
"""MKIFYHFFHFLCCVTFILSATCSFVEPDRC…","""AUGAAGAUCUUUUACCACUUCUUCCAUUUU…","""AUGAAGAUCUUCUACCACUUCUUCCACUUC…","""AUGAAGAUCUUUUACCACUUCUUUCACUUC…","""AUGAAGAUCUUCUAUCAUUUCUUCCACUUC…","""AUGAAGAUCUUCUAUCAUUUCUUCCAUUUC…"
"""MAPSTVAVELLSPKEKNRLRKPVVEKMRRD…","""AUGGCCCCUAGCACCGUGGCCGUGGAGCUG…","""AUGGCGCCCAGCACCGUGGCCGUGGAGCUG…","""AUGGCCCCAAGCACCGUGGCCGUGGAGCUG…","""AUGGCGCCUUCUACUGUGGCCGUGGAGCUC…","""AUGGCACCAUCGACUGUGGCCGUAGAAUUA…"


In [5]:
def _process(protein: str) -> tuple[str | None, float | None]:
    optimized_sequence, mfe = None, None
    try:
        s = Sequence.from_string(protein)
        optimize_result = s.optimize()
        if optimize_result.success:
            optimized_sequence = str(optimize_result.result.sequence)
            mfe = optimize_result.result.sequence.minimum_free_energy.energy
    except Exception as e:
        print(e)
    return optimized_sequence, mfe

# Fan the proteins out across worker processes. Executor.map yields results in
# input order, so results[i] belongs to the i-th entry of the "Protein" column.
proteins = sequences["Protein"].to_list()
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = [outcome for outcome in executor.map(_process, proteins)]

In [10]:
# Each entry of `results` is an (optimized sequence, MFE) pair; split the pairs
# into the two columns of the summary frame.
df = pl.DataFrame(
    {
        "mRNArchitect": [sequence for sequence, _ in results],
        "MFE": [energy for _, energy in results],
    }
)
df

mRNArchitect,MFE
str,f64
"""GAGGTGGTGATGACCCAGACCCCCGTGAGC…",-107.199997
"""ATGAGACTGAGCTGCCCTAGAGCCCCCGGC…",-117.800003
"""ATGAGCAACTTCTACGAGGAGAGAGCCACC…",-179.699997
"""ATGCCCCTGGCCGGCCTGCACCTGCTGGCT…",-139.899994
"""ATGAGCAGCAAGATGGCCATCAGCAGCGAC…",-72.900002
…,…
"""ATGAGAGTGCTGAGAGTGAGCCAGCCCCTG…",-86.099998
"""ATGGCCAGAACCAAGCAGACCGCCAGGAAG…",-137.199997
"""ATGAAGATCTTCTACCACTTCTTCCACTTC…",-58.299999
"""ATGGCCCCCAGCACCGTGGCCGTGGAGCTG…",-181.100006


In [17]:
def _sequence_metric(compute):
    """Build an expression that parses each "mRNArchitect" value with
    ``Sequence.from_string`` and stores ``compute(sequence)`` as Float64."""
    return pl.col("mRNArchitect").map_elements(
        lambda text: compute(Sequence.from_string(text)),
        return_dtype=pl.Float64,
    )


# Metric name -> polars expression. The pl.lit(0.0) entries are constant
# placeholders, not values computed from the sequences.
METRICS = {
    "CAI": _sequence_metric(lambda seq: seq.codon_adaptation_index()),
    "GC content": _sequence_metric(lambda seq: seq.gc_ratio),
    "Rare codon per nt": pl.lit(0.0),
    # The "U percentage" metric is read from the sequence's t_ratio attribute.
    "U percentage": _sequence_metric(lambda seq: seq.t_ratio),
    # MFE divided by the sequence length in bytes.
    "Normalized MFE": pl.col("MFE") / pl.col("mRNArchitect").str.len_bytes(),
    "Unwanted codon pair per nt": pl.lit(0.0),
    "Slippery site per nt": pl.lit(0.0),
    "Naturalness": pl.lit(0.0),
}

def _column_name(source, metric):
    return f"{source} - {metric}"

# One aliased expression per metric, labelled "mRNArchitect - <metric>".
metric_columns = [
    expression.alias(_column_name("mRNArchitect", metric))
    for metric, expression in METRICS.items()
]

# Append the metric columns, then drop the raw MFE column from the result.
df2 = df.with_columns(*metric_columns).drop("MFE")
df2

mRNArchitect,mRNArchitect - CAI,mRNArchitect - GC content,mRNArchitect - Rare codon per nt,mRNArchitect - U percentage,mRNArchitect - Normalized MFE,mRNArchitect - Unwanted codon pair per nt,mRNArchitect - Slippery site per nt,mRNArchitect - Naturalness
str,f64,f64,f64,f64,f64,f64,f64,f64
"""GAGGTGGTGATGACCCAGACCCCCGTGAGC…",0.995524,0.647975,0.0,0.130841,-0.333956,0.0,0.0,0.0
"""ATGAGACTGAGCTGCCCTAGAGCCCCCGGC…",0.977057,0.661247,0.0,0.165312,-0.319241,0.0,0.0,0.0
"""ATGAGCAACTTCTACGAGGAGAGAGCCACC…",0.98165,0.626068,0.0,0.132479,-0.383974,0.0,0.0,0.0
"""ATGCCCCTGGCCGGCCTGCACCTGCTGGCT…",0.985818,0.649123,0.0,0.140351,-0.409064,0.0,0.0,0.0
"""ATGAGCAGCAAGATGGCCATCAGCAGCGAC…",1.0,0.639269,0.0,0.118721,-0.332877,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…
"""ATGAGAGTGCTGAGAGTGAGCCAGCCCCTG…",1.0,0.609053,0.0,0.148148,-0.354321,0.0,0.0,0.0
"""ATGGCCAGAACCAAGCAGACCGCCAGGAAG…",0.988222,0.618005,0.0,0.111922,-0.33382,0.0,0.0,0.0
"""ATGAAGATCTTCTACCACTTCTTCCACTTC…",0.998628,0.557143,0.0,0.190476,-0.277619,0.0,0.0,0.0
"""ATGGCCCCCAGCACCGTGGCCGTGGAGCTG…",0.958454,0.638211,0.0,0.128049,-0.368089,0.0,0.0,0.0


In [18]:
# Persist the metrics table as CSV in the working directory.
output_path = "mRNArchitect.csv"
df2.write_csv(output_path)