In [17]:
import concurrent.futures
import typing

import polars as pl

from mrnarchitect.sequence.optimize import optimize, DEFAULT_OPTIMIZATION_PARAMETER
from mrnarchitect.sequence import Sequence

In [19]:
def _optimize(data: dict) -> dict:
    
    if data["error"] or not data["input_sequence"]:
        return {
            **data,
            "optimized_sequence": None,
            "optimization_error": "Input sequence is not valid."
        }
    result = optimize(
        Sequence.create(data["input_sequence"]),
        parameters=[DEFAULT_OPTIMIZATION_PARAMETER],
    )
    if result.success:
        output_sequence = str(result.result.sequence)
        optimization_error = None
    else:
        output_sequence = None
        optimization_error = result.error.message
    return {
        **data,
        "optimized_sequence": output_sequence,
        "optimization_error": optimization_error,
    }

input_sequences = pl.read_csv("input-sequences.csv")
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(
        _optimize,
        list(input_sequences.rows(named=True))
    )
    optimized_sequences = pl.DataFrame(
        results,
        schema_overrides={
            "error": pl.String | None,
            "optimized_sequence": pl.String | None,
            "optimization_error": pl.String | None,
        },
        infer_schema_length=10_000,
    )
optimized_sequences  
    
    

index,source,name,raw_input_sequence,input_sequence,error,optimized_sequence,optimization_error
i64,str,str,str,str,null,str,str
4721,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000115947|ENSG0000011594…","""ATGAGCAGTCGTAAATCAAAGAGTAACAGC…","""ATGAGCAGTCGTAAATCAAAGAGTAACAGC…",,"""ATGAGCAGCAGAAAGAGCAAGAGCAACAGC…",
39774,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000160953|ENSG0000016095…","""ATGGCGGATGCCAAGTATGTCCTCTGCCGA…","""ATGGCGGATGCCAAGTATGTCCTCTGCCGA…",,"""ATGGCCGACGCCAAGTACGTGCTGTGCAGA…",
27306,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000112787|ENSG0000011278…","""ATGGAGGCCAAGGTCCGCCCGAGCCGGCGC…","""ATGGAGGCCAAGGTCCGCCCGAGCCGGCGC…",,"""ATGGAGGCCAAGGTGAGACCCAGCAGAAGA…",
27148,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000198885|ENSG0000019888…","""ATGAATGTTGATGCAGAGGCCTCCATGGCT…","""ATGAATGTTGATGCAGAGGCCTCCATGGCT…",,"""ATGAACGTGGACGCCGAGGCCAGCATGGCC…",
20254,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000204410|ENSG0000020441…","""ATGGCCTCCTTAGGAGCGAACCCAAGGAGG…","""ATGGCCTCCTTAGGAGCGAACCCAAGGAGG…",,"""ATGGCCAGCCTCGGAGCCAACCCTAGAAGA…",
…,…,…,…,…,…,…,…
50708,"""iedb_antigen_aa_seqs.fa""","""sp|Q8IX19|MCEM1_HUMAN Mast cel…","""MEVEEIYKHQEVKMQAPAFRDKKQGVSAKN…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",,"""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",
50709,"""iedb_antigen_aa_seqs.fa""","""sp|Q9D287|SPF27_MOUSE Pre-mRNA…","""MAGTGLVAGEVVVDALPYFDQGYEAPGVRE…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…",,"""ATGGCCGGCACAGGCCTGGTGGCTGGCGAG…",
50710,"""iedb_antigen_aa_seqs.fa""","""tr|Q4CN05|Q4CN05_TRYCC Trans-s…","""MSRHLFYSAVLLLLVVMCCGTAAVNAEELS…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",,"""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",
50711,"""iedb_antigen_aa_seqs.fa""","""tr|Q5NG75|Q5NG75_FRATT Amino-a…","""MDNNQDKLKRDILSRHIVMISLGGTISASF…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…",,"""ATGGACAACAACCAGGACAAGCTGAAGAGA…",


In [20]:
optimized_sequences.write_csv("optimized-sequences.csv")