In [2]:
import concurrent.futures
import typing

import polars as pl

from mrnarchitect.sequence.optimize import optimize, DEFAULT_OPTIMIZATION_PARAMETER
from mrnarchitect.sequence import Sequence

In [3]:
def _optimize(data: dict) -> dict:
    if data["index"] % 1000 == 0:
        print(f"index: {data['index']}")
    
    if data["error"] or not data["input_sequence"]:
        return {
            **data,
            "optimized_sequence": None,
            "optimization_error": "Input sequence is not valid."
        }
    result = optimize(
        Sequence.create(data["input_sequence"]),
        parameters=[DEFAULT_OPTIMIZATION_PARAMETER],
    )
    if result.success:
        output_sequence = str(result.result.sequence)
        optimization_error = None
    else:
        output_sequence = None
        optimization_error = result.error.message
    return {
        **data,
        "optimized_sequence": output_sequence,
        "optimization_error": optimization_error,
    }

input_sequences = pl.read_csv("input-sequences.csv")
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(
        _optimize,
        list(input_sequences.rows(named=True))
    )
    optimized_sequences = pl.DataFrame(
        results,
        schema_overrides={
            "error": pl.String | None,
            "optimized_sequence": pl.String | None,
            "optimization_error": pl.String | None,
        },
        infer_schema_length=10_000,
    )
optimized_sequences  
    
    

index: 0
index: 48000
index: 49000
index: 50000
index: 1000
index: 2000
index: 3000
index: 4000
index: 5000
index: 6000
index: 7000
index: 8000
index: 9000
index: 10000
index: 11000
index: 12000
index: 13000
index: 14000
index: 15000
index: 16000
index: 17000
index: 18000
index: 19000
index: 20000
index: 21000
index: 22000
index: 23000
index: 24000
index: 25000
index: 26000
index: 27000
index: 28000
index: 29000
index: 30000
index: 31000
index: 32000
index: 33000
index: 34000
index: 35000
index: 36000
index: 37000
index: 38000
index: 39000
index: 40000
index: 41000
index: 42000
index: 43000
index: 44000
index: 45000
index: 46000
index: 47000


index,source,name,raw_input_sequence,input_sequence,error,optimized_sequence,optimization_error
i64,str,str,str,str,null,str,str
0,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_heav…","""QVQLQQPGAELVKPGASVKMSCKASGYTFT…","""CAGGTGCAGCTGCAGCAGCCCGGCGCCGAG…",,"""CAGGTGCAGCTCCAGCAGCCAGGCGCCGAA…",
1,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_ligh…","""QIVLSQSPAILSASPGEKVTMTCRASSSVS…","""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",,"""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",
2,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Lig…","""DIQMTQSPSSLSASVGDRVTITCRASQDVN…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",,"""GACATCCAGATGACCCAGAGCCCCAGCTCC…",
3,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Hea…","""EVQLVESGGGLVQPGGSLRLSCAASGFNIK…","""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",,"""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",
4,"""antibody_monoclonal_aa_seqs.fa…","""Bevacizumab_light_chain""","""DIQMTQSPSSLSASVGDRVTITCSASQDIS…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",,"""GACATCCAGATGACCCAGAGCCCCAGCTCC…",
…,…,…,…,…,…,…,…
47568,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000173846|ENSG0000017384…","""ATGGAGCCTGCCGCCGGTTTCCTGTCTCCG…","""ATGGAGCCTGCCGCCGGTTTCCTGTCTCCG…",,"""ATGGAGCCTGCCGCTGGCTTTCTGTCTCCT…",
47569,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000142166|ENSG0000014216…","""ATGGATAATTGGATAAAATTGTCTGGGTGT…","""ATGGATAATTGGATAAAATTGTCTGGGTGT…",,"""ATGGACAACTGGATCAAGCTGAGCGGCTGC…",
47570,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000180488|ENSG0000018048…","""ATGTCAGACTGCTGCTCAGCGCCAGGCATC…","""ATGTCAGACTGCTGCTCAGCGCCAGGCATC…",,"""ATGAGCGACTGCTGCAGCGCCCCCGGCATC…",
47571,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000145216|ENSG0000014521…","""ATGTCGGCCGGCGAGGTCGAGCGCCTAGTG…","""ATGTCGGCCGGCGAGGTCGAGCGCCTAGTG…",,"""ATGAGCGCCGGCGAGGTCGAGAGACTGGTG…",


In [4]:
optimized_sequences.write_csv("optimized-sequences.csv")