In [7]:
import polars as pl

from mrnarchitect.utils.fasta import parse_fasta_file

In [8]:
# Load the sequence table from CSV. As the rendered output shows, it carries an
# integer `index` column plus string columns including `input_sequence` and
# `optimized_sequence`, which the cells below consume.
df = pl.read_csv("optimized-sequences.csv")
df

index,source,name,raw_input_sequence,input_sequence,error,optimized_sequence,optimization_error
i64,str,str,str,str,str,str,str
4721,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000115947|ENSG0000011594…","""ATGAGCAGTCGTAAATCAAAGAGTAACAGC…","""ATGAGCAGTCGTAAATCAAAGAGTAACAGC…",,"""ATGAGCAGCAGAAAGAGCAAGAGCAACAGC…",
39774,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000160953|ENSG0000016095…","""ATGGCGGATGCCAAGTATGTCCTCTGCCGA…","""ATGGCGGATGCCAAGTATGTCCTCTGCCGA…",,"""ATGGCCGACGCCAAGTACGTGCTGTGCAGA…",
27306,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000112787|ENSG0000011278…","""ATGGAGGCCAAGGTCCGCCCGAGCCGGCGC…","""ATGGAGGCCAAGGTCCGCCCGAGCCGGCGC…",,"""ATGGAGGCCAAGGTGAGACCCAGCAGAAGA…",
27148,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000198885|ENSG0000019888…","""ATGAATGTTGATGCAGAGGCCTCCATGGCT…","""ATGAATGTTGATGCAGAGGCCTCCATGGCT…",,"""ATGAACGTGGACGCCGAGGCCAGCATGGCC…",
20254,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000204410|ENSG0000020441…","""ATGGCCTCCTTAGGAGCGAACCCAAGGAGG…","""ATGGCCTCCTTAGGAGCGAACCCAAGGAGG…",,"""ATGGCCAGCCTCGGAGCCAACCCTAGAAGA…",
…,…,…,…,…,…,…,…
50708,"""iedb_antigen_aa_seqs.fa""","""sp|Q8IX19|MCEM1_HUMAN Mast cel…","""MEVEEIYKHQEVKMQAPAFRDKKQGVSAKN…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",,"""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",
50709,"""iedb_antigen_aa_seqs.fa""","""sp|Q9D287|SPF27_MOUSE Pre-mRNA…","""MAGTGLVAGEVVVDALPYFDQGYEAPGVRE…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…",,"""ATGGCCGGCACAGGCCTGGTGGCTGGCGAG…",
50710,"""iedb_antigen_aa_seqs.fa""","""tr|Q4CN05|Q4CN05_TRYCC Trans-s…","""MSRHLFYSAVLLLLVVMCCGTAAVNAEELS…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",,"""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",
50711,"""iedb_antigen_aa_seqs.fa""","""tr|Q5NG75|Q5NG75_FRATT Amino-a…","""MDNNQDKLKRDILSRHIVMISLGGTISASF…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…",,"""ATGGACAACAACCAGGACAAGCTGAAGAGA…",


In [9]:
def _add_stop_codon(s: str) -> str:
    _STOP_CODONS = ["TAG", "TAA", "TGA"]
    if s[-3:] not in _STOP_CODONS:
        return s + _STOP_CODONS[0]
    return s

def _format_for_ribonn_input(df: pl.DataFrame, sequence_key: str) -> pl.DataFrame:
    """Assemble a four-column table from ``df`` for use as RiboNN input.

    Args:
        df: Frame providing an ``index`` column and the column named by
            ``sequence_key``.
        sequence_key: Name of the column whose values become the coding
            sequences; each value is passed through ``_add_stop_codon``.

    Returns:
        A new frame with columns ``tx_id`` (copied from ``index``),
        ``utr5_sequence`` and ``utr3_sequence`` (the same fixed string on
        every row), and ``cds_sequence``.
    """
    # Human alpha-globin
    utr5 = "ACTCTTCTGGTCCCCACAGACTCAGAGAGAACCCACC"
    # Human alpha-globin
    utr3 = "GCTGGAGCCTCGGTGGCCATGCTTCTTGCCCCTTGGGCCTCCCCCCAGCCCCTCCTCCCCTTCCTGCACCCGTACCCCCGTGGTCTTTGAATAAAGTCTGAGTGGGCGGCA"

    row_count = df.height

    # Read the identifier column before the sequence column, matching the
    # order in which the output columns are laid out.
    tx_ids = df["index"].to_list()
    coding_sequences = list(map(_add_stop_codon, df[sequence_key].to_list()))

    return pl.DataFrame({
        "tx_id": tx_ids,
        "utr5_sequence": [utr5] * row_count,
        "cds_sequence": coding_sequences,
        "utr3_sequence": [utr3] * row_count,
    })

In [13]:
# Format the original (pre-optimization) sequences: rows with a null
# `input_sequence` are dropped first, since the formatter passes each value to a
# string helper. The result is written tab-separated and displayed.
ribonn_input = _format_for_ribonn_input(df.filter(pl.col("input_sequence").is_not_null()), "input_sequence")
ribonn_input.write_csv("ribonn-prediction-input-input-sequences.tsv", separator="\t")
ribonn_input

tx_id,utr5_sequence,cds_sequence,utr3_sequence
i64,str,str,str
4721,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGAGCAGTCGTAAATCAAAGAGTAACAGC…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
39774,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGCGGATGCCAAGTATGTCCTCTGCCGA…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
27306,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGAGGCCAAGGTCCGCCCGAGCCGGCGC…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
27148,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGAATGTTGATGCAGAGGCCTCCATGGCT…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
20254,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGCCTCCTTAGGAGCGAACCCAAGGAGG…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
…,…,…,…
50708,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
50709,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
50710,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
50711,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"


In [14]:
# Same formatting for the optimized sequences: rows with a null
# `optimized_sequence` are dropped, then the table is written tab-separated to
# its own file and displayed.
ribonn_optimized = _format_for_ribonn_input(df.filter(pl.col("optimized_sequence").is_not_null()), "optimized_sequence")
ribonn_optimized.write_csv("ribonn-prediction-input-optimized-sequences.tsv", separator="\t")
ribonn_optimized

tx_id,utr5_sequence,cds_sequence,utr3_sequence
i64,str,str,str
4721,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGAGCAGCAGAAAGAGCAAGAGCAACAGC…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
39774,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGCCGACGCCAAGTACGTGCTGTGCAGA…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
27306,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGAGGCCAAGGTGAGACCCAGCAGAAGA…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
27148,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGAACGTGGACGCCGAGGCCAGCATGGCC…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
20254,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGCCAGCCTCGGAGCCAACCCTAGAAGA…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
…,…,…,…
50708,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
50709,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGCCGGCACAGGCCTGGTGGCTGGCGAG…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
50710,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
50711,"""ACTCTTCTGGTCCCCACAGACTCAGAGAGA…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…","""GCTGGAGCCTCGGTGGCCATGCTTCTTGCC…"
