In [1]:
import os
import subprocess
import tempfile

import polars as pl

from mrnarchitect.utils.fasta import parse_fasta_file

In [2]:
# Load the sequence table from "optimized-sequences.csv" (path is relative to
# the notebook's working directory). Later cells read its "index",
# "input_sequence" and "optimized_sequence" columns.
df = pl.read_csv("optimized-sequences.csv")
# Bare expression: renders the frame as this cell's output.
df

index,source,name,raw_input_sequence,input_sequence,error,optimized_sequence,optimization_error
i64,str,str,str,str,str,str,str
0,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_heav…","""QVQLQQPGAELVKPGASVKMSCKASGYTFT…","""CAGGTGCAGCTGCAGCAGCCCGGCGCCGAG…",,"""CAGGTGCAGCTCCAGCAGCCAGGCGCCGAA…",
1,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_ligh…","""QIVLSQSPAILSASPGEKVTMTCRASSSVS…","""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",,"""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",
2,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Lig…","""DIQMTQSPSSLSASVGDRVTITCRASQDVN…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",,"""GACATCCAGATGACCCAGAGCCCCAGCTCC…",
3,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Hea…","""EVQLVESGGGLVQPGGSLRLSCAASGFNIK…","""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",,"""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",
4,"""antibody_monoclonal_aa_seqs.fa…","""Bevacizumab_light_chain""","""DIQMTQSPSSLSASVGDRVTITCSASQDIS…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",,"""GACATCCAGATGACCCAGAGCCCCAGCTCC…",
…,…,…,…,…,…,…,…
47568,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000173846|ENSG0000017384…","""ATGGAGCCTGCCGCCGGTTTCCTGTCTCCG…","""ATGGAGCCTGCCGCCGGTTTCCTGTCTCCG…",,"""ATGGAGCCTGCCGCTGGCTTTCTGTCTCCT…",
47569,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000142166|ENSG0000014216…","""ATGGATAATTGGATAAAATTGTCTGGGTGT…","""ATGGATAATTGGATAAAATTGTCTGGGTGT…",,"""ATGGACAACTGGATCAAGCTGAGCGGCTGC…",
47570,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000180488|ENSG0000018048…","""ATGTCAGACTGCTGCTCAGCGCCAGGCATC…","""ATGTCAGACTGCTGCTCAGCGCCAGGCATC…",,"""ATGAGCGACTGCTGCAGCGCCCCCGGCATC…",
47571,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000145216|ENSG0000014521…","""ATGTCGGCCGGCGAGGTCGAGCGCCTAGTG…","""ATGTCGGCCGGCGAGGTCGAGCGCCTAGTG…",,"""ATGAGCGCCGGCGAGGTCGAGAGACTGGTG…",


In [3]:
def _add_stop_codon(s: str) -> str:
    _STOP_CODONS = ["TAG", "TAA", "TGA"]
    if s[-3:] not in _STOP_CODONS:
        return s + _STOP_CODONS[0]
    return s

def _format_for_ribonn_input(df: pl.DataFrame, sequence_key: str) -> pl.DataFrame:
    """Build the four-column table that is written out as RiboNN input.

    Args:
        df: Frame holding an ``index`` column and the ``sequence_key`` column.
        sequence_key: Name of the column whose values become the coding
            sequences.

    Returns:
        A frame with columns ``tx_id`` (copied from ``index``),
        ``utr5_sequence``, ``cds_sequence`` (each sequence passed through
        ``_add_stop_codon``) and ``utr3_sequence``. The same fixed 5' and 3'
        UTR strings are repeated on every row.
    """
    # Both UTRs are the human alpha-globin ones.
    FIVE_PRIME_UTR = "ACTCTTCTGGTCCCCACAGACTCAGAGAGAACCCACC"
    THREE_PRIME_UTR = "GCTGGAGCCTCGGTGGCCATGCTTCTTGCCCCTTGGGCCTCCCCCCAGCCCCTCCTCCCCTTCCTGCACCCGTACCCCCGTGGTCTTTGAATAAAGTCTGAGTGGGCGGCA"

    transcript_ids = df["index"].to_list()
    coding_sequences = [_add_stop_codon(seq) for seq in df[sequence_key].to_list()]
    # One list entry per row of ``df``, so this equals the frame's row count.
    row_count = len(transcript_ids)

    return pl.DataFrame(
        {
            "tx_id": transcript_ids,
            "utr5_sequence": [FIVE_PRIME_UTR] * row_count,
            "cds_sequence": coding_sequences,
            "utr3_sequence": [THREE_PRIME_UTR] * row_count,
        }
    )

def _ribonn(df: pl.DataFrame, type_: str) -> pl.DataFrame:
    """Run the RiboNN Docker image over one sequence column, in batches.

    The ``f"{type_}_sequence"`` column of ``df`` is formatted with
    ``_format_for_ribonn_input`` and processed ``_SLICE_SIZE`` rows at a time.
    Each batch is written as a tab-separated ``prediction_input1.txt`` into a
    fresh temporary directory, which is mounted into the ``ribonn:latest``
    container as both ``/opt/RiboNN/data`` and ``/opt/RiboNN/results``. After
    ``make predict_human`` finishes, ``human/prediction_output.txt`` is read
    back from that directory.

    Args:
        df: Frame with an ``index`` column and a ``{type_}_sequence`` column.
        type_: Selects the sequence column (e.g. ``"input"``) and is also
            prepended to every returned column name except ``index``.

    Returns:
        ``index`` plus the prediction columns renamed to ``{type_}_<name>``;
        the ``tx_id`` and the three sequence columns are dropped. When ``df``
        has no rows, an empty frame containing only ``index`` is returned.

    Raises:
        RuntimeError: If the Docker command exits with a non-zero status.
    """
    _SLICE_SIZE = 1000
    input_df = _format_for_ribonn_input(df, f"{type_}_sequence")
    input_length = input_df.select(pl.len()).item()

    if input_length == 0:
        # No batches would be produced and pl.concat() rejects an empty list,
        # so hand back an empty result that still carries the join key.
        return pl.DataFrame(schema={"index": df.schema["index"]})

    output_dfs = []
    for i in range(0, input_length, _SLICE_SIZE):
        # Progress marker; the upper bound may exceed the row count on the
        # final batch.
        print(i, i + _SLICE_SIZE)
        slice_ = input_df.slice(i, _SLICE_SIZE)
        # NOTE(review): delete=False leaves every batch directory on disk
        # after the run. Presumably deliberate (e.g. container-owned files
        # that cannot be removed, or kept for debugging) - confirm, and clean
        # up the temp directories manually if not.
        with tempfile.TemporaryDirectory(delete=False) as temp_dir:
            slice_.write_csv(
                os.path.join(temp_dir, "prediction_input1.txt"),
                separator="\t",
            )
            result = subprocess.run(
                [
                    "docker",
                    "run",
                    "--volume",
                    f"{temp_dir}:/opt/RiboNN/data",
                    "--volume",
                    f"{temp_dir}:/opt/RiboNN/results",
                    "--shm-size=2g",
                    "ribonn:latest",
                    "make",
                    "predict_human",
                ],
                # stderr is captured (not discarded) so a failure can be
                # reported; stdout is still silenced.
                stderr=subprocess.PIPE,
                stdout=subprocess.DEVNULL,
                text=True,
                errors="replace",
            )
            if result.returncode != 0:
                # Fail here with the container's own message instead of a
                # confusing missing-file error from read_csv below.
                raise RuntimeError(
                    f"RiboNN docker run failed with exit code {result.returncode} "
                    f"for rows starting at {i} (working directory: {temp_dir}):\n"
                    f"{result.stderr[-2000:]}"
                )
            prediction_output = pl.read_csv(
                os.path.join(temp_dir, "human", "prediction_output.txt"),
                separator="\t",
            ).insert_column(
                0, pl.col("tx_id").alias("index")
            ).drop(
                ["tx_id", "utr5_sequence", "cds_sequence", "utr3_sequence"]
            ).rename(lambda s: s if s == "index" else f"{type_}_{s}")
            output_dfs.append(prediction_output)
    # NOTE(review): every batch has identical column names, so "align" keys on
    # all of them; how="vertical" may be what is intended here - confirm
    # before changing, since it affects row order.
    return pl.concat(output_dfs, how="align")

# Run RiboNN on the "input_sequence" column for every row of df.
# Each call below launches one Docker run per 1000-row batch, so this cell is slow.
input_output = _ribonn(df, "input")
# Rows whose "optimized_sequence" is null are dropped before predicting on that column.
optimized_output = _ribonn(df.filter(pl.col("optimized_sequence").is_not_null()), "optimized")
# _ribonn prefixes every column except "index", so "index" is the only name the
# two frames share and the "align" strategy matches rows on it. Rows without an
# optimized sequence presumably end up with null optimized_* values.
output = pl.concat([input_output, optimized_output], how="align")
# Persist the combined predictions next to the notebook, then display them.
output.write_csv("output-ribonn.csv")
output

   

0 1000
1000 2000
2000 3000
3000 4000
4000 5000
5000 6000
6000 7000
7000 8000
8000 9000
9000 10000
10000 11000
11000 12000
12000 13000
13000 14000
14000 15000
15000 16000
16000 17000
17000 18000
18000 19000
19000 20000
20000 21000
21000 22000
22000 23000
23000 24000
24000 25000
25000 26000
26000 27000
27000 28000
28000 29000
29000 30000
30000 31000
31000 32000
32000 33000
33000 34000
34000 35000
35000 36000
36000 37000
37000 38000
38000 39000
39000 40000
40000 41000
41000 42000
42000 43000
43000 44000
44000 45000
45000 46000
46000 47000
47000 48000
48000 49000
49000 50000
50000 51000
0 1000
1000 2000
2000 3000
3000 4000
4000 5000
5000 6000
6000 7000
7000 8000
8000 9000
9000 10000
10000 11000
11000 12000
12000 13000
13000 14000
14000 15000
15000 16000
16000 17000
17000 18000
18000 19000
19000 20000
20000 21000
21000 22000
22000 23000
23000 24000
24000 25000
25000 26000
26000 27000
27000 28000
28000 29000
29000 30000
30000 31000
31000 32000
32000 33000
33000 34000
34000 35000
35000 36000


index,input_predicted_TE_108T,input_predicted_TE_12T,input_predicted_TE_A2780,input_predicted_TE_A549,input_predicted_TE_BJ,input_predicted_TE_BRx.142,input_predicted_TE_C643,input_predicted_TE_CRL.1634,input_predicted_TE_Calu.3,input_predicted_TE_Cybrid_Cells,input_predicted_TE_H1.hESC,input_predicted_TE_H1933,input_predicted_TE_H9.hESC,input_predicted_TE_HAP.1,input_predicted_TE_HCC_tumor,input_predicted_TE_HCC_adjancent_normal,input_predicted_TE_HCT116,input_predicted_TE_HEK293,input_predicted_TE_HEK293T,input_predicted_TE_HMECs,input_predicted_TE_HSB2,input_predicted_TE_HSPCs,input_predicted_TE_HeLa,input_predicted_TE_HeLa_S3,input_predicted_TE_HepG2,input_predicted_TE_Huh.7.5,input_predicted_TE_Huh7,input_predicted_TE_K562,input_predicted_TE_Kidney_normal_tissue,input_predicted_TE_LCL,input_predicted_TE_LuCaP.PDX,input_predicted_TE_MCF10A,input_predicted_TE_MCF10A.ER.Src,input_predicted_TE_MCF7,input_predicted_TE_MD55A3,input_predicted_TE_MDA.MB.231,…,optimized_predicted_TE_PATU.8902,optimized_predicted_TE_PC3,optimized_predicted_TE_PC9,optimized_predicted_TE_Primary_CD4._T.cells,optimized_predicted_TE_Primary_human_bronchial_epithelial_cells,optimized_predicted_TE_RD.CCL.136,optimized_predicted_TE_RPE.1,optimized_predicted_TE_SH.SY5Y,optimized_predicted_TE_SUM159PT,optimized_predicted_TE_SW480TetOnAPC,optimized_predicted_TE_T47D,optimized_predicted_TE_THP.1,optimized_predicted_TE_U.251,optimized_predicted_TE_U.343,optimized_predicted_TE_U2392,optimized_predicted_TE_U2OS,optimized_predicted_TE_Vero_6,optimized_predicted_TE_WI38,optimized_predicted_TE_WM902B,optimized_predicted_TE_WTC.11,optimized_predicted_TE_ZR75.1,optimized_predicted_TE_cardiac_fibroblasts,optimized_predicted_TE_ccRCC,optimized_predicted_TE_early_neurons,optimized_predicted_TE_fibroblast,optimized_predicted_TE_hESC,optimized_predicted_TE_human_brain_tumor,optimized_predicted_TE_iPSC.differentiated_dopamine_neurons,optimized_predicted_TE_megakaryocytes,optimized_predicted_TE_muscle_tissue,opti
mized_predicted_TE_neuronal_precursor_cells,optimized_predicted_TE_neurons,optimized_predicted_TE_normal_brain_tissue,optimized_predicted_TE_normal_prostate,optimized_predicted_TE_primary_macrophages,optimized_predicted_TE_skeletal_muscle,optimized_mean_predicted_TE
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,0.253488,0.614285,0.792056,0.519216,0.816046,1.1195401,2.2062094,0.503505,0.3272646,0.444982,0.844325,0.7736777,0.584712,0.5394196,0.7550019,0.6028141,1.1232446,0.741804,0.760709,-0.228514,0.3760771,0.907626,0.6758595,0.8628132,0.6503796,0.4865625,0.5931593,0.565061,0.929223,0.8871609,0.61046,0.758651,0.423212,1.1277047,0.172219,0.137008,…,0.871899,0.221591,0.5025283,0.837004,1.0401903,0.678217,0.489875,0.6546557,-0.25973,1.0453405,-0.090936,0.200537,0.836385,0.8797067,1.7117574,0.8587179,0.522966,0.292864,0.3436048,0.4736609,0.4147542,0.8005847,0.921212,0.7710437,0.778489,0.849324,0.880333,0.7517811,0.7315919,0.390224,0.818275,0.8951181,0.7017768,0.155682,1.3600357,3.4466224,0.754483
1,0.847973,1.07495,0.9959256,0.8249691,1.0605981,1.1147215,2.1278448,1.0467718,0.726657,0.8296013,1.5085331,1.231389,1.1659875,0.9865426,1.0608604,1.0074418,1.4931152,1.0974948,1.1162174,0.3199896,0.657933,1.202469,1.0578295,1.185262,0.6009719,0.692908,0.7974707,0.9909,1.3064477,1.4553355,0.9532409,1.1226814,0.8446697,1.531464,0.7379033,0.6049614,…,1.2015786,0.4993185,1.0303148,1.1817547,1.3104477,0.8318488,0.883257,1.0325949,0.090755,1.2311943,0.369411,0.3254848,1.3621538,1.4288645,1.93615,1.2426196,0.8673431,0.6951815,0.7150792,0.6101947,0.837011,1.2391798,1.2066175,0.9703537,0.9496115,1.0197175,1.2176926,0.9532372,1.2390183,0.5368302,1.0375001,1.1294162,1.059294,0.491357,1.2789897,3.8240151,1.0833449
2,0.759967,1.010431,0.968812,0.7802428,1.0205295,1.0751226,2.0653963,0.9879586,0.7017498,0.7836806,1.4175316,1.1565343,1.0778546,0.917016,1.0301692,0.9616278,1.4220483,1.0289979,1.0562563,0.253861,0.626423,1.1426154,1.0034832,1.1413226,0.5908534,0.6773013,0.7614638,0.9195811,1.2288857,1.3714561,0.9095656,1.0725663,0.8115738,1.4611595,0.652575,0.552278,…,1.1954304,0.485574,0.998297,1.1560543,1.2854921,0.8204565,0.8605016,1.0058205,0.0657,1.2133551,0.343,0.291535,1.3299663,1.3961701,1.9028994,1.2153448,0.854497,0.6739579,0.694494,0.6013955,0.8138114,1.2108796,1.185256,0.95712,0.931708,1.0089494,1.2047646,0.9360033,1.2129343,0.5173305,1.0251462,1.1166493,1.0464458,0.471589,1.2605698,3.7981772,1.0613674
3,0.107877,0.441651,0.707987,0.413888,0.696158,0.9153597,2.0552616,0.3674227,0.27275,0.328371,0.6470643,0.6047133,0.39444,0.3860386,0.7090323,0.562477,0.929283,0.591255,0.616319,-0.321924,0.3000226,0.7575221,0.553039,0.7544463,0.5929974,0.456194,0.525319,0.421807,0.731104,0.6988022,0.5081074,0.6298598,0.337637,0.9106743,0.028175,0.0429694,…,0.8593281,0.167384,0.358432,0.700687,0.9024234,0.596666,0.3680349,0.506829,-0.330879,0.927464,-0.177266,0.044738,0.6422909,0.684584,1.5439274,0.6982418,0.452758,0.186831,0.250605,0.43216,0.292467,0.6378528,0.7829395,0.681117,0.676628,0.7573891,0.7817484,0.6192398,0.586395,0.323925,0.730516,0.802656,0.6114734,0.097161,1.2853097,3.2314517,0.6384625
4,0.7358734,0.972929,0.9540945,0.7622268,0.998159,1.0200843,2.0333738,0.9609257,0.693144,0.7617453,1.380484,1.1240928,1.0416232,0.8883708,1.030685,0.9649569,1.3815012,1.001748,1.0290205,0.241546,0.61275,1.1117462,0.9815463,1.1230274,0.5780478,0.6776961,0.7521783,0.8911373,1.1892643,1.3357149,0.89299,1.0484668,0.7931306,1.4146512,0.6279057,0.538023,…,1.1998996,0.481772,0.983331,1.1406448,1.2682314,0.8105776,0.846244,0.988835,0.06319,1.2002461,0.336841,0.2714213,1.3052473,1.3706836,1.8831927,1.1961652,0.845723,0.6628633,0.6859156,0.6015581,0.7999197,1.1907533,1.1693108,0.9473254,0.9222889,0.9979569,1.1916349,0.9179659,1.1955465,0.5148094,1.0156183,1.1068051,1.0353965,0.4716111,1.2532126,3.7693012,1.0488142
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
50708,1.0706306,1.170573,1.0313189,0.91166,1.1060662,1.0314976,2.159444,1.1721574,0.8072047,0.9158697,1.6986682,1.3622816,1.3435968,1.1220086,1.1700351,1.1838175,1.5725064,1.2174841,1.2138147,0.5248754,0.7249952,1.2749355,1.1661515,1.2648822,0.5740919,0.752948,0.879287,1.1281374,1.4051195,1.6132431,1.0423176,1.2080768,0.9232828,1.5921457,0.948232,0.7519143,…,1.3116796,0.6218146,1.2458134,1.319539,1.4236214,0.8613818,1.0179858,1.178218,0.339786,1.2891085,0.6082841,0.5126666,1.5057809,1.5723813,2.089919,1.37114,0.9522238,0.8407547,0.8666975,0.681446,0.978691,1.3891232,1.2881597,1.022711,1.0376573,1.0446404,1.2772572,0.9886265,1.3990523,0.6954912,1.0844231,1.1800522,1.139241,0.695935,1.3678448,3.8581703,1.2115645
50709,1.3014772,1.2845919,1.0603237,0.99731,1.1670156,1.0634767,2.3141923,1.2805418,0.845857,0.998167,1.9003232,1.5137389,1.5411793,1.2773501,1.2434914,1.3255355,1.6920849,1.3675518,1.3293021,0.711816,0.782976,1.3846077,1.2758167,1.3381801,0.57531,0.7825315,0.961226,1.2926133,1.5636761,1.7853148,1.1180578,1.2987791,0.97638,1.699074,1.1753376,0.8754751,…,1.3695168,0.736948,1.4648252,1.4764965,1.567291,0.892069,1.1567878,1.3444703,0.6089143,1.3555071,0.8543747,0.794685,1.6630685,1.7252915,2.3039649,1.5143151,1.0120987,0.9863243,1.0146372,0.7418891,1.1218561,1.5483927,1.3867316,1.0720313,1.1368136,1.0653042,1.3256581,1.0428888,1.5497099,0.88264,1.1213304,1.2178777,1.205434,0.8967751,1.5335677,3.9044342,1.3487965
50710,1.1229019,1.1085227,0.96808,0.8795913,1.088381,1.2214227,2.6344292,1.0276147,0.595992,0.8307127,1.6746883,1.3887286,1.3673583,1.1506523,1.0835774,1.1606451,1.6064306,1.3017013,1.2259747,0.492711,0.6563829,1.3479342,1.1325569,1.2018325,0.6448239,0.6491319,0.907478,1.204046,1.5330416,1.6017969,0.9663531,1.1579156,0.747945,1.5974716,1.0177462,0.6576992,…,1.0825974,0.5858834,1.2165188,1.3567803,1.5228009,0.8333435,0.9792615,1.2147768,0.461543,1.2979891,0.6119395,0.9702053,1.4319239,1.4803714,2.352606,1.3668377,0.7981844,0.775153,0.82452,0.638852,0.8963872,1.3814957,1.2886574,0.9784096,1.090427,0.9902588,1.1463172,1.0019,1.3006979,0.8740586,1.001282,1.087748,1.0052165,0.719569,1.759931,3.7801983,1.2126521
50711,1.3101046,1.2596134,1.0345381,0.9810789,1.174655,1.2472676,2.6612248,1.1868027,0.7001299,0.9482628,1.8798516,1.5430408,1.5530257,1.293074,1.1709387,1.2726218,1.7472824,1.4284544,1.3499123,0.6532406,0.735993,1.4552553,1.2573195,1.3086215,0.639422,0.706912,0.9750854,1.3398337,1.6695311,1.786054,1.0755404,1.2768188,0.859194,1.7424492,1.1944427,0.7972637,…,1.1603525,0.649872,1.3619101,1.4624078,1.6092418,0.8873898,1.0877794,1.322872,0.5397,1.3703873,0.725534,1.0061898,1.5925729,1.6467812,2.4417472,1.4851198,0.8913649,0.88015,0.921937,0.683,1.0209997,1.5052301,1.381404,1.042858,1.1503761,1.0470207,1.2303951,1.0725034,1.4374596,0.914721,1.0720842,1.1625565,1.0888495,0.79708,1.7526413,3.929058,1.3079548
