In [1]:
import subprocess

import polars as pl

from mrnarchitect.constants.sequences import (
    EGFP,
    LUCIFERASE,
    OVALBUMIN,
    CAS9,
    SARSCOV2_SPIKE,
    COL1A1,
)
from mrnarchitect.sequence.optimize import optimize, OptimizationParameter
from mrnarchitect.sequence import Sequence

In [2]:
def codon_transformer(protein: str) -> str:
    """Run the ``codontransformer:latest`` Docker image on ``protein``.

    The container is invoked as ``docker run -i codontransformer:latest
    <protein> "Homo sapiens"`` with stdout/stderr captured as text.

    Args:
        protein: Protein sequence, passed verbatim as the first container
            argument.

    Returns:
        The container's stdout with surrounding whitespace removed
        (presumably the generated coding sequence; confirm against the image).

    Raises:
        RuntimeError: If the container exits with a non-zero status; the
            message carries the captured stderr.
    """
    command = [
        "docker",
        "run",
        "-i",
        "codontransformer:latest",
        protein,
        "Homo sapiens",  # target organism handed to the container
    ]
    completed = subprocess.run(command, capture_output=True, text=True)

    if completed.returncode == 0:
        return completed.stdout.strip()

    raise RuntimeError(f"CodonTransformer failed to run: {completed.stderr}")


def gemorna(protein: str) -> str:
    """Run GEMORNA's ``src/generate.py`` in CDS mode inside Docker.

    The ``gemorna:latest`` image is started with the
    ``checkpoints/gemorna_cds.pt`` checkpoint and ``protein`` as
    ``--protein_seq``. The reply is scanned line by line: blank lines and
    lines containing ``"Generated"`` are ignored, and the first
    whitespace-separated token of the first remaining line is returned.

    Args:
        protein: Protein sequence handed to ``--protein_seq``.

    Returns:
        The first token of the first usable stdout line.

    Raises:
        RuntimeError: If the container exits non-zero, or if no usable line
            is found in its stdout.
    """
    docker_cmd = [
        "docker",
        "run",
        "-i",
        "gemorna:latest",
        "python",
        "src/generate.py",
        "--mode",
        "cds",
        "--ckpt_path",
        "checkpoints/gemorna_cds.pt",
        "--protein_seq",
        protein,
    ]
    completed = subprocess.run(docker_cmd, capture_output=True, text=True)
    if completed.returncode != 0:
        raise RuntimeError(f"GEMORNA failed to run: {completed.stderr}")

    for raw_line in completed.stdout.split("\n"):
        content = raw_line.strip()
        # Skip padding lines and the "Generated ..." banner line(s).
        if not content or "Generated" in raw_line:
            continue
        return content.split()[0]

    raise RuntimeError(f"GEMORNA could not parse output: {completed.stdout}")


# Default location of the local LinearDesign checkout. It is machine-specific,
# so callers on other machines should pass `lineardesign_dir` explicitly.
LINEARDESIGN_DIR = "/home/jon/dev/base/LinearDesign/"


def lineardesign(
    protein: str,
    lambda_: float = 1.0,
    lineardesign_dir: str = LINEARDESIGN_DIR,
) -> str:
    """Run the local ``bin/LinearDesign_2D`` binary and return its mRNA sequence.

    The binary is executed directly (a containerised podman invocation was
    used previously) from ``lineardesign_dir`` with the arguments
    ``<lambda_> 0 codon_usage_freq_table_human.csv``; ``protein`` is fed on
    stdin. The second argument ``0`` is presumably LinearDesign's verbose
    flag (confirm against the LinearDesign README).

    Args:
        protein: Protein sequence written to the process's stdin.
        lambda_: Value passed as the first command-line argument.
        lineardesign_dir: Working directory for the process; the relative
            paths ``bin/LinearDesign_2D`` and
            ``codon_usage_freq_table_human.csv`` are resolved against it.

    Returns:
        The last whitespace-separated token following ``mRNA sequence:`` on
        the first stdout line that starts with that header.

    Raises:
        RuntimeError: If the process exits non-zero, if no
            ``mRNA sequence:`` line is present, or if that line carries no
            sequence after the header.
    """
    header = "mRNA sequence:"
    result = subprocess.run(
        [
            "bin/LinearDesign_2D",
            str(lambda_),
            "0",
            "codon_usage_freq_table_human.csv",
        ],
        cwd=lineardesign_dir,
        capture_output=True,
        text=True,
        input=protein,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"LinearDesign failed to run: {result.returncode} {result.stdout} {result.stderr}"
        )

    for line in result.stdout.split("\n"):
        if not line.startswith(header):
            continue
        # Only look at what follows the header, so an empty payload is
        # reported as a parse failure instead of returning "sequence:".
        fields = line[len(header):].split()
        if not fields:
            break
        return fields[-1]

    raise RuntimeError(f"LinearDesign could not parse output: {result.stdout}")


def custom(protein: str, top: int = 1) -> list[str]:
    """Run the ``custom:latest`` Docker image and collect its output lines.

    The container is invoked as ``docker run -i custom:latest --top <top>
    <protein>``, with leading and trailing ``*`` characters removed from
    ``protein`` first.

    Args:
        protein: Protein sequence; any ``*`` at either end is stripped
            before it is passed to the container.
        top: Value passed to the container's ``--top`` option.

    Returns:
        The non-blank stdout lines that do not start with ``"Warning"``,
        each with surrounding whitespace removed. An empty list is returned
        when the container prints nothing usable.

    Raises:
        RuntimeError: If the container exits with a non-zero status.
    """
    result = subprocess.run(
        [
            "docker",
            "run",
            "-i",
            "custom:latest",
            "--top",
            str(top),
            protein.strip("*"),
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        raise RuntimeError(f"CUSTOM failed to run: {result.stderr}")

    sequences = []
    for raw_line in result.stdout.splitlines():
        line = raw_line.strip()
        # Drop blank lines so empty output yields [] rather than [""], and
        # skip the tool's warning lines.
        if line and not line.startswith("Warning"):
            sequences.append(line)
    return sequences


def mrnarchitect_tai(protein: str) -> str:
    """Optimise ``protein`` with mRNArchitect using one tAI-oriented parameter set.

    A single ``OptimizationParameter`` is built with ``optimize_tai=1.0`` on
    the ``"homo-sapiens"`` codon usage table and passed to ``optimize``
    together with ``Sequence.create(protein)``.

    Args:
        protein: Protein sequence given to ``Sequence.create``.

    Returns:
        ``str()`` of the ``result.sequence`` attribute of the value returned
        by ``optimize``.
    """
    settings = {
        "enforce_sequence": False,
        "codon_usage_table": "homo-sapiens",
        "optimize_tai": 1.0,
        "avoid_repeat_length": 10,
        # Optional avoidance features are switched off for this run.
        "enable_uridine_depletion": False,
        "avoid_ribosome_slip": False,
        "avoid_manufacture_restriction_sites": False,
        "avoid_micro_rna_seed_sites": False,
        # GC content bounds and the window setting that accompanies them.
        "gc_content_min": 0.4,
        "gc_content_max": 0.7,
        "gc_content_window": 100,
        "avoid_restriction_sites": [],
        "avoid_sequences": [],
        # Per-nucleotide homopolymer settings.
        "avoid_poly_a": 9,
        "avoid_poly_c": 6,
        "avoid_poly_g": 6,
        "avoid_poly_t": 9,
        # Hairpin settings.
        "hairpin_stem_size": 10,
        "hairpin_window": 60,
    }
    tai_parameter = OptimizationParameter(**settings)
    outcome = optimize(Sequence.create(protein), parameters=[tai_parameter])
    return str(outcome.result.sequence)


def max_cai(protein: str) -> str:
    """Return ``str(Sequence.create(protein))`` after checking its CAI is 1.

    NOTE(review): this relies on ``Sequence.create`` already yielding a
    sequence whose codon adaptation index is exactly 1 (presumably by picking
    the most-used codon for each residue); confirm against mrnarchitect.

    Args:
        protein: Protein sequence given to ``Sequence.create``.

    Returns:
        The string form of the created sequence.

    Raises:
        AssertionError: If ``codon_adaptation_index()`` is not equal to 1.
    """
    sequence = Sequence.create(protein)
    # Sanity check only: the sequence is returned unchanged.
    assert sequence.codon_adaptation_index() == 1
    return str(sequence)

In [3]:
# Protein sequences defined inline in this notebook; the remaining entries of
# PROTEINS below are imported from mrnarchitect.constants.sequences.
# NOTE(review): CRE_RECOMBINASE and CD19 end in "**" while HA_H1N1 ends in a
# single "*" (presumably stop markers); confirm the double terminator is
# intentional.
CRE_RECOMBINASE = "MSNLLTVHQNLPALPVDATSDEVRKNLMDMFRDRQAFSEHTWKMLLSVCRSWAAWCKLNNRKWFPAEPEDVRDYLLYLQARGLAVKTIQQHLGQLNMLHRRSGLPRPSDSNAVSLVMRRIRKENVDAGERAKQALAFERTDFDQVRSLMENSDRCQDIRNLAFLGIAYNTLLRIAEIARIRVKDISRTDGGRMLIHIGRTKTLVSTAGVEKALSLGVTKLVERWISVSGVADDPNNYLFCRVRKNGVAAPSATSQLSTRALEGIFEATHRLIYGAKDDSGQRYLAWSGHSARVGAARDMARAGVSIPEIMQAGGWTNVNIVMNYIRNLDSETGAMVRLLEDGD**"
CD19 = "MLLLVTSLLLCELPHPAFLLIPDIQMTQTTSSLSASLGDRVTISCRASQDISKYLNWYQQKPDGTVKLLIYHTSRLHSGVPSRFSGSGSGTDYSLTISNLEQEDIATYFCQQGNTLPYTFGGGTKLEITGSTSGSGKPGSGEGSTKGEVKLQESGPGLVAPSQSLSVTCTVSGVSLPDYGVSWIRQPPRKGLEWLGVIWGSETTYYNSALKSRLTIIKDNSKSQVFLKMNSLQTDDTAIYYCAKHYYYGGSYAMDYWGQGTSVTVSSAAAIEVMYPPPYLDNEKSNGTIIHVKGKHLCPSPLFPGPSKPFWVLVVVGGVLACYSLLVTVAFIIFWVRSKRSRLLHSDYMNMTPRRPGPTRKHYQPYAPPRDFAAYRSRVKFSRSADAPAYQQGQNQLYNELNLGRREEYDVLDKRRGRDPEMGGKPRRKNPQEGLYNELQKDKMAEAYSEIGMKGERRRGKGHDGLYQGLSTATKDTYDALHMQALPPR**"
HA_H1N1 = "MKAILVVLLYTFATANADTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAPLHLGKCNIAGWILGNPECESLSTASSWSYIVETSSSDNGTCYPGDFIDYEELREQLSSVSSFERFEIFPKTSSWPNHDSNKGVTAACPHAGAKSFYKNLIWLVKKGNSYPKLSKSYINDKGKEVLVLWGIHHPSTSADQQSLYQNADAYVFVGSSRYSKKFKPEIAIRPKVRDQEGRMNYYWTLVEPGDKITFEATGNLVVPRYAFAMERNAGSGIIISDTPVHDCNTTCQTPKGAINTSLPFQNIHPITIGKCPKYVKSTKLRLATGLRNVPSIQSRGLFGAIAGFIEGGWTGMVDGWYGYHHQNEQGSGYAADLKSTQNAIDEITNKVNSVIEKMNTQFTAVGKEFNHLEKRIENLNKKVDDGFLDIWTYNAELLVLLENERTLDYHDSNVKNLYEKVRSQLKNNAKEIGNGCFEFYHKCDNTCMESVKNGTYDYPKYSEEAKLNREEIDGVKLESTRIYQILAIYSTVASSLVLVVSLGAISFWMCSNGSLQC*"

# Display name -> protein sequence for every protein in the benchmark. The
# keys end up in the "name" field of each result record.
PROTEINS = {
    "Luciferase": LUCIFERASE,
    "Cre recombinase": CRE_RECOMBINASE,
    "Anti-hCD19-h28z CAR": CD19,
    "HA/H1N1": HA_H1N1,
    "EGFP": EGFP,
    "Ovalbumin": OVALBUMIN,
    "Cas9": CAS9,
    "SARS-CoV-2": SARSCOV2_SPIKE,
    "COL1A1": COL1A1,
}

# One record (dict with "algorithm", "name" and "sequence" keys) is collected
# per generated sequence; the list is turned into a DataFrame after the loop.
results = []

# Proteins are processed from the longest sequence to the shortest.
for name, protein in sorted(PROTEINS.items(), key=lambda x: len(x[1]), reverse=True):
    # The commented-out sections below are the other algorithms of the
    # comparison; only the mRNArchitect tAI run is currently active.
    # NOTE(review): the GEMORNA and LinearDesign sections use
    # `concurrent.futures`, which is not among the imports visible at the top
    # of the notebook; add `import concurrent.futures` before re-enabling them.

    # CodonTransformer
    # print(f"{name}: Processing CodonTransformer...")
    # results.append({
    #     "algorithm": "CodonTransformer",
    #     "name": name,
    #     "sequence": codon_transformer(protein),
    # })

    # CUSTOM (top 10 sequences per protein)
    # print(f"{name}: Processing CUSTOM...")
    # for output_sequence in custom(protein, top=10):
    #     results.append({
    #         "algorithm": "CUSTOM",
    #         "name": name,
    #         "sequence": output_sequence,
    #     })

    # GEMORNA (10 independent runs per protein, executed in parallel)
    # print(f"{name}: Processing GEMORNA...")
    # with concurrent.futures.ProcessPoolExecutor() as executor:
    #     results.extend([
    #         {
    #             "algorithm": "GEMORNA",
    #             "name": name,
    #             "sequence": it
    #         }
    #         for it in executor.map(
    #             gemorna,
    #             [protein] * 10,
    #         )
    #     ])

    # LinearDesign (one run per lambda in 0..10, single worker process;
    # failed runs are printed and skipped)
    # print(f"{name}: Processing LinearDesign...")
    # lambdas = list(range(0, 11))
    # with concurrent.futures.ProcessPoolExecutor(1) as executor:
    #     futures = [
    #         executor.submit(
    #             lineardesign,
    #             protein,
    #             lambda_
    #         )
    #         for lambda_ in lambdas
    #     ]
    #     for result in concurrent.futures.as_completed(futures):
    #         try:
    #             results.append({
    #                 "algorithm": "LinearDesign",
    #                 "name": name,
    #                 "sequence": result.result(),
    #             })
    #         except RuntimeError as e:
    #             print(e)

    # mRNArchitect with the tAI objective: one sequence per protein.
    print(f"{name}: mRNArchitect - tAI")
    result = mrnarchitect_tai(protein)
    results.append(
        {
            "algorithm": "mRNArchitect - tAI",
            "name": name,
            "sequence": result,
        }
    )

    # Max CAI
    # print(f"{name}: Max CAI")
    # result = max_cai(protein)
    # results.append({"algorithm": "Max CAI", "name": name, "sequence": result})

# Build the results table in one go and write it to disk.
# NOTE(review): the file name mentions LinearDesign, but with the loop as
# written only "mRNArchitect - tAI" rows are saved, and re-running will
# overwrite any earlier file of this name. Confirm the intended output path.
df = pl.DataFrame(results)
df.write_csv("sequence-lineardesign.csv")

SARS-CoV-2: Processing LinearDesign...
