In [15]:
import re
from pathlib import Path

import plotnine as p9
import polars as pl

In [16]:
# Despite the name, this maps the file names found in the "source" column to the
# short display labels stored in "source_name" (which the plots use as x-axis
# categories). Keys must match the "source" values exactly.
COLUMN_NAMES: dict[str, str] = {
    "ensembl_ccds_tx_nt_seq.fa": "Human ORFs",
    "hrt_hk_nt_seq_pulled_from_ensembl.fa": "Housekeeping",
    "ena_cancer_vaccine_seq.fa": "Cancer",
    "ena_car-t_nt_seq.fa": "CAR-T",
    "antibody_monoclonal_aa_seqs.fasta": "Antibody",
    "iedb_antigen_aa_seqs.fa": "Antigen",
}

def _is_amino_acid_sequence(s: str) -> bool:
    return bool(re.search(r'[^ACGTU]+', s))

# Combine the optimizer output with every per-metric table into one wide frame.
# how="align" lets polars line the frames up on the columns they have in common
# instead of stacking them vertically.
df = pl.concat(
    [
        pl.read_csv("optimized-sequences.csv"),
        pl.read_csv("output-csc.csv"),
        pl.read_csv("output-hamming-distance.csv"),
        pl.read_csv("output-hydrated.csv"),
        pl.read_csv("output-ribonn.csv"),
        pl.read_csv("output-degscore.csv"),
    ],
    how="align",
).with_columns(
    # replace_strict raises when a "source" value has no entry in COLUMN_NAMES,
    # so an unexpected input file is surfaced here rather than plotted unlabeled.
    source_name=pl.col("source").replace_strict(COLUMN_NAMES),
    # True when the raw input holds any character outside A/C/G/T/U
    # (case-sensitive). Evaluated as a native polars string expression rather
    # than a per-row Python callback; null inputs stay null.
    is_amino_acid=pl.col("raw_input_sequence").str.contains(r"[^ACGTU]"),
)
df

index,source,name,raw_input_sequence,input_sequence,error,optimized_sequence,optimization_error,input_csc,optimized_csc,hamming_distance,hamming_ratio,input_a,input_c,input_g,input_t,input_gc,input_gc1,input_gc2,input_gc3,input_cpg,input_uridine_depletion,input_cai,input_tai,input_mfe,input_amfe,input_mfe_structure,input_wmfe,input_gini,input_slippery_site_ratio,input_rscu,input_rcbs,input_dcbs,input_rcr,input_cub,input_cbi,optimized_a,…,optimized_predicted_TE_RPE.1,optimized_predicted_TE_SH.SY5Y,optimized_predicted_TE_SUM159PT,optimized_predicted_TE_SW480TetOnAPC,optimized_predicted_TE_T47D,optimized_predicted_TE_THP.1,optimized_predicted_TE_U.251,optimized_predicted_TE_U.343,optimized_predicted_TE_U2392,optimized_predicted_TE_U2OS,optimized_predicted_TE_Vero_6,optimized_predicted_TE_WI38,optimized_predicted_TE_WM902B,optimized_predicted_TE_WTC.11,optimized_predicted_TE_ZR75.1,optimized_predicted_TE_cardiac_fibroblasts,optimized_predicted_TE_ccRCC,optimized_predicted_TE_early_neurons,optimized_predicted_TE_fibroblast,optimized_predicted_TE_hESC,optimized_predicted_TE_human_brain_tumor,optimized_predicted_TE_iPSC.differentiated_dopamine_neurons,optimized_predicted_TE_megakaryocytes,optimized_predicted_TE_muscle_tissue,optimized_predicted_TE_neuronal_precursor_cells,optimized_predicted_TE_neurons,optimized_predicted_TE_normal_brain_tissue,optimized_predicted_TE_normal_prostate,optimized_predicted_TE_primary_macrophages,optimized_predicted_TE_skeletal_muscle,optimized_mean_predicted_TE,input_degscore,input_est_half_life,optimized_degscore,optimized_est_half_life,source_name,is_amino_acid
i64,str,str,str,str,str,str,str,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,bool
0,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_heav…","""QVQLQQPGAELVKPGASVKMSCKASGYTFT…","""CAGGTGCAGCTGCAGCAGCCCGGCGCCGAG…",,"""CAGGTGCAGCTCCAGCAGCCAGGCGCCGAA…",,0.037988,0.033325,50,0.036955,0.233555,0.350333,0.302291,0.113821,0.652624,0.652624,0.465632,0.982262,0.060606,0.0,1.0,0.372443,-522.0,-0.385809,"""(((((.(.((((...))))..(((((((.(…",-8.831611,0.163581,0.0,3.567627,0.815107,2.385627,0.0,1.093384,1.0,0.232077,…,0.489875,0.6546557,-0.25973,1.0453405,-0.090936,0.200537,0.836385,0.8797067,1.7117574,0.8587179,0.522966,0.292864,0.3436048,0.4736609,0.4147542,0.8005847,0.921212,0.7710437,0.778489,0.849324,0.880333,0.7517811,0.7315919,0.390224,0.818275,0.8951181,0.7017768,0.155682,1.3600357,3.4466224,0.754483,409.421,0.736572,411.523,0.733017,"""Antibody""",true
1,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_ligh…","""QIVLSQSPAILSASPGEKVTMTCRASSSVS…","""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",,"""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",,0.036312,0.035658,15,0.023474,0.239437,0.348983,0.303599,0.107981,0.652582,0.652582,0.507042,0.971831,0.059468,0.0,1.0,0.362417,-227.5,-0.356025,"""......(((((..((....)).(((((.((…",-8.532667,0.167019,0.0,3.71831,0.856337,2.440252,0.0,1.11896,1.0,0.231612,…,0.883257,1.0325949,0.090755,1.2311943,0.369411,0.3254848,1.3621538,1.4288645,1.93615,1.2426196,0.8673431,0.6951815,0.7150792,0.6101947,0.837011,1.2391798,1.2066175,0.9703537,0.9496115,1.0197175,1.2176926,0.9532372,1.2390183,0.5368302,1.0375001,1.1294162,1.059294,0.491357,1.2789897,3.8240151,1.0833449,192.913,1.471603,195.884,1.451723,"""Antibody""",true
2,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Lig…","""DIQMTQSPSSLSASVGDRVTITCRASQDVN…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",,"""GACATCCAGATGACCCAGAGCCCCAGCTCC…",,0.0358,0.034948,13,0.020249,0.244548,0.353583,0.28972,0.11215,0.643302,0.643302,0.476636,0.96729,0.05296,0.0,1.0,0.365163,-225.199997,-0.350779,"""..(((.....((.(.(((.(((((((((((…",-8.297351,0.166355,0.0,3.668224,0.893168,2.538889,0.0,1.112429,1.0,0.238318,…,0.8605016,1.0058205,0.0657,1.2133551,0.343,0.291535,1.3299663,1.3961701,1.9028994,1.2153448,0.854497,0.6739579,0.694494,0.6013955,0.8138114,1.2108796,1.185256,0.95712,0.931708,1.0089494,1.2047646,0.9360033,1.2129343,0.5173305,1.0251462,1.1166493,1.0464458,0.471589,1.2605698,3.7981772,1.0613674,196.898,1.445061,191.748,1.479547,"""Antibody""",true
3,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Hea…","""EVQLVESGGGLVQPGGSLRLSCAASGFNIK…","""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",,"""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",,0.037603,0.034451,34,0.025185,0.232593,0.341481,0.308889,0.117037,0.65037,0.65037,0.462222,0.971111,0.055556,0.0,1.0,0.369346,-525.900024,-0.389556,""".(((((....(((((..((((.((((((((…",-8.746341,0.153111,0.0,3.593333,0.83995,2.454101,0.0,1.098031,1.0,0.231852,…,0.3680349,0.506829,-0.330879,0.927464,-0.177266,0.044738,0.6422909,0.684584,1.5439274,0.6982418,0.452758,0.186831,0.250605,0.43216,0.292467,0.6378528,0.7829395,0.681117,0.676628,0.7573891,0.7817484,0.6192398,0.586395,0.323925,0.730516,0.802656,0.6114734,0.097161,1.2853097,3.2314517,0.6384625,400.314,0.752379,400.012,0.752914,"""Antibody""",true
4,"""antibody_monoclonal_aa_seqs.fa…","""Bevacizumab_light_chain""","""DIQMTQSPSSLSASVGDRVTITCSASQDIS…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",,"""GACATCCAGATGACCCAGAGCCCCAGCTCC…",,0.036155,0.035877,12,0.018692,0.246106,0.345794,0.292835,0.115265,0.638629,0.638629,0.46729,0.976636,0.051402,0.0,1.0,0.365897,-228.899994,-0.356542,"""...............(((.((.......))…",-8.643046,0.166822,0.0,3.649533,0.839185,2.424216,0.0,1.105428,1.0,0.238318,…,0.846244,0.988835,0.06319,1.2002461,0.336841,0.2714213,1.3052473,1.3706836,1.8831927,1.1961652,0.845723,0.6628633,0.6859156,0.6015581,0.7999197,1.1907533,1.1693108,0.9473254,0.9222889,0.9979569,1.1916349,0.9179659,1.1955465,0.5148094,1.0156183,1.1068051,1.0353965,0.4716111,1.2532126,3.7693012,1.0488142,193.698,1.466297,190.187,1.490328,"""Antibody""",true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
50708,"""iedb_antigen_aa_seqs.fa""","""sp|Q8IX19|MCEM1_HUMAN Mast cel…","""MEVEEIYKHQEVKMQAPAFRDKKQGVSAKN…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",,"""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",,0.032597,0.030882,11,0.019608,0.263815,0.304813,0.31016,0.121212,0.614973,0.614973,0.363636,0.951872,0.039216,0.0,1.0,0.398118,-204.600006,-0.364706,"""......(((((.....)))))..(((((..…",-8.216794,0.134091,0.0,3.406417,0.801909,2.473423,0.0,1.026497,1.0,0.26025,…,1.0179858,1.178218,0.339786,1.2891085,0.6082841,0.5126666,1.5057809,1.5723813,2.089919,1.37114,0.9522238,0.8407547,0.8666975,0.681446,0.978691,1.3891232,1.2881597,1.022711,1.0376573,1.0446404,1.2772572,0.9886265,1.3990523,0.6954912,1.0844231,1.1800522,1.139241,0.695935,1.3678448,3.8581703,1.2115645,174.272,1.609925,173.675,1.614786,"""Antigen""",true
50709,"""iedb_antigen_aa_seqs.fa""","""sp|Q9D287|SPF27_MOUSE Pre-mRNA…","""MAGTGLVAGEVVVDALPYFDQGYEAPGVRE…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…",,"""ATGGCCGGCACCGGCTTAGTGGCCGGCGAG…",,0.031202,0.027872,10,0.014815,0.278519,0.29037,0.318519,0.112593,0.608889,0.608889,0.333333,0.937778,0.062222,0.0,1.0,0.404974,-227.300003,-0.336741,"""...(((((....(((((.((((((((((((…",-7.78805,0.148333,0.0,3.302222,0.855821,2.833575,0.0,1.007132,1.0,0.282963,…,1.1567878,1.3444703,0.6089143,1.3555071,0.8543747,0.794685,1.6630685,1.7252915,2.3039649,1.5143151,1.0120987,0.9863243,1.0146372,0.7418891,1.1218561,1.5483927,1.3867316,1.0720313,1.1368136,1.0653042,1.3256581,1.0428888,1.5497099,0.88264,1.1213304,1.2178777,1.205434,0.8967751,1.5335677,3.9044342,1.3487965,205.05,1.393641,208.32,1.374029,"""Antigen""",true
50710,"""iedb_antigen_aa_seqs.fa""","""tr|Q4CN05|Q4CN05_TRYCC Trans-s…","""MSRHLFYSAVLLLLVVMCCGTAAVNAEELS…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",,"""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",,0.038305,0.034225,41,0.025738,0.238544,0.289391,0.355932,0.116133,0.645323,0.645323,0.403013,0.964218,0.085374,0.0,1.0,0.3912,-631.400024,-0.396359,"""..((((((....))))))....(((((.((…",-9.150129,0.17161,0.0,3.497175,0.80565,2.478932,0.0,1.07134,1.0,0.244193,…,0.9792615,1.2147768,0.461543,1.2979891,0.6119395,0.9702053,1.4319239,1.4803714,2.352606,1.3668377,0.7981844,0.775153,0.82452,0.638852,0.8963872,1.3814957,1.2886574,0.9784096,1.090427,0.9902588,1.1463172,1.0019,1.3006979,0.8740586,1.001282,1.087748,1.0052165,0.719569,1.759931,3.7801983,1.2126521,462.356,0.656411,464.0,0.6542,"""Antigen""",true
50711,"""iedb_antigen_aa_seqs.fa""","""tr|Q5NG75|Q5NG75_FRATT Amino-a…","""MDNNQDKLKRDILSRHIVMISLGGTISASF…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…",,"""ATGGACAACAACCAGGACAAGCTGAAGAGA…",,0.050564,0.047246,28,0.020513,0.208059,0.317216,0.271795,0.20293,0.589011,0.589011,0.362637,0.978022,0.058608,0.0,1.0,0.388957,-524.700012,-0.384396,"""..((((......(((((..(((((((.(((…",-8.451807,0.187088,0.0,3.501099,0.743237,2.222298,0.0,1.062725,1.0,0.208791,…,1.0877794,1.322872,0.5397,1.3703873,0.725534,1.0061898,1.5925729,1.6467812,2.4417472,1.4851198,0.8913649,0.88015,0.921937,0.683,1.0209997,1.5052301,1.381404,1.042858,1.1503761,1.0470207,1.2303951,1.0725034,1.4374596,0.914721,1.0720842,1.1625565,1.0888495,0.79708,1.7526413,3.929058,1.3079548,424.147,0.712371,430.387,0.702589,"""Antigen""",true


In [None]:
def _labeller(label: str) -> str:
    if label.startswith("input"):
        return "Wildtype"
    return "Optimized"

def _plot_faceted(df: pl.DataFrame, metric: str, label: str):
    """Draw per-source violins of one metric, one facet per sequence type.

    Args:
        df: Frame holding ``index``, ``source_name``, ``input_<metric>`` and
            ``optimized_<metric>`` columns.
        metric: Suffix shared by the two value columns.
        label: Text for the y-axis.

    Returns:
        The assembled plotnine plot (neither rendered nor saved here).
    """
    id_columns = ["index", "source_name"]

    # Reshape to long form: one row per (sequence, input/optimized) pair, with
    # the originating column name kept in "type" so it can drive the facets.
    long_df = df.select(
        *id_columns, f"input_{metric}", f"optimized_{metric}"
    ).unpivot(
        index=id_columns,
        variable_name="type",
        value_name=metric,
    )

    # Hard-coded cut-offs: rows beyond them are dropped before plotting
    # (presumably to keep extreme values from stretching the y-axis).
    values = pl.col(metric)
    if metric == "mfe":
        long_df = long_df.filter(values >= -3000)
    if metric == "est_half_life":
        long_df = long_df.filter(values <= 3)

    violins = p9.geom_violin(draw_quantiles=[0.25, 0.5, 0.75], fill="steelblue")
    theme_tweaks = p9.theme(
        axis_text_x=p9.element_text(rotation=45, hjust=1),
        axis_title_x=p9.element_blank(),
        figure_size=(8, 3),
    )
    axis_labels = p9.labs(x=None, y=label)
    facets = p9.facet_wrap("type", labeller=_labeller)

    canvas = p9.ggplot(long_df, p9.aes(x="source_name", y=metric))
    return canvas + violins + p9.theme_minimal() + theme_tweaks + axis_labels + facets

def _plot_delta(df: pl.DataFrame, metric: str, label: str):
    """Draw per-source violins of the relative change of one metric.

    The change is ``(optimized - input) / |input| * 100``. Dividing by the
    absolute baseline keeps the sign meaningful for metrics whose baseline is
    negative (e.g. MFE): a positive percentage always means the optimized value
    is numerically larger than the input value. For positive baselines this is
    the same quantity as ``(optimized / input - 1) * 100``.

    Rows whose baseline is exactly zero have no defined relative change; they
    are set to null (instead of +/-inf or NaN) and are therefore left out of
    the density estimate.

    Args:
        df: Frame holding ``source_name``, ``input_<metric>`` and
            ``optimized_<metric>`` columns.
        metric: Suffix shared by the two value columns.
        label: Text for the y-axis.

    Returns:
        The assembled plotnine plot (neither rendered nor saved here).
    """
    baseline = pl.col(f"input_{metric}")
    optimized = pl.col(f"optimized_{metric}")
    delta_column = f"diff_{metric}"

    relative_change = (
        pl.when(baseline != 0)
        .then((optimized - baseline) / baseline.abs() * 100)
        .otherwise(None)
    )
    # Keep only the two columns the plot needs so the full wide frame is not
    # handed to plotnine.
    df_ = df.select("source_name", relative_change.alias(delta_column))

    plot = (
        p9.ggplot(
            df_,
            p9.aes(x="source_name", y=delta_column)
        )
        + p9.geom_violin(draw_quantiles=[0.25, 0.5, 0.75], fill="steelblue")
        + p9.theme_minimal()
        + p9.theme(
            axis_text_x=p9.element_text(rotation=45, hjust=1),
            axis_title_x=p9.element_blank(),
            figure_size=(5, 4),
        )
        + p9.labs(
            x=None,
            y=label,
        )
    )
    return plot

def _plot(df: pl.DataFrame, metric: str, label: str):
    """Draw per-source violins of a single column.

    Args:
        df: Frame holding ``source_name`` and the ``metric`` column.
        metric: Full name of the column plotted on the y-axis.
        label: Text for the y-axis.

    Returns:
        The assembled plotnine plot (neither rendered nor saved here).
    """
    violins = p9.geom_violin(draw_quantiles=[0.25, 0.5, 0.75], fill="steelblue")
    theme_tweaks = p9.theme(
        axis_text_x=p9.element_text(rotation=45, hjust=1),
        axis_title_x=p9.element_blank(),
        figure_size=(4, 3),
    )
    axis_labels = p9.labs(x=None, y=label)

    canvas = p9.ggplot(df, p9.aes(x="source_name", y=metric))
    return canvas + violins + p9.theme_minimal() + theme_tweaks + axis_labels

# (column suffix, human-readable axis label) for every metric that exists as
# both an input_<suffix> and an optimized_<suffix> column.
_METRICS = [
    ("cai", "CAI"),
    ("gc", "GC content"),
    ("uridine_depletion", "Uridine depletion"),
    ("mfe", "MFE"),
    ("csc", "Codon Stability Coefficient"),
    ("est_half_life", "Predicted half-life"),
    ("mean_predicted_TE", "Predicted translation"),
    ("rcr", "Rare codons"),
]

# Saving fails when the target directory is missing, so create it up front;
# this keeps the cell runnable on a fresh checkout.
_PLOTS_DIR = Path("./plots")
_PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Two figures per metric: the wildtype/optimized distributions side by side,
# and the distribution of the per-sequence relative change.
for metric, label in _METRICS:
    _plot_faceted(df, metric, label).save(_PLOTS_DIR / f"{metric}.svg")
    _plot_delta(df, metric, f"{label} - Relative change (%)").save(
        _PLOTS_DIR / f"{metric}-change.svg"
    )

_plot(df, "hamming_ratio", "Hamming ratio").save(_PLOTS_DIR / "hamming-ratio.svg")

