In [None]:
import os
import sys

import pandas as pd
import plotly.express as px
import statsmodels.stats.descriptivestats

sys.path.append('../')
import plotting

# Read reference data

In [None]:
# Display labels for the raw insert_motif_position codes.
# Series.map leaves NaN for any value that is not a key of this mapping.
position_labels = {
    "front": "5' end",
    "5ntfront": "5' + 5 nt",
    "middle": "Middle",
    "5ntback": "3' - 5 nt",
    "back": "3' end",
}

# Load the reference table, drop the 'seq' column and relabel the positions.
ref_df = (
    pd.read_csv('./sequence_data_anonymized_no_duplicates.csv')
    .drop(columns=['seq'])
    .assign(insert_motif_position=lambda df: df['insert_motif_position'].map(position_labels))
)

ref_df

# Read model data for each experiment

In [None]:
# One merged table per validation experiment: the reference table joined with
# the contents of that experiment's params.csv.
full_dfs = {}

for exp in ('GCall_fix', 'Erlich_et_al', 'Erlich_et_al_internalrepeat'):
    params_path = f'../data/internal_datasets/validation_{exp}/params.csv'
    # Rename the params' 'seq_id' column so it matches the merge key below.
    param_df = pd.read_csv(params_path).rename(columns={'seq_id': 'seq_id_anonymized'})
    full_dfs[exp] = ref_df.merge(param_df, on='seq_id_anonymized')

full_dfs["GCall_fix"]

# Select inserted sequences and calculate change in efficiency

In [None]:
# For every row flagged with has_insertedmotif, look up the 'eff' value of the
# row referenced by source_seqid and store the difference as 'eff_diff'.
motif_dfs = {}

for exp in ['GCall_fix', 'Erlich_et_al', 'Erlich_et_al_internalrepeat']:
    full_df = full_dfs[exp]

    # 'eff' of every row keyed by seq_id; used to resolve source_seqid below.
    eff_by_seq_id = full_df.set_index('seq_id')['eff']

    # Copy *after* filtering: the column assignments below must act on an
    # independent frame. Copying before the filter (as was done previously)
    # leaves a filtered slice behind and triggers SettingWithCopyWarning.
    motif_df = full_df[full_df.has_insertedmotif].copy()

    # source_seqid values without a match in seq_id yield NaN here.
    motif_df['eff_sourceseq'] = motif_df['source_seqid'].map(eff_by_seq_id)
    motif_df['eff_diff'] = motif_df['eff'] - motif_df['eff_sourceseq']
    motif_dfs[exp] = motif_df

motif_dfs["GCall_fix"]

# Aggregate data by motif and position

In [None]:
def agg(s):
    """Summarise the ``eff_diff`` values of one group.

    Parameters
    ----------
    s : object exposing ``eff_diff.values`` (here: the sub-frame that
        ``groupby(...).apply`` passes for one group).

    Returns
    -------
    The first column (label ``0``) of statsmodels' ``describe`` output for
    the stats "nobs", "mean", "median" and "ci", extended with:

    * ``delta_ci`` - distance from the mean to the upper confidence bound
    * ``mean%``    - mean multiplied by 100
    * ``median%``  - median multiplied by 100

    Notes
    -----
    The confidence interval uses the t-distribution with alpha = 0.05 / 50.
    NOTE(review): the divisor 50 is presumably a Bonferroni correction for
    the 10 motifs x 5 positions compared in this notebook - confirm.
    """
    summary = statsmodels.stats.descriptivestats.describe(
        s.eff_diff.values,
        stats=["nobs", "mean", "median", "ci"],
        alpha=0.05/50,
        use_t=True,
    )[0]

    mean_value = summary['mean']
    median_value = summary['median']

    summary['delta_ci'] = summary['upper_ci'] - mean_value
    summary['mean%'] = 100*mean_value
    summary['median%'] = 100*median_value
    return summary

# Per-experiment summary table: agg() applied to every
# (insert_motif, insert_motif_position) group, with the group keys turned
# back into regular columns.
plot_dfs = {}

for exp in ('GCall_fix', 'Erlich_et_al', 'Erlich_et_al_internalrepeat'):
    grouped = motif_dfs[exp].groupby(['insert_motif', 'insert_motif_position'])
    plot_dfs[exp] = grouped.apply(agg, include_groups=False).reset_index()

plot_dfs['GCall_fix']

# Heatmap

In [None]:
# Column (position) and row (motif) order used for every heatmap.
heatmap_column_order = ["5' end", "5' + 5 nt", "Middle", "3' - 5 nt", "3' end"]
heatmap_row_order = ["CGTGT", "GTCGTG", "TCGTGT", "CGTG", "CGTGTG", "TATT", "TATA", "AACA", "TATAT", "GCAC"]

# Wide motif x position table of 'mean%' for each experiment.
heatmap_dfs = {}

for exp in ('GCall_fix', 'Erlich_et_al', 'Erlich_et_al_internalrepeat'):
    wide_df = plot_dfs[exp].pivot(
        index="insert_motif",
        columns="insert_motif_position",
        values="mean%",
    )
    # Selecting the columns fixes their order; reindex fixes the row order
    # (motifs missing from the pivot would appear as all-NaN rows).
    heatmap_dfs[exp] = wide_df[heatmap_column_order].reindex(index=heatmap_row_order)

heatmap_dfs['GCall_fix']

In [None]:
# Draw one heatmap of the 'mean%' table per experiment and export both the
# figure (SVG) and the underlying data (CSV).
heatmap_dir = './figure_6_motif_effect'
# write_image / to_csv fail if the target directory is missing, so create it.
os.makedirs(heatmap_dir, exist_ok=True)

for exp in ['GCall_fix', 'Erlich_et_al', 'Erlich_et_al_internalrepeat']:
    heatmap_df = heatmap_dfs[exp]

    # Diverging red-white-blue scale, fixed to [-5, 5] so that the colours
    # are comparable between the experiments.
    fig = px.imshow(
        heatmap_df,
        x=heatmap_df.columns,
        y=heatmap_df.index,
        text_auto=".1f",
        color_continuous_scale=[(0, "#de2d26"), (0.5, "white"), (1, "#3182bd")],
        color_continuous_midpoint=0,
        range_color=[-5, 5],
    )
    fig.update_layout(
        width=300,
        height=300,
        showlegend=False,
        margin=dict(l=0, r=10, b=0, t=10),
    )

    # NOTE(review): the colour bar title is the ASCII string "Deltae" while the
    # box plot labels the same quantity "Δε" - confirm which one is intended.
    fig.update_layout(coloraxis_colorbar=dict(
        title="Deltae",
        title_font_family="Inter",
        title_font_size=28/3,
        tickfont_size=28/3,
        thicknessmode="pixels",
        thickness=10,
        lenmode="pixels",
        len=150,
        dtick=2,
        yanchor="top", y=0.8,
    ))

    fig.update_xaxes(title_text=None, tickangle=45)
    fig.update_yaxes(title_text=None)
    fig.update(layout_coloraxis_showscale=True)
    fig = plotting.standardize_plot(fig)
    fig.write_image(f'{heatmap_dir}/heatmap_{exp}.svg')
    fig.show()

    # save data
    heatmap_df.to_csv(f'{heatmap_dir}/heatmap_{exp}.csv')

# Boxplot

In [None]:
# Facet order of the motifs (filled left-to-right, two per row) and order of
# the positions on every x axis.
motif_order = [
    "CGTGT",
    "TATT",
    "GTCGTG",
    "TATA",
    "TCGTGT",
    "AACA",
    "CGTG",
    "TATAT",
    "CGTGTG",
    "GCAC",
]
position_order = [
    "5' end",
    "5' + 5 nt",
    "Middle",
    "3' - 5 nt",
    "3' end",
]

# Facet grid geometry, shared by px.box and by the annotation placement below
# so the two cannot drift apart when motifs are added or removed.
n_facet_cols = 2
n_facet_rows = -(-len(motif_order) // n_facet_cols)  # ceiling division

boxplot_dir = './SI_figure_motif_effect'
# write_image / to_csv fail if the target directory is missing, so create it.
os.makedirs(boxplot_dir, exist_ok=True)

for exp in ['GCall_fix', 'Erlich_et_al', 'Erlich_et_al_internalrepeat']:
    motif_df = motif_dfs[exp].copy()
    fig = px.box(
        motif_df,
        x='insert_motif_position',
        y='eff_diff',
        facet_col='insert_motif',
        facet_col_wrap=n_facet_cols,
        facet_row_spacing=0.04,
        boxmode='overlay',
        points='all',
        category_orders={'insert_motif_position': position_order, 'insert_motif': motif_order},
        color_discrete_sequence=["#3182bd"],
    )
    # Dotted black line at zero and a solid red line at -0.025 in every facet.
    fig.add_hline(y=0, line_dash="dot", line_color="black", line_width=1, opacity=1)
    fig.add_hline(y=-0.025, line_color="#de2d26", line_width=1, opacity=1)
    fig.update_traces(marker=dict(size=4, opacity=0.5))

    fig.update_xaxes(title_text="Motif position", row=1)
    fig.update_yaxes(title_text="Δε", col=1)
    # Reduce the facet titles ("insert_motif=XYZ") to the motif itself. This
    # must run before the "n=..." annotations are added below, otherwise those
    # would be split at their "=" as well.
    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))

    # Leave room below the boxes for the statistics annotations at y = -0.17.
    fig.update_yaxes(
        range=[min(-0.22, motif_df.eff_diff.min()-0.005), motif_df.eff_diff.max()+0.005],
        dtick=0.1,
        minor_dtick=0.025,
    )

    for i, motif in enumerate(motif_order):
        # The first motif goes to the highest-numbered row and the row number
        # decreases every n_facet_cols motifs (equals 5 - i // 2 for 10 motifs).
        row = n_facet_rows - i // n_facet_cols
        col = i % n_facet_cols + 1
        for pos in position_order:
            idf = motif_df[(motif_df.insert_motif == motif) & (motif_df.insert_motif_position == pos)]
            # Same statistics and alpha as used for the aggregated tables.
            d = statsmodels.stats.descriptivestats.describe(idf.eff_diff.values, stats=["nobs", "mean", "ci"], alpha=0.05/50, use_t=True)[0]
            d['delta_ci'] = d['upper_ci'] - d['mean']
            fig.add_annotation(
                x=pos,
                y=-0.17,
                text=f"n={d['nobs']:.0f}<br>{100*d['mean']:.1f}%<br>±{100*d['delta_ci']:.1f}%",
                showarrow=False,
                row=row,
                col=col,
            )

    fig.update_layout(
        showlegend=False,
        margin=dict(l=0, r=10, b=0, t=20),
        width=680,
        height=800,
    )
    fig = plotting.standardize_plot(fig)
    fig.write_image(f"{boxplot_dir}/comp_{exp}.svg")
    fig.show()

    # save data
    motif_df.to_csv(f"{boxplot_dir}/comp_{exp}.csv", index=False)