In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import Bio.Seq

import sys
sys.path.append('../')
import plotting

In [None]:
# Adapter sequences, wrapped in Bio.Seq.Seq so that the constructs built from
# them in the next cell can be reverse-complemented.
adapter_0F = Bio.Seq.Seq("ACACGACGCTCTTCCGATCT")
adapter_0R = Bio.Seq.Seq("AGATCGGAAGAGCACACGTCT")

# Motifs that are combined with each adapter below; merge_with() iterates over
# this same list to look up the per-motif efficiency values.
motifs = [
    "CGTGT",
    "GTCGTG",
    "TCGTGT",
    "CGTG",
    "CGTGTG"
]

### Generate the subsequences of motifs with the correct orientation of adapters

In [None]:
# Build every adapter/motif construct that is folded with mfold.
# Names follow "<adapter>_<motif>_<position>": position "end" places the motif
# directly next to the adapter, "5nt" puts a spacer of five N between the two.
# The motif is written in lowercase so it stays visible inside the construct.
spacer = 5 * "N"
constructs = {}

# 0F constructs (-> requires rev-comp orientation for inhibition):
# adapter first, motif last, then the whole construct is reverse-complemented.
for motif in motifs:
    insert = motif.lower()
    constructs[f"0F_{motif}_end"] = (adapter_0F + insert).reverse_complement()
    constructs[f"0F_{motif}_5nt"] = (adapter_0F + spacer + insert).reverse_complement()

# 0R constructs (-> requires normal orientation for inhibition):
# motif first, adapter last, no reverse complement.
for motif in motifs:
    insert = motif.lower()
    constructs[f"0R_{motif}_end"] = insert + adapter_0R
    constructs[f"0R_{motif}_5nt"] = insert + spacer + adapter_0R

# One row per construct; Seq objects are converted to plain strings so they can
# be joined against the string keys of the mfold table.
combinations = pd.DataFrame(
    [(name, str(sequence)) for name, sequence in constructs.items()],
    columns=["name", "sequence"],
)
combinations

### For each subsequence, submit it to http://www.unafold.org/mfold/applications/dna-folding-form.php using the default settings, except for a temperature of 54°C, 50 mM Na+, and 300% suboptimality (the latter only if the correct folding is not in the default set)

In [None]:
# Folding free energy (dG) for each construct sequence, keyed by the exact
# sequence string so it can be joined with `combinations` on "sequence".
# merge_with() multiplies these values by 4184 to obtain joules, i.e. they are
# treated as kcal/mol.
dG_by_sequence = {
    # 0F constructs (reverse-complemented adapter), alternating end / 5 nt spacer
    "acacgAGATCGGAAGAGCGTCGTGT": 0.42,
    "acacgNNNNNAGATCGGAAGAGCGTCGTGT": 1.86,
    "cacgacAGATCGGAAGAGCGTCGTGT": 0.11,
    "cacgacNNNNNAGATCGGAAGAGCGTCGTGT": 1.26,
    "acacgaAGATCGGAAGAGCGTCGTGT": 0.37,
    "acacgaNNNNNAGATCGGAAGAGCGTCGTGT": 1.19,
    "cacgAGATCGGAAGAGCGTCGTGT": 1.19,
    "cacgNNNNNAGATCGGAAGAGCGTCGTGT": 2.63,
    "cacacgAGATCGGAAGAGCGTCGTGT": 0.12,
    "cacacgNNNNNAGATCGGAAGAGCGTCGTGT": 1.56,
    # 0R constructs (adapter in normal orientation), alternating end / 5 nt spacer
    "cgtgtAGATCGGAAGAGCACACGTCT": 1.12,
    "cgtgtNNNNNAGATCGGAAGAGCACACGTCT": 1.82,
    "gtcgtgAGATCGGAAGAGCACACGTCT": 1.16,
    "gtcgtgNNNNNAGATCGGAAGAGCACACGTCT": 2.31,
    "tcgtgtAGATCGGAAGAGCACACGTCT": 0.91,
    "tcgtgtNNNNNAGATCGGAAGAGCACACGTCT": 1.61,
    "cgtgAGATCGGAAGAGCACACGTCT": 1.37,
    "cgtgNNNNNAGATCGGAAGAGCACACGTCT": 2.52,
    "cgtgtgAGATCGGAAGAGCACACGTCT": -0.06,
    "cgtgtgNNNNNAGATCGGAAGAGCACACGTCT": 1.02,
}

# Two-column table (sequence, dG) with a default integer index.
mfold_results = pd.DataFrame(
    list(dG_by_sequence.items()),
    columns=["sequence", "dG"],
)
mfold_results

# Functions

In [None]:
def merge_with(combinations, mfold_results, efficiency_data,
               motif_list=None, temperature_celsius=54.0):
    """Join constructs, folding energies and efficiency changes into one table.

    Parameters
    ----------
    combinations : pandas.DataFrame
        One row per construct with the columns ``name``
        (``"<adapter>_<motif>_<end>"``) and ``sequence``.
    mfold_results : pandas.DataFrame
        Folding free energies with the columns ``sequence`` and ``dG``
        (converted below as kcal/mol).
    efficiency_data : pandas.DataFrame
        One row per motif with the columns ``insert_motif``, ``5' end``,
        ``5' + 5 nt``, ``3' end`` and ``3' - 5 nt``. If a motif occurs in
        several rows, the first one is used.
    motif_list : iterable of str, optional
        Motifs to look up in ``efficiency_data``. Defaults to the
        notebook-level ``motifs`` list.
    temperature_celsius : float, optional
        Temperature used to convert dG into an equilibrium constant
        (default 54, matching the original hard-coded value).

    Returns
    -------
    pandas.DataFrame
        The rows of ``combinations`` that have both a folding energy and an
        efficiency value, with the added columns ``dG``, ``Delta_eff``,
        ``adapter``, ``motif``, ``end``, ``dG_J``, ``K`` and ``alpha``.

    Raises
    ------
    ValueError
        If a motif has no row in ``efficiency_data``.

    Warns
    -----
    UserWarning
        If constructs are lost in the inner joins (e.g. a mistyped sequence
        in the manually entered folding table).
    """
    import warnings

    joule_per_kcal = 4184   # converts dG from kcal/mol to J/mol
    gas_constant = 8.314    # J/(mol*K)
    kelvin_offset = 273.15

    if motif_list is None:
        motif_list = motifs  # notebook-level list defined in an earlier cell

    # construct-name template -> efficiency_data column holding its value
    column_for_name = {
        "0F_{}_end": "5' end",
        "0F_{}_5nt": "5' + 5 nt",
        "0R_{}_end": "3' end",
        "0R_{}_5nt": "3' - 5 nt",
    }

    # compile efficiency data
    records = []
    for motif in motif_list:
        rows = efficiency_data.loc[efficiency_data["insert_motif"] == motif]
        if rows.empty:
            raise ValueError(
                f"motif {motif!r} not found in efficiency_data['insert_motif']"
            )
        for template, column in column_for_name.items():
            records.append(
                {"name": template.format(motif), "Delta_eff": rows[column].iloc[0]}
            )
    efficiency_results = pd.DataFrame(records, columns=["name", "Delta_eff"])

    # merge all data (inner joins keep the row order of `combinations`)
    df = pd.merge(combinations, mfold_results, on="sequence")
    df = pd.merge(df, efficiency_results, on="name")

    # Inner joins drop unmatched constructs silently; surface that, because a
    # missing point changes the reported correlation.
    kept = set(df["name"])
    missing = [name for name in combinations["name"] if name not in kept]
    if missing:
        warnings.warn(
            f"merge_with dropped {len(missing)} construct(s) without matching "
            f"mfold/efficiency data: {missing}",
            stacklevel=2,
        )

    df[['adapter', 'motif', 'end']] = df['name'].str.split('_', expand=True)

    # dG -> equilibrium constant K = exp(-dG / RT) -> associated fraction.
    df['dG_J'] = joule_per_kcal * df['dG']
    df['K'] = np.exp(-df['dG_J'] / (gas_constant * (kelvin_offset + temperature_celsius)))
    # Equals K / (1 + K); this form stays finite (-> 1) when K overflows to inf.
    df['alpha'] = 1 - 1 / (1 + df['K'])
    return df


def plot(df):
    """Scatter the efficiency change against the degree of association.

    Plots ``Delta_eff`` over ``alpha`` with a gray OLS trendline fitted without
    an intercept term, annotates the R² of that fit and the Spearman rank
    correlation of the two columns, applies the fixed figure size and axis
    ranges, passes the figure through ``plotting.standardize_plot`` and
    shows it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``alpha``, ``Delta_eff``, ``adapter``, ``end`` and
        ``motif``.

    Returns
    -------
    The figure object returned by ``plotting.standardize_plot``.
    """
    scatter_kwargs = {
        "x": "alpha",
        "y": "Delta_eff",
        "hover_data": ["adapter", "end", "motif"],
        "trendline": "ols",
        # no intercept: the fitted line is forced through the origin
        "trendline_options": {"add_constant": False},
        "trendline_color_override": "gray",
    }
    fig = px.scatter(df, **scatter_kwargs)
    # line_dash is set on every trace, not only on the trendline
    fig.update_traces(line_dash="dash")

    # Statistics shown in the upper right corner of the plot area.
    # NOTE(review): for a fit without a constant statsmodels reports the
    # uncentered R² — confirm this is the statistic intended for the figure.
    ols_fit = px.get_trendline_results(fig).iloc[0]["px_fit_results"]
    rank_corr = df[["alpha", "Delta_eff"]].corr(method="spearman").iloc[0, 1]
    labels = (
        (0.95, f"R² = {ols_fit.rsquared:.2f}"),
        (0.875, f"ρ = {rank_corr:.2f}"),
    )
    for y_paper, label in labels:
        fig.add_annotation(
            x=0.85,
            y=y_paper,
            xref="paper",
            yref="paper",
            text=label,
            showarrow=False,
        )

    # Fixed size and axis ranges so that all data sets share the same layout.
    fig.update_layout(width=320, height=200, margin={"l": 0, "r": 10, "t": 10, "b": 0})
    fig.update_xaxes(range=[0, 0.55], title_text="Degree of association at 54°C (α)")
    fig.update_yaxes(range=[-5, 0], title_text="Change in efficiency (Δϵ in %)")

    fig = plotting.standardize_plot(fig)
    fig.show()
    return fig

# Plot data of GCall/fix

In [None]:
# Read the per-motif efficiency table stored as heatmap_GCall_fix.csv, join it
# with the constructs and their folding energies, and plot the correlation.
efficiency_data = pd.read_csv("../43_external_validation_motifs/figure_6_motif_effect/heatmap_GCall_fix.csv")
df = merge_with(combinations, mfold_results, efficiency_data)
fig = plot(df)
# Export the figure as SVG.
# NOTE(review): presumably ./SI_figure_correlation must already exist — confirm.
fig.write_image("./SI_figure_correlation/GCall_data.svg")

# Plot data of Erlich et al.

In [None]:
# Same pipeline as for the previous data set, using the efficiency table stored
# as heatmap_Erlich_et_al.csv. Note that efficiency_data, df and fig are
# rebound here and therefore refer to this data set from now on.
efficiency_data = pd.read_csv("../43_external_validation_motifs/figure_6_motif_effect/heatmap_Erlich_et_al.csv")
df = merge_with(combinations, mfold_results, efficiency_data)
fig = plot(df)
# Export the figure as SVG.
# NOTE(review): presumably ./SI_figure_correlation must already exist — confirm.
fig.write_image("./SI_figure_correlation/Erlich_data.svg")