In [None]:
import os
import sys

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# extend the import path with the parent directory before importing `plotting`
sys.path.append('..')
import plotting

## Position distribution visualization

### For motif 1 of the GCall and GCfix datasets

In [None]:
# Figure and data files are written into this folder below; create it up front
# so a fresh checkout (Restart Kernel -> Run All) does not fail on the first write.
os.makedirs("./figure_4_motif_replacement", exist_ok=True)

for filename in ['GCall', 'GCfix']:
    # NOTE(review): unpickling can execute arbitrary code -- only load files
    # produced by a trusted pipeline (presumably this project's own; confirm).
    # The pickle unpacks into two containers that are indexed first by motif
    # and then by sequence.
    bot_seqs_positions, rest_seqs_positions = pd.read_pickle(
        "../data/machine_learning_results/motifs_positions/{}_motif_positions.pkl".format(
            filename
        )
    )

    # Flatten the per-sequence position collections of the first motif
    # (index 0) into one long-format frame with a category label per row.
    # Access stays index-based (0 .. len-1), exactly like the original loops,
    # so it behaves the same whether the container is a list or a dict keyed
    # by sequence index. Empty per-sequence entries simply contribute nothing.
    frames = []
    for label, per_sequence in (
        ("bot", bot_seqs_positions[0]),
        ("normal", rest_seqs_positions[0]),
    ):
        positions = [
            pos
            for i in range(len(per_sequence))
            for pos in per_sequence[i]
        ]
        frames.append(pd.DataFrame({"val": positions}).assign(cat=label))

    # ignore_index avoids duplicated row labels in the combined frame; the
    # exported CSV is unaffected because it is written without the index.
    df = pd.concat(frames, ignore_index=True)

    # Overlaid histograms, each category normalised on its own
    # (histnorm="probability": the bars of one trace sum to 1).
    fig = px.histogram(
        df,
        x="val",
        color="cat",
        histnorm="probability",
        barmode="overlay",
        color_discrete_map={"bot": "#de2d26", "normal": "#aaaaaa"},
    )
    # Reverse the trace order so that the first category ("bot") is drawn last,
    # i.e. on top of the overlay.
    fig.data = fig.data[::-1]
    # No bar outlines; bins are 5 positions wide and start at position 0.
    fig.update_traces(
        marker=dict(line_width=0),
        xbins=dict(start=0.0, size=5),
        selector=dict(type='histogram'),
    )
    fig.update_xaxes(title_text="Position in sequence", dtick=50, minor_dtick=25)
    fig.update_yaxes(title_text="Motif frequency", dtick=0.1, minor_dtick=0.05)
    fig.update_layout(
        width=150,
        height=150,
        margin=dict(l=0, r=10, t=10, b=0),
        showlegend=False,
    )

    # Project helper; its return value is the figure used from here on.
    fig = plotting.standardize_plot(fig)
    fig.show()
    # Static export needs an image-export engine such as kaleido to be installed.
    fig.write_image("./figure_4_motif_replacement/{}_motif_positions_histogram.svg".format(filename))

    # also save the data underlying the figure
    df.to_csv("./figure_4_motif_replacement/{}_motif_positions_histogram_data.csv".format(filename), index=False)

### For the top 3 motifs of all datasets

In [None]:
# Figure and data files are written into this folder below; create it up front
# so a fresh checkout (Restart Kernel -> Run All) does not fail on the first write.
os.makedirs("./SI_figure_motifs", exist_ok=True)

for filename in [
    "GCfix",
    "GCall",
    "Gao_et_al",
    "Erlich_et_al",
    "Koch_et_al",
    "Song_et_al",
    "Choi_et_al",
]:
    # NOTE(review): unpickling can execute arbitrary code -- only load files
    # produced by a trusted pipeline (presumably this project's own; confirm).
    # The pickle unpacks into two containers that are indexed first by motif
    # and then by sequence.
    bot_seqs_positions, rest_seqs_positions = pd.read_pickle(
        "../data/machine_learning_results/motifs_positions/{}_motif_positions.pkl".format(
            filename
        )
    )

    # Plot at most the first three motifs; fewer if the file holds fewer.
    n_motifs = min(3, len(bot_seqs_positions))
    for k in range(n_motifs):
        # k: index of motif sorted by p-value
        # i: index of sequence

        # Flatten the per-sequence position collections of motif k into one
        # long-format frame with a category label per row. Access stays
        # index-based (0 .. len-1), exactly like the original loops, so it
        # behaves the same whether the container is a list or a dict keyed by
        # sequence index. Empty per-sequence entries contribute nothing.
        frames = []
        for label, per_sequence in (
            ("bot", bot_seqs_positions[k]),
            ("normal", rest_seqs_positions[k]),
        ):
            positions = [
                pos
                for i in range(len(per_sequence))
                for pos in per_sequence[i]
            ]
            frames.append(pd.DataFrame({"val": positions}).assign(cat=label))

        # ignore_index avoids duplicated row labels in the combined frame; the
        # exported CSV is unaffected because it is written without the index.
        df = pd.concat(frames, ignore_index=True)

        # Overlaid histograms, each category normalised on its own
        # (histnorm="probability": the bars of one trace sum to 1).
        fig = px.histogram(
            df,
            x="val",
            color="cat",
            histnorm="probability",
            barmode="overlay",
            color_discrete_map={"bot": "#de2d26", "normal": "#aaaaaa"},
        )
        # Reverse the trace order so that the first category ("bot") is drawn
        # last, i.e. on top of the overlay.
        fig.data = fig.data[::-1]
        # No bar outlines; bins are 5 positions wide and start at position 0.
        fig.update_traces(
            marker=dict(line_width=0),
            xbins=dict(start=0.0, size=5),
            selector=dict(type='histogram'),
        )
        fig.update_xaxes(title_text="Position in sequence", dtick=50, minor_dtick=25)
        fig.update_yaxes(title_text="Motif frequency")
        fig.update_layout(
            width=210,
            height=100,
            margin=dict(l=0, r=10, t=10, b=35),
            showlegend=False,
        )

        # Project helper; its return value is the figure used from here on.
        fig = plotting.standardize_plot(fig)
        fig.show()

        # Static export needs an image-export engine such as kaleido to be installed.
        fig.write_image("./SI_figure_motifs/{}_{}_motif_positions_histogram.svg".format(filename, k))

        # also save the data underlying the figure
        df.to_csv("./SI_figure_motifs/{}_{}_motif_positions_histogram_data.csv".format(filename, k), index=False)