In [None]:
import os
import re
import sys

import Bio.SeqIO as SeqIO
import numpy as np
import pandas as pd
import plotly.express as px

sys.path.append('../')
import plotting

# Functions to load the datasets and to find k-mer positions

In [None]:
def compose_df(filepath, low_frac=0.02):
    """Load one dataset directory into a single labelled DataFrame.

    Reads the reference sequences from ``{filepath}/design_files.fasta``,
    inner-joins them on ``seq_id`` with the per-sequence parameters from
    ``{filepath}/params.csv`` and flags the rows with the lowest ``eff``
    values in a boolean ``cat`` column.

    Parameters
    ----------
    filepath : str
        Directory containing ``design_files.fasta`` and ``params.csv``.
        The value is also stored verbatim in the ``group`` column.
    low_frac : float, default 0.02
        Level of the empirical CDF of ``eff`` used for labelling. The cutoff
        is the smallest observed ``eff`` whose cumulative fraction exceeds
        ``low_frac``; rows with ``eff`` strictly below that cutoff get
        ``cat = True``. Must satisfy ``0 <= low_frac < 1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``seq_id``, ``sequence``, ``group``, the columns of
        ``params.csv`` and ``cat``. Sequences without a matching row in
        ``params.csv`` (and vice versa) are dropped by the inner join.

    Raises
    ------
    ValueError
        If ``low_frac`` is outside ``[0, 1)``.
    """
    if not 0 <= low_frac < 1:
        raise ValueError(f"low_frac must be in [0, 1), got {low_frac!r}")

    # read reference sequences (a dict, so a repeated id keeps its last record)
    ref = {
        rec.id: str(rec.seq)
        for rec in SeqIO.parse(f"{filepath}/design_files.fasta", "fasta")
    }
    ref_df = (
        pd.DataFrame.from_dict(ref, orient="index", columns=["sequence"])
        .reset_index()
        .rename(columns={"index": "seq_id"})
    )
    ref_df['group'] = filepath

    # read parameters for the sequences; ids are kept as strings so that they
    # compare equal to the FASTA record ids
    params_df = pd.read_csv(f"{filepath}/params.csv", dtype={'seq_id': str})
    tot_df = pd.merge(ref_df, params_df, on="seq_id")

    # nothing to rank: return the empty frame with the expected column
    # instead of failing on an empty cumulative sum
    if tot_df.empty:
        tot_df['cat'] = pd.Series(False, index=tot_df.index, dtype=bool)
        return tot_df

    # empirical CDF of eff over its sorted unique values
    values, counts = np.unique(tot_df['eff'], return_counts=True)
    ecdf = np.cumsum(counts) / counts.sum()

    # first value whose cumulative fraction exceeds the requested level
    cutoff = values[np.argmax(ecdf > low_frac)]
    tot_df['cat'] = tot_df['eff'] < cutoff

    return tot_df


def get_kmer(df, kmer):
    """Locate every occurrence of a k-mer in the sequences of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``seq_id``, ``sequence``, ``group`` and
        ``cat``.
    kmer : str
        Subsequence to search for. It is matched literally: regex
        metacharacters in ``kmer`` have no special meaning.

    Returns
    -------
    pandas.DataFrame
        One row per occurrence with the columns ``seq_id``, ``kmer``,
        ``pos`` (0-based start index within the sequence), ``group`` and
        ``cat``. Overlapping occurrences are all reported. Sequences that
        do not contain the k-mer contribute no rows.
    """
    columns = ['seq_id', 'kmer', 'pos', 'group', 'cat']

    # Zero-width lookahead so overlapping hits are found; the k-mer is escaped
    # so the regex search agrees with a plain substring test.
    pattern = re.compile(f'(?={re.escape(kmer)})')

    hits = [
        (seq_id, kmer, match.start(), group, cat)
        for seq_id, sequence, group, cat in zip(
            df['seq_id'], df['sequence'], df['group'], df['cat']
        )
        for match in pattern.finditer(sequence)
    ]
    return pd.DataFrame(hits, columns=columns)

# Load the GCall and GCfix datasets

In [None]:
# Load both datasets into one frame. ignore_index=True gives every row a
# unique label; without it the row labels of the two datasets repeat.
dataset_dirs = [
    "../data/internal_datasets/GCall",
    "../data/internal_datasets/GCfix",
]
df = pd.concat([compose_df(path) for path in dataset_dirs], ignore_index=True)
df

In [None]:
# Positional distribution of two 5-mers, one facet per k-mer, comparing the
# sequences flagged by `cat` (red) with all other sequences (grey).
kmers = ['CGTGT', 'AGACG']
out_path = "./SI_figure_kmer_analysis/individuals.svg"

idf = pd.concat([get_kmer(df, kmer) for kmer in kmers])

fig = px.histogram(
    idf,
    x='pos',
    color='cat',
    facet_col='kmer',
    facet_col_spacing=0.05,
    histnorm='percent',
    barmode='overlay',
    color_discrete_map={True: '#de2d26', False: '#aaaaaa'},
    category_orders={'cat': [False, True]}
)

# bins of width 2 positions, bars drawn without outline
fig.update_traces(xbins_size=2)
fig.update_traces(marker=dict(line_width=0), selector=dict(type='histogram'))
fig.update_yaxes(dtick=5, minor_dtick=2.5)
fig.update_xaxes(title="Position")

fig.update_layout(
    width=680,
    height=200,
    margin=dict(l=0, r=10, t=20, b=0),
    yaxis_title="Frequency / %",
    showlegend=False,
    bargap=0
)

fig = plotting.standardize_plot(fig)
fig.show()

# make sure the output directory exists so the export also works on a
# fresh checkout
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.write_image(out_path)

In [None]:
# Positional distribution of two 6-mers, one facet per k-mer, comparing the
# sequences flagged by `cat` (red) with all other sequences (grey).
kmers = ['TCGTGT', 'AGACGT']
out_path = "./SI_figure_kmer_analysis/individuals2.svg"

idf = pd.concat([get_kmer(df, kmer) for kmer in kmers])

fig = px.histogram(
    idf,
    x='pos',
    color='cat',
    facet_col='kmer',
    facet_col_spacing=0.05,
    histnorm='percent',
    barmode='overlay',
    color_discrete_map={True: '#de2d26', False: '#aaaaaa'},
    category_orders={'cat': [False, True]}
)

# bins of width 2 positions, bars drawn without outline
fig.update_traces(xbins_size=2)
fig.update_traces(marker=dict(line_width=0), selector=dict(type='histogram'))
fig.update_yaxes(dtick=5, minor_dtick=2.5)
fig.update_xaxes(title="Position")

fig.update_layout(
    width=680,
    height=200,
    margin=dict(l=0, r=10, t=20, b=0),
    yaxis_title="Frequency / %",
    showlegend=False,
    bargap=0
)

fig = plotting.standardize_plot(fig)
fig.show()

# make sure the output directory exists so the export also works on a
# fresh checkout
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.write_image(out_path)