In [None]:
import pandas as pd
import numpy as np
import Bio.SeqIO as SeqIO
import plotly.express as px
import statsmodels.stats.descriptivestats

import sys
sys.path.append('../')
import plotting

# Function to collect the design sequences and their parameters, then label each sequence by thresholding

In [None]:
def compose_df(filepath, ecdf_cutoff=0.02):
    """Load one dataset's sequences and parameters and flag its lowest-`eff` rows.

    Reads ``{filepath}/design_files.fasta`` and ``{filepath}/params.csv``, joins
    them on ``seq_id`` (inner join, so ids present in only one of the two files
    are dropped) and adds a boolean ``cat`` column.

    Parameters
    ----------
    filepath : str
        Directory containing ``design_files.fasta`` and ``params.csv``. The value
        is also stored verbatim in the ``group`` column.
    ecdf_cutoff : float, default 0.02
        Level of the empirical CDF of ``eff`` used to place the threshold: the
        threshold is the smallest unique ``eff`` value whose ECDF is strictly
        greater than ``ecdf_cutoff``; rows with ``eff`` strictly below that
        value get ``cat = True``.

    Returns
    -------
    pandas.DataFrame
        Columns ``seq_id``, ``sequence``, ``group``, the remaining columns of
        ``params.csv`` (which must include ``eff``) and ``cat``.

    Raises
    ------
    ValueError
        If ``ecdf_cutoff`` is outside ``[0, 1)``; the ECDF never exceeds 1, so
        no threshold would exist.
    """
    if not 0 <= ecdf_cutoff < 1:
        raise ValueError(f"ecdf_cutoff must be in [0, 1), got {ecdf_cutoff!r}")

    # read reference sequences
    ref = {s.id: str(s.seq) for s in SeqIO.parse(f"{filepath}/design_files.fasta", "fasta")}
    ref_df = pd.DataFrame.from_dict(
        ref,
        orient="index",
        columns=["sequence"]
    ).reset_index().rename(columns={"index": "seq_id"})
    ref_df['group'] = filepath

    # read parameters for the sequences
    tot_df = pd.merge(
        ref_df,
        pd.read_csv(
            f"{filepath}/params.csv",
            # ids are read as text so they compare equal to the FASTA record ids
            dtype={'seq_id': str}
        ),
        on="seq_id"
    )

    def ecdf(a):
        # unique sorted values and the cumulative fraction of observations <= each value
        x, counts = np.unique(a, return_counts=True)
        cusum = np.cumsum(counts)
        return x, cusum / cusum[-1]

    # assign label based on threshold
    tot_df['cat'] = False
    if tot_df.empty:
        # no rows to rank: the ECDF is undefined, so return the frame unlabelled
        return tot_df

    x, y = ecdf(tot_df['eff'])
    # argmax over a boolean array returns the index of the first True
    eff_threshold = x[np.argmax(y > ecdf_cutoff)]
    tot_df.loc[tot_df['eff'] < eff_threshold, 'cat'] = True

    return tot_df

# Collect and combine the data from GCall and GCfix

In [None]:
# build one labelled frame per dataset directory and stack them;
# each frame keeps its own row labels because concat is called without ignore_index
dataset_dirs = (
    "../data/internal_datasets/GCall",
    "../data/internal_datasets/GCfix",
)
data_df = pd.concat([compose_df(dataset_dir) for dataset_dir in dataset_dirs])
data_df

# Function to collect the kmers and count by category

In [None]:
def get_kmers_by_cat(df, kmer_length=5):
    """Count k-mer occurrences per category and compute their enrichment ratio.

    Every overlapping window of ``kmer_length`` characters in each sequence is
    counted, separately for rows with ``cat`` True and rows with ``cat`` False.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``sequence`` column (strings) and a boolean ``cat`` column.
    kmer_length : int, default 5
        Window size. Sequences shorter than this contribute no k-mers.

    Returns
    -------
    pandas.DataFrame
        One row per observed k-mer with columns ``kmer``, the two count columns
        labelled ``True`` and ``False`` (in order of first appearance in ``df``),
        ``factor`` (True count / False count) and ``norm_factor`` (``factor``
        divided by the fraction of rows in ``df`` whose ``cat`` is True).
        A k-mer that never occurs in the False category has ``factor`` = inf;
        one that never occurs in the True category has ``factor`` = 0.
    """
    results = {}

    # iterate the two needed columns directly; iterrows() would build a Series per row
    for seq, cat in zip(df["sequence"], df["cat"]):
        counts = results.setdefault(cat, {})

        # slide a window of kmer_length over the sequence and tally each kmer
        for start in range(len(seq) - kmer_length + 1):
            kmer = seq[start:start + kmer_length]
            counts[kmer] = counts.get(kmer, 0) + 1

    # one column per category, one row per kmer; kmers unseen in a category count as 0
    idf = pd.DataFrame.from_dict(results).fillna(0).reset_index().rename(columns={"index": "kmer"})

    # a category with no rows in df has no column yet; without this the ratio
    # below would fail with KeyError
    for cat in (True, False):
        if cat not in idf.columns:
            idf[cat] = 0.0

    # normalize to the category size
    n_total = df.shape[0]
    poor_fraction = df[df["cat"]].shape[0] / n_total if n_total else float("nan")
    idf['factor'] = idf[True] / idf[False]
    idf['norm_factor'] = idf['factor'] / poor_fraction
    return idf

# Get kmer data for 4-, 5-, 6-mers

In [None]:
kmer_sizes = [4, 5, 6]
kmer_dfs = [get_kmers_by_cat(data_df, kmer_length=size) for size in kmer_sizes]

# stack the per-size tables, derive the helper columns, and order rows by kmer text
kmer_df = (
    pd.concat(kmer_dfs)
    .assign(
        # log2 of the normalized True/False count ratio
        log2_norm_factor=lambda frame: np.log2(frame["norm_factor"]),
        # first nucleotide of the kmer and the kmer size
        base=lambda frame: frame["kmer"].str[0],
        length=lambda frame: frame["kmer"].str.len(),
    )
    .sort_values("kmer")
)

kmer_df

# Plotting

In [None]:
# marker / label color per leading base of the kmer
colors = {"A": "#31a354", "C": "#3182bd", "G": "#fd8d3c", "T": "#de2d26"}


def _annotate_kmers(figure, hits, yshift):
    """Label every row of `hits` on `figure` with its kmer text, offset vertically by `yshift`."""
    for _, hit in hits.iterrows():
        figure.add_annotation(
            x=hit["kmer"],
            y=hit["log2_norm_factor"],
            text=hit["kmer"],
            font_color=colors[hit["base"]],
            showarrow=False,
            yshift=yshift,
        )


fig = px.scatter(
    kmer_df,
    x="kmer",
    y="log2_norm_factor",
    color="base",
    color_discrete_map=colors,
    render_mode="svg",
)

fig.update_traces(marker=dict(size=4, opacity=1))

# categorical x axis with one slot per kmer, padded by 50 slots on either side
fig.update_xaxes(type='category', showticklabels=False, range=[-50, kmer_df.shape[0]+50])
fig.update_yaxes(range=[-4, 4])

# Count ratios of 0 or inf give log2 values of -inf/+inf (and NaN is possible
# when no row is flagged). A single such value makes mean/std non-finite, which
# would turn both thresholds into NaN, so statistics and annotations use only
# the finite values. When every value is finite this selects all rows.
finite_df = kmer_df[np.isfinite(kmer_df["log2_norm_factor"].to_numpy())]

# outlier band: mean +/- 5 standard deviations of the log2 enrichment
stats = statsmodels.stats.descriptivestats.describe(finite_df["log2_norm_factor"].values)[0]
threshold = [stats['mean']-5*stats['std'], stats['mean']+5*stats['std']]

fig.add_hline(y=0, line_color="black", line_width=1, opacity=1)
fig.add_hline(y=threshold[0], line_dash="dash", line_color="black", line_width=1, opacity=1)
fig.add_hline(y=threshold[1], line_dash="dash", line_color="black", line_width=1, opacity=1)

fig.update_layout(
    xaxis_title="",
    yaxis_title="log2 enrichment in poor sequences",
    showlegend=False,
    width=320,
    height=150,
    margin=dict(l=0, r=5, t=5, b=0),
)
# project-local helper from ../plotting.py; its effect is not visible in this notebook
fig = plotting.standardize_plot(fig)

# add sequences above upper threshold (label placed above the marker)
upper_hits = finite_df[finite_df["log2_norm_factor"] >= threshold[1]]
display(upper_hits)
_annotate_kmers(fig, upper_hits, yshift=10)

# add sequences below lower threshold (label placed below the marker)
lower_hits = finite_df[finite_df["log2_norm_factor"] <= threshold[0]]
display(lower_hits)
_annotate_kmers(fig, lower_hits, yshift=-10)

# uniform font for every annotation added above
fig.for_each_annotation(lambda a: a.update(
    font_size=24/3,
    font_family="Inter",
))

fig.show()
fig.write_image("./SI_figure_kmer_analysis/kmers.svg")

# save data
kmer_df.to_csv("./SI_figure_kmer_analysis/kmers.csv", index=False)