In [None]:
import sys
from pathlib import Path

import Bio.SeqIO as SeqIO
import numpy as np
import pandas as pd
import plotly.express as px
import statsmodels.stats.descriptivestats

sys.path.append('../')
import plotting

# Function to collect design sequences and the parameters, then perform thresholding

In [None]:
def compose_df(filepath, threshold=0.02):
    """Load one dataset's sequences and parameters and flag its low-efficiency tail.

    Parameters
    ----------
    filepath : str
        Dataset directory containing ``design_files.fasta`` and ``params.csv``.
        The value is also stored verbatim in the ``group`` column.
    threshold : float, default 0.02
        Level of the empirical CDF of ``eff`` that defines the cutoff. The cutoff
        is the smallest observed ``eff`` at which the ECDF exceeds this level;
        sequences with ``eff`` strictly below the cutoff get ``cat = True``.

    Returns
    -------
    pandas.DataFrame
        Inner merge (on ``seq_id``) of the FASTA sequences with ``params.csv``,
        plus the ``group`` column and the boolean ``cat`` column.

    Raises
    ------
    ValueError
        If no sequence id is shared between the FASTA file and ``params.csv``,
        because the ECDF is undefined for an empty table.
    """
    # read reference sequences (a dict, so a repeated FASTA id keeps only its last sequence)
    sequences = {
        record.id: str(record.seq)
        for record in SeqIO.parse(f"{filepath}/design_files.fasta", "fasta")
    }
    ref_df = pd.DataFrame({
        "seq_id": list(sequences.keys()),
        "sequence": list(sequences.values()),
    })
    ref_df['group'] = filepath

    # read parameters for the sequences; ids are forced to str so they compare
    # equal to the FASTA ids
    params_df = pd.read_csv(f"{filepath}/params.csv", dtype={'seq_id': str})
    tot_df = pd.merge(ref_df, params_df, on="seq_id")
    if tot_df.empty:
        raise ValueError(
            f"No sequences left after merging design_files.fasta with params.csv in {filepath!r}"
        )

    # empirical CDF of the efficiency: unique values and their cumulative share
    values, counts = np.unique(tot_df['eff'], return_counts=True)
    cumulative = np.cumsum(counts)
    ecdf = cumulative / cumulative[-1]

    # np.argmax returns the first index at which the ECDF exceeds the threshold
    cutoff = values[np.argmax(ecdf > threshold)]
    tot_df['cat'] = tot_df['eff'] < cutoff

    return tot_df

# Collect and combine the data from all literature datasets

In [None]:
# load every dataset with compose_df and stack the resulting tables
data_df = pd.concat([
    compose_df(dataset_dir)
    for dataset_dir in (
        "../data/internal_datasets/GCall",
        "../data/internal_datasets/GCfix",
        "../data/external_datasets/Koch_et_al",
        "../data/external_datasets/Erlich_et_al",
        "../data/external_datasets/Song_et_al",
        "../data/external_datasets/Choi_et_al",
        "../data/external_datasets/Gao_et_al",
    )
])

# prefix each sequence id with the dataset path stored in 'group'
data_df = data_df.assign(
    seq_id=data_df['group'] + '_' + data_df['seq_id'].astype(str)
)
data_df

In [None]:
# pivot the dataframe so that each letter of the sequence becomes its own row
# Build the wide table (one column per position) with a single DataFrame call
# instead of creating one pd.Series per sequence inside .apply(), which is very
# slow for large datasets. Shorter sequences are padded with missing values.
idata_df = pd.DataFrame(
    [list(sequence) for sequence in data_df['sequence']],
    index=pd.Index(data_df['seq_id'], name='seq_id'),
).reset_index()
idata_df = idata_df.merge(data_df[['seq_id', 'group', 'cat']], on='seq_id')
idata_df = idata_df.melt(id_vars=['seq_id', 'group', 'cat'], value_name='base', var_name='position')

# drop rows where base is missing (positions past the end of a shorter sequence)
idata_df = idata_df.dropna(subset=['base'])

idata_df

In [None]:
# group by group, cat and position, then get mean ratio of each base and the ratio of GC
def get_ratio(group):
    """Return the fraction of each base in ``group['base']`` plus the combined GC fraction.

    The fractions come from ``value_counts(normalize=True)``. Any of A, T, G, C
    that does not occur is appended with a fraction of 0.0, and a final ``'GC'``
    entry holds the sum of the G and C fractions.
    """
    fractions = group['base'].value_counts(normalize=True)

    # add absent canonical bases as explicit zeros; entries already present are untouched
    for nucleotide in ('A', 'T', 'G', 'C'):
        if nucleotide not in fractions.index:
            fractions[nucleotide] = 0.0

    fractions['GC'] = fractions['G'] + fractions['C']
    return fractions
# Select only the 'base' column before apply: get_ratio reads nothing else, and
# this keeps the grouping columns out of the frames passed to it (applying on
# the grouping columns is deprecated and emits a warning in pandas >= 2.2).
ratio_df = (
    idata_df
    .groupby(['group', 'cat', 'position'])[['base']]
    .apply(get_ratio)
    .reset_index()
)

ratio_df

In [None]:
# use a rolling mean (5 rows, centered) for the proportion of each base
ratio_df['proportion_smooth'] = (
    ratio_df
    .groupby(['group', 'cat', 'base'])['proportion']
    .transform(lambda x: x.rolling(5, center=True, min_periods=1).mean())
)
# keep only the last path component of the dataset path as the group label
ratio_df['group'] = ratio_df['group'].str.split('/').str[-1]

fig = px.line(
    ratio_df.sort_values('position'),
    x='position',
    y='proportion_smooth',
    color='cat',
    facet_row='group',
    facet_col_spacing=0.035,
    facet_row_spacing=0.05,
    facet_col='base',
    category_orders={
        'base': ['A', 'C', 'G', 'T', 'GC'],
        'group': ['GCall', 'GCfix', 'Koch_et_al', 'Erlich_et_al', 'Song_et_al', 'Choi_et_al', 'Gao_et_al'],
    },
)


fig.update_yaxes(dtick=0.25, minor_dtick=0.25/2)
fig.update_yaxes(title_text='Proportion', range=[0, 0.75], col=1)
# let every facet have its own x-range
fig.update_xaxes(matches=None)
fig.update_xaxes(title_text='Position', row=1)
fig.for_each_xaxis(lambda xaxis: xaxis.update(showticklabels=True, dtick=50, minor_dtick=25))
# keep only the text after the last '=' in each facet annotation
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

fig.update_layout(
    height=900,
    width=680,
    showlegend=False,
    margin=dict(l=0, r=20, t=20, b=0),
)

fig = plotting.standardize_plot(fig)
fig.show()

# create the output folder first so the export does not fail when it is missing
# (e.g. on a fresh checkout)
Path("./SI_figure_base_analysis").mkdir(parents=True, exist_ok=True)
fig.write_image("./SI_figure_base_analysis/literature_datasets.svg")

# Collect and combine the data from the validation experiments, but remove the sequences with deliberate motifs

In [None]:
def compose_df(filepath, motif_df, threshold=0.02):
    """Load one validation dataset, drop sequences with inserted motifs, and flag the low-efficiency tail.

    Parameters
    ----------
    filepath : str
        Dataset directory containing ``design_files.fasta`` and ``params.csv``.
        The value is also stored verbatim in the ``group`` column.
    motif_df : pandas.DataFrame
        Table with the columns ``seq_id_anonymized`` and ``has_insertedmotif``.
        Only sequences whose id is listed with ``has_insertedmotif == False``
        are kept.
    threshold : float, default 0.02
        Level of the empirical CDF of ``eff`` that defines the cutoff. The cutoff
        is the smallest observed ``eff`` at which the ECDF exceeds this level;
        sequences with ``eff`` strictly below the cutoff get ``cat = True``.

    Returns
    -------
    pandas.DataFrame
        The filtered inner merge (on ``seq_id``) of the FASTA sequences with
        ``params.csv``, plus the ``group`` column and the boolean ``cat`` column.

    Raises
    ------
    ValueError
        If no sequence is left after merging and filtering, because the ECDF is
        undefined for an empty table.
    """
    # read reference sequences (a dict, so a repeated FASTA id keeps only its last sequence)
    sequences = {
        record.id: str(record.seq)
        for record in SeqIO.parse(f"{filepath}/design_files.fasta", "fasta")
    }
    ref_df = pd.DataFrame({
        "seq_id": list(sequences.keys()),
        "sequence": list(sequences.values()),
    })
    ref_df['group'] = filepath

    # read parameters for the sequences; ids are forced to str so they compare
    # equal to the FASTA ids
    params_df = pd.read_csv(f"{filepath}/params.csv", dtype={'seq_id': str})
    tot_df = pd.merge(ref_df, params_df, on="seq_id")

    # remove the sequences with inserted motifs
    # .eq(False) keeps the original `== False` semantics (missing flags are not selected).
    # The ids are cast to str so they can match the str-typed 'seq_id' column even
    # if motif_df was read with numeric ids.
    selected_seqs = motif_df.loc[
        motif_df['has_insertedmotif'].eq(False), 'seq_id_anonymized'
    ].astype(str)
    # .copy() makes the filtered table independent, so adding the 'cat' column
    # below does not write into a slice of the merged frame (SettingWithCopyWarning)
    tot_df = tot_df.loc[tot_df['seq_id'].isin(selected_seqs)].copy()
    if tot_df.empty:
        raise ValueError(
            f"No sequences left after merging and motif filtering in {filepath!r}"
        )

    # empirical CDF of the efficiency: unique values and their cumulative share
    values, counts = np.unique(tot_df['eff'], return_counts=True)
    cumulative = np.cumsum(counts)
    ecdf = cumulative / cumulative[-1]

    # np.argmax returns the first index at which the ECDF exceeds the threshold
    cutoff = values[np.argmax(ecdf > threshold)]
    tot_df['cat'] = tot_df['eff'] < cutoff

    return tot_df

In [None]:
# table passed to compose_df as motif_df (used there to drop sequences with inserted motifs)
ref_df = pd.read_csv('../43_external_validation_motifs/sequence_data_anonymized_no_duplicates.csv')

# load every validation dataset with compose_df and stack the resulting tables
data_df = pd.concat([
    compose_df(dataset_dir, ref_df)
    for dataset_dir in (
        "../data/internal_datasets/validation_GCall_fix",
        "../data/internal_datasets/validation_Erlich_et_al",
        "../data/internal_datasets/validation_Erlich_et_al_internalrepeat",
    )
])

# prefix each sequence id with the dataset path stored in 'group'
data_df = data_df.assign(
    seq_id=data_df['group'] + '_' + data_df['seq_id'].astype(str)
)
data_df

In [None]:
# pivot the dataframe so that each letter of the sequence becomes its own row
# Build the wide table (one column per position) with a single DataFrame call
# instead of creating one pd.Series per sequence inside .apply(), which is very
# slow for large datasets. Shorter sequences are padded with missing values.
idata_df = pd.DataFrame(
    [list(sequence) for sequence in data_df['sequence']],
    index=pd.Index(data_df['seq_id'], name='seq_id'),
).reset_index()
idata_df = idata_df.merge(data_df[['seq_id', 'group', 'cat']], on='seq_id')
idata_df = idata_df.melt(id_vars=['seq_id', 'group', 'cat'], value_name='base', var_name='position')

# drop rows where base is missing (positions past the end of a shorter sequence)
idata_df = idata_df.dropna(subset=['base'])

idata_df

In [None]:
# group by group, cat and position, then get mean ratio of each base and the ratio of GC
def get_ratio(group):
    """Return the fraction of each base in ``group['base']`` plus the combined GC fraction.

    The fractions come from ``value_counts(normalize=True)``. Any of A, T, G, C
    that does not occur is appended with a fraction of 0.0, and a final ``'GC'``
    entry holds the sum of the G and C fractions.
    """
    fractions = group['base'].value_counts(normalize=True)

    # add absent canonical bases as explicit zeros; entries already present are untouched
    for nucleotide in ('A', 'T', 'G', 'C'):
        if nucleotide not in fractions.index:
            fractions[nucleotide] = 0.0

    fractions['GC'] = fractions['G'] + fractions['C']
    return fractions
# Select only the 'base' column before apply: get_ratio reads nothing else, and
# this keeps the grouping columns out of the frames passed to it (applying on
# the grouping columns is deprecated and emits a warning in pandas >= 2.2).
ratio_df = (
    idata_df
    .groupby(['group', 'cat', 'position'])[['base']]
    .apply(get_ratio)
    .reset_index()
)

ratio_df

In [None]:
# use a rolling mean (5 rows, centered) for the proportion of each base
ratio_df['proportion_smooth'] = (
    ratio_df
    .groupby(['group', 'cat', 'base'])['proportion']
    .transform(lambda x: x.rolling(5, center=True, min_periods=1).mean())
)
# keep only the last path component of the dataset path as the group label
ratio_df['group'] = ratio_df['group'].str.split('/').str[-1]

fig = px.line(
    ratio_df.sort_values('position'),
    x='position',
    y='proportion_smooth',
    color='cat',
    facet_row='group',
    facet_col_spacing=0.035,
    facet_row_spacing=0.10,
    facet_col='base',
    category_orders={
        'base': ['A', 'C', 'G', 'T', 'GC'],
        'group': ['validation_GCall_fix', 'validation_Erlich_et_al', 'validation_Erlich_et_al_internalrepeat'],
    },
)


fig.update_yaxes(dtick=0.25, minor_dtick=0.25/2)
fig.update_yaxes(title_text='Proportion', range=[0, 0.75], col=1)
# let every facet have its own x-range
fig.update_xaxes(matches=None)
fig.update_xaxes(title_text='Position', row=1)
fig.for_each_xaxis(lambda xaxis: xaxis.update(showticklabels=True, dtick=50, minor_dtick=25))
# keep only the text after the last '=' in each facet annotation
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

fig.update_layout(
    height=400,
    width=680,
    showlegend=False,
    margin=dict(l=0, r=20, t=20, b=0),
)

fig = plotting.standardize_plot(fig)
fig.show()

# create the output folder first so the export does not fail when it is missing
# (e.g. on a fresh checkout)
Path("./SI_figure_base_analysis").mkdir(parents=True, exist_ok=True)
fig.write_image("./SI_figure_base_analysis/validation_datasets.svg")