In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

import sys
sys.path.append('..')
import plotting

## Read in population distributions for GCall and normalize each column by its mean

In [None]:
# Load the per-experiment abundance table. seq_id is read as a string so it can
# be prefixed with the pool name and used as an unnamed index.
pop_df = (
    pd.read_csv(
        "../data/internal_datasets/GCall/abundance_by_experiment.csv",
        dtype={'seq_id': str},
    )
    .assign(seq_id=lambda frame: "GCall_" + frame["seq_id"])
    .set_index('seq_id')
    .rename_axis(None)
)

# Normalize: divide every column by its own mean.
column_means = pop_df.mean(axis=0)
pop_df = pop_df.div(column_means, axis=1)
pop_df

## Read in parameter estimates for GCall

In [None]:
# Load the parameter estimates, tag every row with the pool name in a "GC"
# column, and index the rows by the pool-prefixed sequence id (index left unnamed).
params_df = (
    pd.read_csv(
        "../data/internal_datasets/GCall/params.csv",
        dtype={'seq_id': str},
    )
    .assign(
        GC="GCall",
        seq_id=lambda frame: "GCall_" + frame['seq_id'],
    )
    .set_index("seq_id")
    .rename_axis(None)
)
params_df

## Join coverage data and parameter estimates

In [None]:
# Inner join on the index: keep only the ids present in both the coverage
# table and the parameter table.
df = pd.merge(
    pop_df,
    params_df,
    left_index=True,
    right_index=True,
    how="inner",
)
df

# Plot coverage distributions

In [None]:
# Reshape to long format: one row per (sequence, PCR column).
pcr_columns = [f'PCR{i}' for i in range(1, 7)]
plot_df = df.reset_index().melt(
    id_vars=['index', "GC", "eff", "x0"],
    value_vars=pcr_columns,
    var_name='PCR',
    value_name='x',
)

# n_cycles is the number embedded in the PCR column name, times 15.
pcr_number = plot_df['PCR'].str.extract(r'(\d+)', expand=False).astype(int)
plot_df['n_cycles'] = pcr_number * 15

# Missing coverage values are replaced by zero.
plot_df['x'] = plot_df['x'].fillna(0)

# Only PCR1 and PCR6 are kept for the plot.
plot_df = plot_df[plot_df['PCR'].isin(["PCR1", "PCR6"])]

plot_df

In [None]:
# Overlaid histograms of relative coverage, one trace per n_cycles value.
fig = px.histogram(
    plot_df,
    x='x',
    color="n_cycles",
    color_discrete_map={15: '#737373', 90: '#3182bd'},
    barmode="overlay",
    opacity=0.6,
    facet_col_spacing=0.05,
    range_x=[0, 2.5],
    range_y=[0, 1250],
)

# Shared bin definition and borderless bars for every histogram trace.
fig.update_traces(
    selector=dict(type='histogram'),
    xbins=dict(start=0.0, end=2.5, size=0.05),
    marker=dict(line_width=0),
)

fig.update_layout(
    width=180,
    height=160,
    margin=dict(l=0, r=10, t=5, b=0),
    showlegend=False,
)
fig.update_xaxes(title_text='Relative coverage', dtick=1, minor_dtick=0.25)
# The figure has a single subplot, so the title and tick settings go on the same axis.
fig.update_yaxes(title_text='Number of sequences', dtick=500, minor_dtick=250)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./figure_2_model_data/cov_dist.svg")

# also export data
export_columns = ['index', 'PCR', 'n_cycles', 'x']
plot_df[export_columns].to_csv("./figure_2_model_data/cov_dist.csv", index=False)

# Plot dropout frequency

In [None]:
# Reshape to long format: one row per (sequence, PCR column). n_cycles is the
# number embedded in the PCR column name times 15, and missing coverage
# values are replaced by zero.
plot_df = pd.melt(
    df.reset_index(),
    id_vars=['index', "GC", "eff", "x0"],
    value_vars=[f'PCR{i}' for i in range(1, 6+1)],
    var_name='PCR',
    value_name='x',
)
plot_df['n_cycles'] = plot_df['PCR'].str.extract(r'(\d+)', expand=False).astype(int)*15
plot_df['x'] = plot_df['x'].fillna(0)

# Output column label -> coverage threshold. For every n_cycles value the
# summary holds the fraction of rows whose coverage is strictly below each
# threshold.
dropout_thresholds = {'<0.05': 0.05, '<0.1': 0.1, '<0.2': 0.2, '<0.3': 0.3}

dfs = []
# sort=False keeps the n_cycles groups in order of first appearance.
for n_cycles, x in plot_df.groupby('n_cycles', sort=False)['x']:
    n = len(x)
    d = {label: (x < threshold).sum()/n for label, threshold in dropout_thresholds.items()}
    d['n_cycles'] = n_cycles
    dfs.append(d)

plot_df = pd.DataFrame(dfs)
plot_df

In [None]:
# One line per dropout threshold column, colored from dark to light.
threshold_colors = {
    "<0.05": '#08519c',
    "<0.1": '#3182bd',
    "<0.2": '#6baed6',
    "<0.3": '#9ecae1',
}

fig = px.line(
    plot_df,
    x="n_cycles",
    y=list(threshold_colors),
    color_discrete_map=threshold_colors,
    markers=True,
    range_x=[13, 92],
    range_y=[0, 0.035],
)

fig.update_layout(
    width=180,
    height=160,
    margin=dict(l=0, r=10, t=5, b=0),
    showlegend=False,
)
fig.update_xaxes(title_text='Number of PCR cycles', dtick=15, minor_dtick=5)
# The figure has a single subplot, so the title and tick settings go on the same axis.
fig.update_yaxes(
    title_text='Fraction of sequences',
    tickformat=".0%",
    dtick=0.01,
    minor_dtick=0.005,
)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./figure_2_model_data/dropout.svg")

# also export data
plot_df.to_csv("./figure_2_model_data/dropout.csv", index=False)

# Plot efficiency distributions

In [None]:
# Histogram of the 'eff' column of the joined table.
fig = px.histogram(
    df,
    x='eff',
    color_discrete_sequence=["#de2d26"],
    facet_col_spacing=0.05,
    range_x=[0.94, 1.02],
    range_y=[0, 1250],
)

# Bins span a wider range than the visible x-range; bars are drawn borderless.
fig.update_traces(
    selector=dict(type='histogram'),
    xbins=dict(start=0.70, end=1.1, size=0.001),
    marker=dict(line_width=0),
)

fig.update_layout(
    width=180,
    height=160,
    margin=dict(l=0, r=10, t=5, b=0),
    showlegend=False,
)
fig.update_xaxes(title_text='Relative PCR efficiency', dtick=0.02, minor_dtick=0.01)
# The figure has a single subplot, so the title and tick settings go on the same axis.
fig.update_yaxes(title_text='Number of sequences', dtick=500, minor_dtick=250)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./figure_2_model_data/eff_dist.svg")

# also export data
export_columns = ['eff', 'x0']
df[export_columns].to_csv("./figure_2_model_data/eff_dist.csv", index=True)

In [None]:
from plotly.subplots import make_subplots


def _eff_bar(bin_width):
    """Return a borderless red go.Bar of df['eff'] counts placed at the bin centers.

    Bins run from 0.70 to 1.1 (np.arange, end exclusive) in steps of bin_width.
    """
    counts, edges = np.histogram(df['eff'], bins=np.arange(0.70, 1.1, bin_width))
    centers = 0.5 * (edges[:-1] + edges[1:])
    return go.Bar(
        x=centers,
        y=counts,
        marker_line=dict(width=0, color='black'),
        marker_color='#de2d26',
    )


# Left panel uses coarse bins, right panel uses fine bins.
bar1 = _eff_bar(0.01)
bar2 = _eff_bar(0.002)

# x-range boundaries: the left panel shows intervals[0]..intervals[1],
# the right panel shows intervals[1]..intervals[2].
intervals = [0.775, 0.928, 0.97]
fig = make_subplots(
    rows=1,
    cols=2,
    horizontal_spacing=0.03,
    shared_yaxes=True,
    column_widths=[0.35, 0.65]
)
# add_trace(row=, col=) replaces the deprecated append_trace.
fig.add_trace(bar1, row=1, col=1)
fig.add_trace(bar2, row=1, col=2)

fig.update_layout(
    barmode='group',
    bargap=0.0,
    bargroupgap=0.0,
    height=100,
    width=75,
    margin=dict(l=15, r=0, t=10, b=0),
    showlegend=False,
    template="simple_white",
    font_family="Inter",
    legend_font_size=20/3,
)

# Settings shared by both panels.
fig.update_xaxes(
    title_text='',
    minor_ticks="outside",
    title_font_family="Inter",
    title_font_size=20/3,
    tickfont_size=20/3,
)
fig.update_yaxes(
    range=[0, 20],
    title_text='',
    dtick=10,
    minor_dtick=5,
    minor_ticks="outside",
    title_font_family="Inter",
    title_font_size=20/3,
    tickfont_size=20/3,
)

# Per-panel x-ranges and tick spacing; the right panel's y-axis is hidden.
fig.update_xaxes(
    range=[intervals[0], intervals[1]],
    dtick=0.1,
    minor_dtick=0.025,
    row=1,
    col=1,
)
fig.update_xaxes(
    range=[intervals[1], intervals[2]],
    dtick=0.02,
    minor_dtick=0.01,
    row=1,
    col=2,
)
fig.update_yaxes(visible=False, row=1, col=2)

fig.for_each_annotation(lambda a: a.update(
    font_size=20/3,
    font_family="Inter",
))

fig.show()
fig.write_image("./figure_2_model_data/eff_dist_inset.svg")

# Plot initial abundance distributions

### Collect reference data for biased pool from Chen et al. (DOI: 10.1038/s41467-020-16958-3)

In [None]:
# Histogram the reference values into unit-width bins over [0, 199).
counts36 = np.load("./run36.npy")
mean_count = counts36.mean()
hist_counts, bin_edges = np.histogram(counts36, bins=np.arange(0, 200, 1))

# Bin centers, divided by the mean to convert to relative coverage.
bins = (bin_edges[:-1] + bin_edges[1:]) / 2
bins = bins / mean_count
# Width of one reference bin after the same conversion.
ref_stepsize = 1 / mean_count
# Fraction of the histogrammed values falling into each bin.
density = hist_counts / hist_counts.sum()

In [None]:
exp_stepsize = 0.07
# NOTE(review): hard-coded and not derived from df; presumably the number of
# sequences in the pool. Confirm it matches the number of rows being histogrammed.
total_seqs = 12000

# Histogram of the 'x0' column of the joined table.
fig = px.histogram(
    df,
    x='x0',
    color_discrete_sequence=["#de2d26"],
    facet_col_spacing=0.05,
    range_x=[0, 2.5],
    range_y=[0, 1250],
)
# Fixed bin width and borderless bars for the histogram trace.
fig.update_traces(
    selector=dict(type='histogram'),
    xbins=dict(start=0, end=5, size=exp_stepsize),
    marker=dict(line_width=0),
)
fig.update_layout(
    width=180,
    height=160,
    margin=dict(l=0, r=10, t=5, b=0),
    showlegend=False,
)
fig.update_xaxes(title_text='Relative initial abundance', dtick=1, minor_dtick=0.25)
fig.update_yaxes(title_text='Number of sequences', dtick=500, minor_dtick=250)

# Reference curve: the per-bin fractions are rescaled by the ratio of bin widths
# and by total_seqs so they are comparable to the histogram counts.
reference_curve = go.Scatter(
    x=bins,
    y=density*(exp_stepsize/ref_stepsize*total_seqs), # we correct the density to match the histogram
    mode="lines",
    line=dict(color="black", width=1, dash="dot"),
)
fig.add_trace(reference_curve)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./figure_2_model_data/x0_dist.svg")

# also export data
# internal data was already exported with the efficiency plot above