In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

import sys
sys.path.append('..')
import plotting

# Colour used for each validation dataset in every figure below
# (passed to plotly as color_discrete_map).
dataset_colors = {
    'validation_GCall_fix': '#de2d26',
    'validation_Erlich_et_al': '#3182bd',
    'validation_Erlich_et_al_internalrepeat': '#bdd7e7',
}
# Dataset identifiers, in the insertion order of the colour map; the loading
# loops iterate over this list.
datasets = list(dataset_colors)

## Read in population distributions for all validation experiments

In [None]:
# Maps dataset name -> Series holding, per column of the abundance table,
# half of the column mean of the raw values.
# NOTE(review): the factor 1/2 is not explained in this notebook -- confirm
# what it accounts for.
coverage_by_dataset_by_ex = {}

pop_dfs = []
for exp in datasets:
    csv_path = f"../data/internal_datasets/{exp}/abundance_by_experiment.csv"
    abundance = pd.read_csv(csv_path, dtype={'seq_id': str})

    # Prefix every sequence id with its dataset name and use it as an
    # unnamed index (the per-dataset frames are concatenated below).
    abundance['seq_id'] = exp + "_" + abundance['seq_id']
    abundance = abundance.set_index('seq_id')
    abundance.index.name = None

    column_means = abundance.mean(axis=0)
    coverage_by_dataset_by_ex[exp] = column_means / 2

    # Divide each column by its own mean, then tag the rows with the dataset.
    abundance = abundance.div(column_means, axis=1)
    abundance['group'] = exp
    pop_dfs.append(abundance)

pop_df = pd.concat(pop_dfs)
pop_df

In [None]:
# Show the per-dataset coverage values (half the column means) collected while loading.
coverage_by_dataset_by_ex

## Read in parameter estimates for the datasets

In [None]:
# Load the parameter estimates of every dataset and stack them into one table.
params_dfs = []
for exp in datasets:
    csv_path = f"../data/internal_datasets/{exp}/params.csv"
    estimates = pd.read_csv(csv_path, dtype={'seq_id': str})

    # Same id scheme as the abundance tables: "<dataset>_<seq_id>" as an
    # unnamed index, so both tables can be joined on it.
    estimates['seq_id'] = exp + "_" + estimates['seq_id']
    estimates = estimates.set_index('seq_id')
    estimates.index.name = None
    params_dfs.append(estimates)

params_df = pd.concat(params_dfs)
params_df

## Join coverage data and parameter estimates

In [None]:
# Inner join on the index of both tables (the dataset-prefixed sequence id):
# only ids present in both the abundance data and the parameter estimates
# are kept.
df = pd.merge(
    pop_df,
    params_df,
    left_index=True,
    right_index=True,
    how="inner",
)
df

## Plot coverage distributions

In [None]:
# Columns PCR1 ... PCR6 are melted into long format (one row per sequence and PCR).
pcr_columns = [f'PCR{i}' for i in range(1, 7)]

plot_df = df.reset_index().melt(
    id_vars=['index', "eff", "x0", "group"],
    value_vars=pcr_columns,
    var_name='PCR',
    value_name='x',
)

# Cycle count = 15 x the number embedded in the PCR column name.
pcr_number = plot_df['PCR'].str.extract(r'(\d+)', expand=False).astype(int)
plot_df['n_cycles'] = pcr_number * 15

# Missing values are treated as zero.
plot_df['x'] = plot_df['x'].fillna(0)

plot_df

In [None]:
# Grid of coverage histograms: one column per cycle count, one row per dataset.
fig = px.histogram(
    plot_df, 
    x='x', 
    facet_col="n_cycles",
    facet_row="group",
    color="group",
    facet_col_spacing=0.02,
    facet_row_spacing=0.075,
    color_discrete_map=dataset_colors,
    range_x=[0, 2.5], 
    range_y=[0, 2000], 
)
fig.update_traces(xbins=dict(start=0.0, end=2.5, size=0.1), selector=dict(type='histogram'))
fig.update_layout(
    height=300, 
    width=640, 
    margin=dict(l=0, r=10, t=20, b=0),
    showlegend=False,
)
# Facet titles arrive as "<name>=<value>": keep the value and append the unit.
# The inner quotes are single quotes on purpose -- reusing the outer double
# quotes inside an f-string only parses on Python >= 3.12.
# NOTE(review): this rewrites every annotation, so the row ("group=...")
# labels also receive the " cycles" suffix -- confirm that is intended.
fig.for_each_annotation(lambda a: a.update(text=f"{a.text.split('=')[-1]} cycles"))
fig.update_xaxes(
    dtick=1,
    minor_dtick=0.25
)
fig.update_xaxes(title_text='Coverage', row=1)
fig.update_yaxes(
    title_text='', 
    dtick=500, 
    minor_dtick=250
)
fig.update_yaxes(title_text='# Sequences', col=1)
fig.update_traces(marker=dict(line_width=0), selector=dict(type='histogram')) 

# Label each panel with "n = <coverage value>" from the loading step.
# The row index is derived from the number of datasets instead of a literal 3;
# counting down from the last row puts the first dataset at the highest row
# index (plotly express appears to number facet rows from the bottom -- verify
# if the facet layout changes).
# NOTE(review): data.iloc[j] assumes the first six entries of each coverage
# Series correspond to the six facet columns in order -- TODO confirm.
n_rows = len(coverage_by_dataset_by_ex)
for i, (exp, data) in enumerate(coverage_by_dataset_by_ex.items()):
    for j in range(6):
        fig.add_annotation(
            x=2, 
            y=1500, 
            text=f"n = {data.iloc[j]:.0f}", 
            showarrow=False,
            col=j+1,
            row=n_rows-i, 
        )



fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./SI_figure_externalval_pool_comparison/cov_dist.svg")

# Export the long-format data behind the figure alongside the image.
plot_df.to_csv("./SI_figure_externalval_pool_comparison/cov_dist.csv", index=False)

## Plot efficiency distributions

In [None]:
# Work on a copy of the joined table for the efficiency plots below.
plot_df = df.copy()

plot_df

In [None]:
# Histogram of the `eff` column, one facet row per dataset.
fig = px.histogram(
    plot_df,
    x='eff',
    facet_row="group",
    facet_row_spacing=0.05,
    color="group",
    color_discrete_map=dataset_colors,
)

# Fixed bins of width 0.001 between 0.70 and 1.1, drawn without bar outlines.
fig.update_traces(
    xbins=dict(start=0.70, end=1.1, size=0.001),
    marker=dict(line_width=0),
    selector=dict(type='histogram'),
)
fig.update_layout(
    width=300,
    height=300,
    showlegend=False,
    margin=dict(l=0, r=10, t=5, b=0),
)

# Tick spacing on every axis; titles only on row 1 (x) and column 1 (y).
fig.update_xaxes(dtick=0.05, minor_dtick=0.025)
fig.update_xaxes(title_text='Relative PCR efficiency', row=1)
fig.update_yaxes(title_text='', dtick=500, minor_dtick=250)
fig.update_yaxes(title_text='# Sequences', col=1)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./SI_figure_externalval_pool_comparison/eff_dist.svg")

In [None]:
# One small inset per dataset: the same efficiency histogram, restricted to
# x in [0.9, 1.0] and y in [0, 100], without titles or facet labels.
for exp in datasets:
    group_rows = plot_df.loc[plot_df['group'] == exp]
    fig = px.histogram(
        group_rows,
        x='eff',
        facet_row="group",
        facet_row_spacing=0.05,
        color="group",
        color_discrete_map=dataset_colors,
    )
    fig.update_traces(
        xbins=dict(start=0.70, end=1.1, size=0.001),
        marker=dict(line_width=0),
        selector=dict(type='histogram'),
    )
    fig.update_layout(
        width=175,
        height=75,
        showlegend=False,
        margin=dict(l=0, r=5, t=10, b=20),
    )

    fig.update_xaxes(range=[0.9, 1.0], dtick=0.05, minor_dtick=0.01)
    fig.update_xaxes(title_text='', row=1)
    fig.update_yaxes(range=[0, 100], title_text='', dtick=100, minor_dtick=25)
    fig.update_yaxes(title_text='', col=1)

    # Blank out the facet label annotations.
    fig.for_each_annotation(lambda a: a.update(text=""))

    fig = plotting.standardize_plot(fig)
    fig.show()
    fig.write_image(f"./SI_figure_externalval_pool_comparison/eff_dist_inset_{exp}.svg")

In [None]:
# Fraction of validation_Erlich_et_al sequences whose `eff` is below 0.98.
idf = df[df['group'] == 'validation_Erlich_et_al'].copy()
n_below = int((idf['eff'] < 0.98).sum())
n_below / len(idf)

## Plot initial abundance distributions

In [None]:
# Work on a copy of the joined table for the initial-abundance plot below.
plot_df = df.copy()

plot_df

In [None]:
# Histogram of the `x0` column, one facet row per dataset.
fig = px.histogram(
    plot_df,
    x='x0',
    facet_row="group",
    facet_row_spacing=0.05,
    color="group",
    color_discrete_map=dataset_colors,
)

# Fixed bins of width 0.05 between 0 and 4, drawn without bar outlines.
fig.update_traces(
    xbins=dict(start=0, end=4, size=0.05),
    marker=dict(line_width=0),
    selector=dict(type='histogram'),
)
fig.update_layout(
    width=300,
    height=300,
    showlegend=False,
    margin=dict(l=0, r=10, t=5, b=0),
)

# Tick spacing on every axis; titles only on row 1 (x) and column 1 (y).
fig.update_xaxes(dtick=1, minor_dtick=0.5)
fig.update_xaxes(title_text='Relative initial abundance', row=1)
fig.update_yaxes(title_text='', dtick=500, minor_dtick=250)
fig.update_yaxes(title_text='# Sequences', col=1)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./SI_figure_externalval_pool_comparison/x0_dist.svg")

## Get correlations between experiments

In [None]:
corr_df = df.reset_index()
# Recover the bare sequence id so the same sequence lines up across datasets.
# The index was built as "<group>_<seq_id>", so strip exactly that prefix:
# keeping only the text after the last underscore would truncate any seq_id
# that itself contains an underscore. Labels that do not carry the expected
# prefix fall back to the last underscore-separated token.
corr_df['index'] = [
    label[len(group) + 1:] if label.startswith(f"{group}_") else label.split('_')[-1]
    for label, group in zip(corr_df['index'], corr_df['group'])
]

### Initial abundance

In [None]:
# Wide table: one row per sequence id, one column per dataset, values = x0.
x0_by_group = corr_df.pivot(index='index', columns='group', values='x0')
# Pairwise Spearman rank correlation between the dataset columns.
x0_by_group.corr(method='spearman')

### Efficiency

In [None]:
# Wide table: one row per sequence id, one column per dataset, values = eff.
eff_by_group = corr_df.pivot(index='index', columns='group', values='eff')
# Pairwise Spearman rank correlation between the dataset columns.
eff_by_group.corr(method='spearman')