# Assembler rubustness - Contiguity

## Imports

In [7]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np
from statistics import mean

## Global variables

In [3]:
METRICS_COLUMNS_GLOBAL = ['run','sample','assembler', 'contigs', 'basepairs', 'max_contig', 'Ns', 'n50', 'misassemblies', 'mapped_reads']
ASSEMBLER_PROCESS_LIST = ["BCALM2", "GATBMINIAPIPELINE", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMIZER", "IDBA"]
PROCESS_TO_NAME = {"BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT", 
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMIZER": "VelvetOptimizer",
                   "IDBA": "IDBA-UD"}
log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EHS', 'LNN', 'ERR2984773']
COLOURS = ['#004B93', "#009392", "#39B185", "#9CCB86", "#E9E29C", "#EEB479", "#E88471","#CF597E",'lightgray', 'darkgray']

## Lx Metric

### Load data

In [4]:
_files = glob.glob('../Results/*/results/*/stats/*_lx.csv')

df_list = []
for f in _files:
    run = f.split('/')[-5]
    sample = f.split('/')[-3]
    assembler = f.split('/')[-1].split('_')[1]
    df = pd.read_csv(f)
    df['run'] = run
    df['Sample'] = sample
    df['Assembler'] = assembler
    df_list.append(df)

df = pd.concat(df_list)
df['distribution'] = np.where(df['Sample'].isin(log_distributed), 'Log', 'Even')
df

Unnamed: 0.1,Unnamed: 0,Reference,Assembler,Lx,nContigs,run,Sample,distribution
0,0,Bacillus_subtilis,SPAdes,0.000000,0.0,run1,LNN,Log
1,1,Bacillus_subtilis,SPAdes,0.111111,0.0,run1,LNN,Log
2,2,Bacillus_subtilis,SPAdes,0.222222,1.0,run1,LNN,Log
3,3,Bacillus_subtilis,SPAdes,0.333333,2.0,run1,LNN,Log
4,4,Bacillus_subtilis,SPAdes,0.444444,4.0,run1,LNN,Log
...,...,...,...,...,...,...,...,...
115,115,Staphylococcus_aureus_plasmid3,SKESA,0.555556,0.0,run2,ERR2984773,Even
116,116,Staphylococcus_aureus_plasmid3,SKESA,0.666667,0.0,run2,ERR2984773,Even
117,117,Staphylococcus_aureus_plasmid3,SKESA,0.777778,0.0,run2,ERR2984773,Even
118,118,Staphylococcus_aureus_plasmid3,SKESA,0.888889,0.0,run2,ERR2984773,Even


### Create Average, Upper bound and Lower bound variables for plot

In [23]:
plot_df = pd.DataFrame(columns=['Reference', 'Assembler', 'Sample', "Average", "Upper bound", "Lower bound", 'Lx'])

for sample in sorted(df['Sample'].unique(), key=lambda v: v.upper(), reverse=True):
    for reference in sorted(df['Reference'].unique(), key=lambda v: v.upper(), reverse=True):
        for assembler in sorted(df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
            for x in df['Lx'].unique():
                contigs = list(df['nContigs'][(df['Assembler'] == assembler) & (df['Reference'] == reference) & (df['Sample'] == sample) & (df['Lx'] == x)])
                if contigs:
                    avg = mean(contigs) if len(set(contigs)) > 1 else contigs[0]
                    upper_bound = max(contigs)
                    lower_bound = min(contigs)
                    plot_df = plot_df.append({'Reference':reference, 'Assembler': assembler, 'Sample': sample, "Average": avg, "Upper bound": upper_bound, "Lower bound": lower_bound, 'Lx': x}, ignore_index=True)

plot_df

Unnamed: 0,Reference,Assembler,Sample,Average,Upper bound,Lower bound,Lx
0,Staphylococcus_aureus_plasmid3,Unicycler,LNN,0.0,0.0,0.0,0.000000
1,Staphylococcus_aureus_plasmid3,Unicycler,LNN,0.0,0.0,0.0,0.111111
2,Staphylococcus_aureus_plasmid3,Unicycler,LNN,0.0,0.0,0.0,0.222222
3,Staphylococcus_aureus_plasmid3,Unicycler,LNN,0.0,0.0,0.0,0.333333
4,Staphylococcus_aureus_plasmid3,Unicycler,LNN,0.0,0.0,0.0,0.444444
...,...,...,...,...,...,...,...
6835,Bacillus_subtilis,BCALM2,EHS,1000.0,1000.0,1000.0,0.555556
6836,Bacillus_subtilis,BCALM2,EHS,1374.0,1374.0,1374.0,0.666667
6837,Bacillus_subtilis,BCALM2,EHS,,,,0.777778
6838,Bacillus_subtilis,BCALM2,EHS,,,,0.888889


### Plot data

In [35]:
for sample in sorted(plot_df['Sample'].unique(), key=lambda v: v.upper(), reverse=True):
    print('-' + sample)
    fig=make_subplots(rows=6, cols=2, column_titles=sorted(df['Reference'].unique(), key=lambda v: v.upper(), reverse=True),
                     shared_yaxes=True, shared_xaxes=True, x_title="Lx", y_title="Contigs")
    row=1
    col=1
    for reference in sorted(plot_df['Reference'].unique(), key=lambda v: v.upper(), reverse=True):
        print('--' + reference)
        i=0
        print(row, col)
        for assembler in sorted(plot_df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
            print('---' + assembler)
            showlegend=True if col==1 and row==1 else False
            x = list(plot_df['Lx'][(plot_df['Assembler'] == assembler) & (plot_df['Reference'] == reference) & (plot_df['Sample'] == sample)])
            if x:
                y=list(plot_df['Average'][(plot_df['Assembler'] == assembler) & (plot_df['Reference'] == reference) & (plot_df['Sample'] == sample)])
                up_bound=list(plot_df['Upper bound'][(plot_df['Assembler'] == assembler) & (plot_df['Reference'] == reference) & (plot_df['Sample'] == sample)])
                low_bound=list(plot_df['Lower bound'][(plot_df['Assembler'] == assembler) & (plot_df['Reference'] == reference) & (plot_df['Sample'] == sample)])

                fig.add_trace(go.Scatter(name=assembler,
                                         x=x,
                                         y=y,
                                         mode='lines',
                                         line=dict(color=COLOURS[i]),
                                         showlegend=showlegend), col=col, row=row)
                fig.add_trace(go.Scatter(name='Upper Bound',
                                         x=x,
                                         y=up_bound,
                                         mode='lines',
                                         marker=dict(color=COLOURS[i]),line=dict(width=0),showlegend=False),col=col, row=row)
                fig.add_trace(go.Scatter(name='Lower Bound',
                                         x=x,
                                         y=low_bound,
                                         marker=dict(color=COLOURS[i]),
                                         line=dict(width=0),
                                         mode='lines',
                                         fillcolor=COLOURS[i],
                                         fill='tonexty',showlegend=False),col=col, row=row)
            i+=1
        if col == 2:
            col = 1
            row += 1
        else:
            col += 1
    fig.update_layout(plot_bgcolor='rgb(255,255,255)', title="{} Lx metric variation".format(sample))
    fig.update_xaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', rangemode='tozero')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', range=[0, 1])
    for i in fig['layout']['annotations']:
        i['font']['size'] = 12
    fig.show()
    plot(fig, filename='Plots/Contiguity/Lx/{}.html'.format(sample), auto_open=False)

-LNN
--Staphylococcus_aureus_plasmid3
1 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid2
1 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid1
2 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus
2 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Salmonella_enterica
3 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Pseudomonas_aeruginosa
3 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Listeria_monocytogenes
4 1
---VelvetO

-LHS
--Staphylococcus_aureus_plasmid3
1 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid2
1 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid1
2 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus
2 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Salmonella_enterica
3 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Pseudomonas_aeruginosa
3 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Listeria_monocytogenes
4 1
---VelvetO

-ERR2984773
--Staphylococcus_aureus_plasmid3
1 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid2
1 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid1
2 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus
2 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Salmonella_enterica
3 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Pseudomonas_aeruginosa
3 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Listeria_monocytogenes
4 1
---

-ERR2935805
--Staphylococcus_aureus_plasmid3
1 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid2
1 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid1
2 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus
2 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Salmonella_enterica
3 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Pseudomonas_aeruginosa
3 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Listeria_monocytogenes
4 1
---

-ENN
--Staphylococcus_aureus_plasmid3
1 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid2
1 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid1
2 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus
2 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Salmonella_enterica
3 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Pseudomonas_aeruginosa
3 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Listeria_monocytogenes
4 1
---VelvetO

-EHS
--Staphylococcus_aureus_plasmid3
1 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid2
1 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus_plasmid1
2 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Staphylococcus_aureus
2 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Salmonella_enterica
3 1
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Pseudomonas_aeruginosa
3 2
---VelvetOptimizer
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
--Listeria_monocytogenes
4 1
---VelvetO