# Assembler rubustness - Breadth of Coverage

## Imports

In [14]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np
from statistics import mean

## Global variables

In [15]:
ASSEMBLER_PROCESS_LIST = ["ABYSS", "BCALM2", "GATBMINIAPIPELINE", "METAHIPMER2", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMIZER", "IDBA"]
PROCESS_TO_NAME = {"ABYSS": "ABySS",
                   "BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "METAHIPMER2": "MetaHipMer2",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT", 
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMIZER": "VelvetOptimizer",
                   "IDBA": "IDBA-UD"}

log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EMS', 'LNN', 'ERR2984773']

COLOURS = ['#5876c8', '#58AEC8', '#009392', '#39B185', '#9CCB86', '#E9E29C', '#EEB479', '#E88471', '#CF597E', '#a54765', '#a42a2a', '#835221']

## Load data

In [16]:
_files = glob.glob('../Results/*/*/results/*/stats/*_breadth_of_coverage_contigs.csv')

df_list = []
for f in _files:
    run = f.split('/')[-5]
    sample = f.split('/')[-3]
    assembler = f.split('/')[-1].split('_')[1]
    df = pd.read_csv(f)
    df['run'] = run
    df['Sample'] = sample
    df['Assembler'] = assembler
    df_list.append(df)

df = pd.concat(df_list)
df['distribution'] = np.where(df['Sample'].isin(log_distributed), 'Log', 'Even')
df

Unnamed: 0.1,Unnamed: 0,Reference,Breadth of Coverage,Contigs,run,Sample,Assembler,distribution
0,0,Bacillus_subtilis,0.976159,205,run3,LNN,BCALM2,Log
1,1,Enterococcus_faecalis,0.000000,0,run3,LNN,BCALM2,Log
2,2,Escherichia_coli_plasmid,0.000000,0,run3,LNN,BCALM2,Log
3,3,Escherichia_coli,0.000000,0,run3,LNN,BCALM2,Log
4,4,Lactobacillus_fermentum,0.001233,2,run3,LNN,BCALM2,Log
...,...,...,...,...,...,...,...,...
7,7,Salmonella_enterica,0.977899,145,run2,ERR2984773,metaSPAdes,Even
8,8,Staphylococcus_aureus,0.983494,49,run2,ERR2984773,metaSPAdes,Even
9,9,Staphylococcus_aureus_plasmid1,0.979012,2,run2,ERR2984773,metaSPAdes,Even
10,10,Staphylococcus_aureus_plasmid2,1.000000,1,run2,ERR2984773,metaSPAdes,Even


In [11]:
len(df["Reference"].unique())

12

## Plot data

### Per sample

In [12]:
for sample in sorted(df['Sample'].unique(), key=lambda v: v.upper(), reverse=True):
    print('-' + sample)
    fig=make_subplots(rows=3, cols=4, subplot_titles=sorted(df['Reference'].unique(), key=lambda v: v.upper(), reverse=False),
                     shared_yaxes=True, shared_xaxes=True, x_title="Contigs", y_title="Breadth of Coverage")
    row=1
    col=1
    for reference in sorted(df['Reference'].unique(), key=lambda v: v.upper(), reverse=False):
        print('--' + reference)
        i=0
        print(row, col)
        for assembler in sorted(df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
            print('---' + assembler)
            showlegend=True if col==1 and row==1 else False
            contigs = list(df['Contigs'][(df['Assembler'] == assembler) & (df['Reference'] == reference) & (df['Sample'] == sample)])
            boc = list(df['Breadth of Coverage'][(df['Assembler'] == assembler) & (df['Reference'] == reference) & (df['Sample'] == sample)])
            if contigs: #list not empty
                fig.add_trace(go.Scatter(x=[mean(contigs)],
                                         y=[mean(boc)],
                                         error_y=dict(type='data', # value of error bar given in data coordinates
                                                      symmetric=False,
                                                      array=[max(boc)-mean(boc)],
                                                      arrayminus=[mean(boc)-min(boc)],
                                                      visible=True),
                                         error_x=dict(type='data', # value of error bar given in data coordinates
                                                      symmetric=False,
                                                      array=[max(contigs)-mean(contigs)],
                                                      arrayminus=[mean(contigs)-min(contigs)],
                                                      visible=True),
                                         name=assembler, marker=dict(color=COLOURS[i], size=12), showlegend=showlegend),
                              col=col, row=row)
            i+=1
        if col == 4:
            col = 1
            row += 1
        else:
            col += 1
    fig.update_layout(plot_bgcolor='rgb(255,255,255)', title="{} Genome Coverage variation".format(sample))
    fig.update_xaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', rangemode='tozero')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', range=[0, 1])
    for i in fig['layout']['annotations']:
        i['font']['size'] = 12
    fig.update_layout(legend=dict(
        orientation="h",
        y=-0.1,
        x=0
    ))
    fig.show()
    plot(fig, filename='Plots/Genome Fragmentation/{}.html'.format(sample), auto_open=False)

-LNN
--Bacillus_subtilis
1 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Enterococcus_faecalis
1 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli
1 3
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli_plasmid
1 4
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Lactobacillus_fermentum
2 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Listeria_monocytogenes
2 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
-

-LHS
--Bacillus_subtilis
1 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Enterococcus_faecalis
1 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli
1 3
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli_plasmid
1 4
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Lactobacillus_fermentum
2 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Listeria_monocytogenes
2 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
-

-ERR2984773
--Bacillus_subtilis
1 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Enterococcus_faecalis
1 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli
1 3
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli_plasmid
1 4
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Lactobacillus_fermentum
2 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Listeria_monocytogenes
2 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaS

-ERR2935805
--Bacillus_subtilis
1 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Enterococcus_faecalis
1 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli
1 3
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli_plasmid
1 4
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Lactobacillus_fermentum
2 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Listeria_monocytogenes
2 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaS

-ENN
--Bacillus_subtilis
1 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Enterococcus_faecalis
1 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli
1 3
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli_plasmid
1 4
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Lactobacillus_fermentum
2 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Listeria_monocytogenes
2 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
-

-EMS
--Bacillus_subtilis
1 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Enterococcus_faecalis
1 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli
1 3
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli_plasmid
1 4
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Lactobacillus_fermentum
2 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Listeria_monocytogenes
2 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
-

### All Samples - Even only

In [13]:
fig=make_subplots(rows=3, cols=4, subplot_titles=[x.replace('_', ' ') for x in sorted(df['Reference'].unique(), key=lambda v: v.upper(), reverse=False)],
                 shared_yaxes=True, shared_xaxes=True, x_title="Contigs", y_title="Breadth of Coverage")
row=1
col=1
for reference in sorted(df['Reference'].unique(), key=lambda v: v.upper(), reverse=False):
    print('--' + reference)
    i=0
    print(row, col)
    for assembler in sorted(df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
        print('---' + assembler)
        showlegend=True if col==1 and row==1 else False
        
        contigs = list(df['Contigs'][(df['Assembler'] == assembler) & (df['Reference'] == reference) & (df['distribution'] == 'Even')])
        boc = list(df['Breadth of Coverage'][(df['Assembler'] == assembler) & (df['Reference'] == reference) & (df['distribution'] == 'Even')])
        if contigs: #list not empty
            fig.add_trace(go.Scatter(x=[mean(contigs)],
                                     y=[mean(boc)],
                                     error_y=dict(type='data', # value of error bar given in data coordinates
                                                  symmetric=False,
                                                  array=[max(boc)-mean(boc)],
                                                  arrayminus=[mean(boc)-min(boc)],
                                                  visible=True),
                                     error_x=dict(type='data', # value of error bar given in data coordinates
                                                  symmetric=False,
                                                  array=[max(contigs)-mean(contigs)],
                                                  arrayminus=[mean(contigs)-min(contigs)],
                                                  visible=True),
                                     name=assembler, marker=dict(color=COLOURS[i], opacity=0.6, size=12), showlegend=showlegend),
                          col=col, row=row)
        i+=1
    if col == 4:
        col = 1
        row += 1
    else:
        col += 1
        
fig.update_layout(plot_bgcolor='rgb(255,255,255)', title="Genome fragmentation variation")
fig.update_xaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', rangemode='tozero')
fig.update_yaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', range=[0, 1])
for i in fig['layout']['annotations']:
    i['font']['size'] = 12
fig.update_layout(legend=dict(
    orientation="h",
    y=-0.1,
    x=0
))
fig.show()
plot(fig, filename='Plots/Genome Fragmentation/Figure 5 - Genome fragmentation for even samples.html', auto_open=False)

--Bacillus_subtilis
1 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Enterococcus_faecalis
1 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli
1 3
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Escherichia_coli_plasmid
1 4
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Lactobacillus_fermentum
2 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
--Listeria_monocytogenes
2 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---Met

'Plots/Genome Fragmentation/Figure 5 - Genome fragmentation for even samples.html'