# 4. Assembler rubustness - Breadth of Coverage

## Imports

In [1]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np
from statistics import mean

## Global variables

In [2]:
ASSEMBLER_PROCESS_LIST = ["ABYSS", "BCALM2", "GATBMINIAPIPELINE", "METAHIPMER2", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMIZER", "IDBA"]
PROCESS_TO_NAME = {"ABYSS": "ABySS",
                   "BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "METAHIPMER2": "MetaHipMer2",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT", 
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMIZER": "VelvetOptimizer",
                   "IDBA": "IDBA-UD"}

REFERENCE_TO_NAME = {"Bacillus_subtilis":"<i>Bacillus subtilis</i>",
                    "Enterococcus_faecalis":"<i>Enterococcus faecalis</i>",
                    "Escherichia_coli":"<i>Escherichia coli</i>",
                    "Escherichia_coli_plasmid": np.nan,
                    "Lactobacillus_fermentum": "<i>Lactobacillus fermentum</i>",
                    "Listeria_monocytogenes": "<i>Listeria monocytogenes</i>",
                    "Pseudomonas_aeruginosa": "<i>Pseudomonas aeruginosa</i>",
                    "Salmonella_enterica": "<i>Salmonella enterica</i>",
                    "Staphylococcus_aureus": "<i>Staphylococcus aureus</i>",
                    "Staphylococcus_aureus_plasmid1": np.nan,
                    "Staphylococcus_aureus_plasmid2": np.nan,
                    "Staphylococcus_aureus_plasmid3": np.nan}

skipped_assemblers = ['ABySS', 'BCALM2', 'MINIA', 'VelvetOptimiser', 'MetaHipMer2']

log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EMS', 'LNN', 'ERR2984773']

COLOURS = ['#5876c8', '#58AEC8', '#39B185', '#9CCB86', '#EEB479', '#E88471', '#a54765', '#a42a2a', '#835221']

## Load data

In [3]:
_files = glob.glob('../Results/*/*/results/*/stats/*_breadth_of_coverage_contigs.csv')

df_list = []
for f in _files:
    run = f.split('/')[-5]
    sample = f.split('/')[-3]
    assembler = f.split('/')[-1].split('_')[1]
    if assembler not in skipped_assemblers:
        df = pd.read_csv(f)
        df['run'] = run
        df['Sample'] = sample
        df['Assembler'] = assembler
        df_list.append(df)

df = pd.concat(df_list)
df['distribution'] = np.where(df['Sample'].isin(log_distributed), 'Log', 'Even')
df = df[df.distribution != 'Log']
df['Reference'] = df['Reference'].map(REFERENCE_TO_NAME)
df = df.dropna()
df

Unnamed: 0.1,Unnamed: 0,Reference,Breadth of Coverage,Contigs,run,Sample,Assembler,distribution
0,0,<i>Bacillus subtilis</i>,0.980668,30,run1,ENN,SKESA,Even
1,1,<i>Enterococcus faecalis</i>,0.983352,52,run1,ENN,SKESA,Even
3,3,<i>Escherichia coli</i>,0.957014,270,run1,ENN,SKESA,Even
4,4,<i>Lactobacillus fermentum</i>,0.931404,94,run1,ENN,SKESA,Even
5,5,<i>Listeria monocytogenes</i>,0.972489,24,run1,ENN,SKESA,Even
...,...,...,...,...,...,...,...,...
4,4,<i>Lactobacillus fermentum</i>,0.936384,95,run2,ERR2984773,metaSPAdes,Even
5,5,<i>Listeria monocytogenes</i>,0.976109,25,run2,ERR2984773,metaSPAdes,Even
6,6,<i>Pseudomonas aeruginosa</i>,0.989600,100,run2,ERR2984773,metaSPAdes,Even
7,7,<i>Salmonella enterica</i>,0.977899,145,run2,ERR2984773,metaSPAdes,Even


In [4]:
len(df["Reference"].unique())

8

In [5]:
min(df["Breadth of Coverage"])

0.9224167114095017

## Explore data

In [6]:
for assembler in df['Assembler'].unique():
    print('->'+assembler)
    for reference in df['Reference'].unique():
        print('  -'+reference)
        print(df[(df['Assembler']==assembler) & (df['Reference']==reference)].describe())

->SKESA
  -<i>Bacillus subtilis</i>
       Unnamed: 0  Breadth of Coverage    Contigs
count         9.0             9.000000   9.000000
mean          0.0             0.981254  30.000000
std           0.0             0.000448   3.464102
min           0.0             0.980668  26.000000
25%           0.0             0.980668  26.000000
50%           0.0             0.981446  30.000000
75%           0.0             0.981648  34.000000
max           0.0             0.981648  34.000000
  -<i>Enterococcus faecalis</i>
       Unnamed: 0  Breadth of Coverage    Contigs
count         9.0             9.000000   9.000000
mean          1.0             0.984697  54.666667
std           0.0             0.002226  14.000000
min           1.0             0.983078  40.000000
25%           1.0             0.983078  40.000000
50%           1.0             0.983352  52.000000
75%           1.0             0.987661  72.000000
max           1.0             0.987661  72.000000
  -<i>Escherichia coli</i>
     

max           1.0             0.991199  46.000000
  -<i>Escherichia coli</i>
       Unnamed: 0  Breadth of Coverage     Contigs
count         9.0             9.000000    9.000000
mean          3.0             0.970434  187.666667
std           0.0             0.001546    8.674676
min           3.0             0.968535  180.000000
25%           3.0             0.968539  180.000000
50%           3.0             0.970683  184.000000
75%           3.0             0.972081  199.000000
max           3.0             0.972081  199.000000
  -<i>Lactobacillus fermentum</i>
       Unnamed: 0  Breadth of Coverage  Contigs
count         9.0             9.000000      9.0
mean          4.0             0.942588     87.0
std           0.0             0.001488      3.0
min           4.0             0.940607     83.0
25%           4.0             0.940710     83.0
50%           4.0             0.943223     89.0
75%           4.0             0.943839     89.0
max           4.0             0.943938     89.

       Unnamed: 0  Breadth of Coverage    Contigs
count         9.0             9.000000   9.000000
mean          6.0             0.989541  69.666667
std           0.0             0.000845   7.211103
min           6.0             0.988865  63.000000
25%           6.0             0.988865  63.000000
50%           6.0             0.989098  67.000000
75%           6.0             0.990660  79.000000
max           6.0             0.990660  79.000000
  -<i>Salmonella enterica</i>
       Unnamed: 0  Breadth of Coverage     Contigs
count         9.0             9.000000    9.000000
mean          7.0             0.975659   87.000000
std           0.0             0.004888   21.857493
min           7.0             0.969381   60.000000
25%           7.0             0.969381   60.000000
50%           7.0             0.977280   91.000000
75%           7.0             0.980315  110.000000
max           7.0             0.980315  110.000000
  -<i>Staphylococcus aureus</i>
       Unnamed: 0  Breadth of

## Plot data

### All Samples

In [12]:
fig=make_subplots(rows=4, cols=2, subplot_titles=[x.replace('_', ' ') for x in sorted(df['Reference'].unique(), key=lambda v: v.upper(), reverse=False)],
                 shared_yaxes=True, shared_xaxes=True, x_title="<b>Contigs", y_title="<b>Breadth of Coverage")
row=1
col=1

shape_map = {"ENN": "circle",
             "EMS": "square",
             "ERR2984773": "diamond"}
for reference in sorted(df['Reference'].unique()):
    print('--' + reference)
    i=0
    print(row, col)
    for assembler in sorted(df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
        print('---' + assembler)
        showlegend=True if col==1 and row==1 else False
        
        contigs = list(df['Contigs'][(df['Assembler'] == assembler) & (df['Reference'] == reference) & (df['distribution'] == 'Even')])
        boc = list(df['Breadth of Coverage'][(df['Assembler'] == assembler) & (df['Reference'] == reference) & (df['distribution'] == 'Even')])
        sample = list(df['Sample'][(df['Assembler'] == assembler) & (df['Reference'] == reference) & (df['distribution'] == 'Even')])
        run = list(df['run'][(df['Assembler'] == assembler) & (df['Reference'] == reference) & (df['distribution'] == 'Even')])
        if contigs: #list not empty
            for x, y, z, r in zip(contigs, boc, sample, run):
                print(x, y, z, r)
                fig.add_trace(go.Scatter(x=[x],
                                         y=[y],
                                         name=assembler, marker=dict(color=COLOURS[i], size=12, symbol=shape_map[z],opacity=0.5,
                                                                     line=dict(color='black',width=1)), 
                                         showlegend=False, opacity=0.6),
                              col=col, row=row)
        i+=1
    if row == 4:
        row = 1
        col += 1
    else:
        row += 1

fig.update_layout(plot_bgcolor='rgb(255,255,255)')
fig.update_layout(title=dict(text='Genome fragmentation variation <br><sup>ZymoBIOMICS Microbial Community Standard bacterial reference replicons</sup><br>',
                             x=0.5,
                             y=0.98,
                             xanchor='center',
                             yanchor='top'),
                 font=dict(size=18))
fig.update_xaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', rangemode='tozero', type='log', range=[0,3])
fig.update_yaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', range=[0.9, 1])

for i in fig['layout']['annotations']:
    i['font']['size'] = 18

fig.update_layout(xaxis_showticklabels=True, xaxis2_showticklabels=True, xaxis3_showticklabels=True, xaxis4_showticklabels=True, xaxis5_showticklabels=True,  xaxis6_showticklabels=True,  xaxis7_showticklabels=True,  xaxis8_showticklabels=True)
fig.update_layout(yaxis_showticklabels=True, yaxis2_showticklabels=True, yaxis3_showticklabels=True, yaxis4_showticklabels=True, yaxis5_showticklabels=True,  yaxis6_showticklabels=True,  yaxis7_showticklabels=True,  yaxis8_showticklabels=True)


# just for display purpose, create traces so that legend contains colors.  does not connect graph

legend_trace = [go.Bar(name=assembler, x=[fig.data[0].x[0]], marker_color=c, showlegend=True,  legendgroup="Assemblers", legendgrouptitle_text="Assembler")
        for assembler,c in zip(sorted(df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True),COLOURS)]
    
legend_trace = legend_trace + [go.Scatter(name=sample, x=[fig.data[0].x[0]], mode="markers", marker=dict(symbol=c, color="black", size=12), showlegend=True, legendgroup="Samples", legendgrouptitle_text="Sample")
                                for sample,c in shape_map.items()]
    
fig.update_traces(showlegend=False).add_traces(legend_trace)

fig.show()
plot(fig, filename='Plots/Genome Fragmentation/Figure 6 - Genome fragmentation for even samples.html', auto_open=False)

--<i>Bacillus subtilis</i>
1 1
---Unicycler
22 0.982219786700718 ENN run1
20 0.9827551729908244 EMS run1
18 0.982525050813498 ERR2984773 run1
22 0.982219786700718 ENN run3
20 0.9827551729908244 EMS run3
18 0.982525050813498 ERR2984773 run3
22 0.982219786700718 ENN run2
20 0.9827551729908244 EMS run2
18 0.982525050813498 ERR2984773 run2
---SPAdes
22 0.984425597990151 ENN run1
22 0.9840948746031876 EMS run1
33 0.982951184684294 ERR2984773 run1
22 0.984425597990151 ENN run3
22 0.9840948746031876 EMS run3
33 0.982951184684294 ERR2984773 run3
22 0.984425597990151 ENN run2
22 0.9840948746031876 EMS run2
33 0.982951184684294 ERR2984773 run2
---SKESA
30 0.9806675125077954 ENN run1
26 0.9816483125074 EMS run1
34 0.98144587420103 ERR2984773 run1
30 0.9806675125077954 ENN run3
26 0.9816483125074 EMS run3
34 0.98144587420103 ERR2984773 run3
30 0.9806675125077954 ENN run2
26 0.9816483125074 EMS run2
34 0.98144587420103 ERR2984773 run2
---metaSPAdes
18 0.9857267399251102 ENN run1
23 0.98405631492578

20 0.9894403781385952 EMS run3
19 0.9891523094619532 ERR2984773 run3
19 0.9777234687746252 ENN run2
19 0.9776248837866794 EMS run2
19 0.9891523094619532 ERR2984773 run2
---IDBA-UD
26 0.9866372226169337 ENN run1
25 0.9747298270050684 EMS run1
171 0.9743084179548996 ERR2984773 run1
26 0.9866372226169337 ENN run3
25 0.9747298270050684 EMS run3
171 0.9743084179548996 ERR2984773 run3
26 0.9866372226169337 ENN run2
25 0.9747298270050684 EMS run2
171 0.9743084179548996 ERR2984773 run2
---GATBMiniaPipeline
20 0.9862442194107492 ENN run1
20 0.987434257180496 EMS run1
24 0.988414425891158 ERR2984773 run1
19 0.9743575433556726 ENN run3
19 0.9755896886117964 EMS run3
23 0.9765511428840687 ERR2984773 run3
20 0.9862442194107492 ENN run2
19 0.9755743160374049 EMS run2
24 0.988414425891158 ERR2984773 run2
--<i>Pseudomonas aeruginosa</i>
2 2
---Unicycler
77 0.9870233042269736 ENN run1
60 0.9898850615326404 EMS run1
65 0.9884684636936072 ERR2984773 run1
77 0.9870233042269736 ENN run3
60 0.98988506153264

'Plots/Genome Fragmentation/Figure 6 - Genome fragmentation for even samples.html'