# Assembler rubustness - contigs

## Imports

In [25]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

## Global variables

In [26]:
ASSEMBLER_PROCESS_LIST = ["ABYSS", "BCALM2", "GATBMINIAPIPELINE", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMIZER", "IDBA", "METAHIPMER2"]
PROCESS_TO_NAME = {"ABYSS": "ABySS",
                   "BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT",
                   "METAHIPMER2": "MetaHipMer2",
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMIZER": "VelvetOptimizer",
                   "IDBA": "IDBA-UD"}
log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EMS', 'LNN', 'ERR2984773']


## Load Data

In [27]:
contig_size_files = glob.glob('../Results/*/*/results/*/stats/*_df.csv')

df_list = []
for f in contig_size_files:
    run = f.split('/')[-5]
    df = pd.read_csv(f)
    df['run'] = run
    df_list.append(df)

contig_size_df = pd.concat(df_list)
contig_size_df['distribution'] = np.where(contig_size_df['Sample'].isin(log_distributed), 'Log', 'Even')

contig_size_df

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
0,0,0,LNN,MEGAHIT,k141_1,1297,Bacillus_subtilis,0,run3,Log
1,1,1,LNN,MEGAHIT,k141_945,1057,Pseudomonas_aeruginosa,0,run3,Log
2,2,2,LNN,MEGAHIT,k141_477,1599,Pseudomonas_aeruginosa,0,run3,Log
3,3,3,LNN,MEGAHIT,k141_45,4626,Listeria_monocytogenes,0,run3,Log
4,4,4,LNN,MEGAHIT,k141_49,1160,Escherichia_coli,0,run3,Log
...,...,...,...,...,...,...,...,...,...,...
758,758,758,ERR2984773,GATBMiniaPipeline,5392,215478,Pseudomonas_aeruginosa,0,run2,Even
759,759,759,ERR2984773,GATBMiniaPipeline,5393,283087,Salmonella_enterica,0,run2,Even
760,760,760,ERR2984773,GATBMiniaPipeline,5394,263420,Bacillus_subtilis,0,run2,Even
761,761,761,ERR2984773,GATBMiniaPipeline,5395,193260,Enterococcus_faecalis,0,run2,Even


## Filter unique contigs

In [17]:
df_all_runs = contig_size_df.groupby('Contig Len').filter(lambda x: len(x) == 3)
df_all_runs

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
3,3,3,LNN,MEGAHIT,k141_45,4626,Listeria_monocytogenes,0,run3,Log
8,8,8,LNN,MEGAHIT,k141_80,30016,Bacillus_subtilis,0,run3,Log
11,11,11,LNN,MEGAHIT,k141_1392,83817,Bacillus_subtilis,0,run3,Log
14,14,14,LNN,MEGAHIT,k141_1165,104155,Bacillus_subtilis,0,run3,Log
20,20,20,LNN,MEGAHIT,k141_82,122571,Bacillus_subtilis,0,run3,Log
...,...,...,...,...,...,...,...,...,...,...
758,758,758,ERR2984773,GATBMiniaPipeline,5392,215478,Pseudomonas_aeruginosa,0,run2,Even
759,759,759,ERR2984773,GATBMiniaPipeline,5393,283087,Salmonella_enterica,0,run2,Even
760,760,760,ERR2984773,GATBMiniaPipeline,5394,263420,Bacillus_subtilis,0,run2,Even
761,761,761,ERR2984773,GATBMiniaPipeline,5395,193260,Enterococcus_faecalis,0,run2,Even


### Contigs present in 2 runs

In [18]:
df_2runs = contig_size_df.groupby('Contig Len').filter(lambda x: len(x) == 2)
df_2runs

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
10,10,10,LNN,VelvetOptimiser,NODE_106_length_295194_cov_15.789176,295224,Bacillus_subtilis,228,run3,Log
11,11,11,LNN,VelvetOptimiser,NODE_106_length_295194_cov_15.789176,295224,Bacillus_subtilis,228,run3,Log
17,17,17,LNN,VelvetOptimiser,NODE_165_length_169344_cov_15.595870,169374,Bacillus_subtilis,176,run3,Log
18,18,18,LNN,VelvetOptimiser,NODE_165_length_169344_cov_15.595870,169374,Bacillus_subtilis,176,run3,Log
26,26,26,LNN,VelvetOptimiser,NODE_438_length_98582_cov_15.263172,98612,Bacillus_subtilis,523,run3,Log
...,...,...,...,...,...,...,...,...,...,...
391,391,391,ERR2984773,MINIA,458,13742,Lactobacillus_fermentum,0,run2,Even
590,590,590,ERR2984773,MINIA,746,20387,Pseudomonas_aeruginosa,0,run2,Even
636,636,636,ERR2984773,MINIA,805,24773,Staphylococcus_aureus,0,run2,Even
826,826,826,ERR2984773,MINIA,1043,5137,Escherichia_coli,0,run2,Even


### Contigs present in only 1 run

In [19]:
df_1run = contig_size_df.groupby('Contig Len').filter(lambda x: len(x) == 1)
df_1run

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
886,886,886,LNN,ABySS,93243,256771,Listeria_monocytogenes,0,run3,Log
1188,1188,1188,LNN,ABySS,93556,730778,Listeria_monocytogenes,0,run3,Log
1,1,1,LNN,VelvetOptimiser,NODE_15_length_23166_cov_16.622982,23196,Bacillus_subtilis,0,run3,Log
2,2,2,LNN,VelvetOptimiser,NODE_17_length_210216_cov_2883.458252,210246,Listeria_monocytogenes,0,run3,Log
3,3,3,LNN,VelvetOptimiser,NODE_18_length_104398_cov_15.914673,104428,Bacillus_subtilis,106,run3,Log
...,...,...,...,...,...,...,...,...,...,...
710,710,710,ERR2984773,MINIA,892,28107,Enterococcus_faecalis,0,run2,Even
863,863,863,ERR2984773,MINIA,1090,12227,Pseudomonas_aeruginosa,0,run2,Even
1090,1090,1090,ERR2984773,MINIA,1376,19444,Enterococcus_faecalis,0,run2,Even
1445,1445,1445,ERR2984773,MINIA,1990,11589,Pseudomonas_aeruginosa,0,run2,Even


## Plot data

In [20]:
#Just the different contigs
fig_contigs = go.Figure()
for assembler in sorted(contig_size_df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
    showlegend = True if assembler == "MINIA" else False
    #all
    contigs_df_all=df_all_runs[df_all_runs['Assembler'] == assembler]
    if not contigs_df_all.empty:
        coord_list = list(contigs_df_all['Contig Len'])
        fig_contigs.add_trace(go.Box(x=coord_list,  marker_color="gray",name=assembler,showlegend=False))

    #n=1
    contigs_df= df_1run[df_1run['Assembler'] == assembler]
    if not contigs_df.empty:
        coord_list = list(contigs_df['Contig Len'])
        fig_contigs.add_trace(go.Scattergl(x=coord_list,
                                               y=[assembler]*len(coord_list),
                                               mode='markers',
                                               opacity=0.5,
                                               marker_symbol='line-ns',
                                               marker=dict(color='lightseagreen', size=12,
                                                           line=dict(width=5,
                                                                     color='#930001')),
                                               name="Present in 1 run",
                                               showlegend=showlegend))
    #n=2
    contigs_df_2= df_2runs[df_2runs['Assembler'] == assembler]
    if not contigs_df_2.empty:
        coord_list = list(contigs_df_2['Contig Len'])
        fig_contigs.add_trace(go.Scattergl(x=coord_list,
                                               y=[assembler]*len(coord_list),
                                               mode='markers',
                                               opacity=0.5,
                                               marker_symbol='line-ns',
                                               marker=dict(color='orange', size=12,
                                                           line=dict(width=5,
                                                                     color='#009392')),
                                               name="Present in 2 runs",
                                               showlegend=showlegend))

fig_contigs.update_layout(plot_bgcolor='rgb(255,255,255)', title="Inconsistent contigs per assembler over 3 LMAS runs")
fig_contigs.update_xaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', title="Contig Size in Basepairs")
fig_contigs.show()
plot(fig_contigs,auto_open=False, filename="Plots/Contig Size/Unique contigs in 3 runs")


Your filename `Plots/Contig Size/Unique contigs in 3 runs` didn't end with .html. Adding .html to the end of your file.



'Plots/Contig Size/Unique contigs in 3 runs.html'

In [8]:
((df_2runs[df_2runs['Assembler']=='GATBMiniaPipeline'].size + df_1run[df_1run['Assembler']=='GATBMiniaPipeline'].size)/contig_size_df[contig_size_df['Assembler']=='GATBMiniaPipeline'].size)*100

0.3221873911529084

In [9]:
((df_2runs[df_2runs['Assembler']=='IDBA-UD'].size + df_1run[df_1run['Assembler']=='IDBA-UD'].size)/contig_size_df[contig_size_df['Assembler']=='IDBA-UD'].size)*100

0.08339651250947687

In [10]:
((df_2runs[df_2runs['Assembler']=='MINIA'].size + df_1run[df_1run['Assembler']=='MINIA'].size)/contig_size_df[contig_size_df['Assembler']=='MINIA'].size)*100

0.14367507987905562

In [13]:
((df_2runs[df_2runs['Assembler']=='VelvetOptimiser'].size + df_1run[df_1run['Assembler']=='VelvetOptimiser'].size)/contig_size_df[contig_size_df['Assembler']=='VelvetOptimiser'].size)*100

1.6409236411101098

In [21]:
((df_2runs[df_2runs['Assembler']=='ABySS'].size + df_1run[df_1run['Assembler']=='ABySS'].size)/contig_size_df[contig_size_df['Assembler']=='ABySS'].size)*100

9.22695035460993

In [22]:
for assembler in contig_size_df['Assembler'].unique():
    print(assembler, contig_size_df[contig_size_df['Assembler']==assembler].size)

MEGAHIT 182190
ABySS 141000
MINIA 466330
metaSPAdes 184470
SPAdes 307710
Unicycler 51240
BCALM2 333870
IDBA-UD 263800
SKESA 235530
GATBMiniaPipeline 114840
VelvetOptimiser 321770
MetaHipMer2 233650


In [23]:
for assembler in contig_size_df['Assembler'].unique():
    print(assembler, df_2runs[df_2runs['Assembler']==assembler].size)

MEGAHIT 20
ABySS 10900
MINIA 410
metaSPAdes 0
SPAdes 0
Unicycler 0
BCALM2 0
IDBA-UD 160
SKESA 0
GATBMiniaPipeline 250
VelvetOptimiser 1100
MetaHipMer2 160


In [24]:
for assembler in contig_size_df['Assembler'].unique():
    print(assembler, df_1run[df_1run['Assembler']==assembler].size)

MEGAHIT 10
ABySS 2110
MINIA 260
metaSPAdes 0
SPAdes 0
Unicycler 0
BCALM2 0
IDBA-UD 60
SKESA 0
GATBMiniaPipeline 120
VelvetOptimiser 4180
MetaHipMer2 100


In [None]:
# COMMENTS - HOW TO ACCOUNT FOR FAILED RUNS