# Assembler rubustness - contigs

## Imports

In [1]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

## Global variables

In [2]:
METRICS_COLUMNS_GLOBAL = ['run','sample','assembler', 'contigs', 'basepairs', 'max_contig', 'Ns', 'n50', 'misassemblies', 'mapped_reads']
ASSEMBLER_PROCESS_LIST = ["BCALM2", "GATBMINIAPIPELINE", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMIZER", "IDBA"]
PROCESS_TO_NAME = {"BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT", 
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMIZER": "VelvetOptimizer",
                   "IDBA": "IDBA-UD"}
log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EHS', 'LNN', 'ERR2984773']


## Load Data

In [3]:
contig_size_files = glob.glob('../Results/*/results/*/stats/*_df.csv')

df_list = []
for f in contig_size_files:
    run = f.split('/')[-5]
    df = pd.read_csv(f)
    df['run'] = run
    df_list.append(df)

contig_size_df = pd.concat(df_list)
contig_size_df['distribution'] = np.where(contig_size_df['Sample'].isin(log_distributed), 'Log', 'Even')

contig_size_df

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
0,0,0,LHS,metaSPAdes,NODE_1_length_780172_cov_40.767116,780172,Listeria_monocytogenes,0,run3,Log
1,1,1,LHS,metaSPAdes,NODE_2_length_768428_cov_88.189521,768428,Bacillus_subtilis,0,run3,Log
2,2,2,LHS,metaSPAdes,NODE_3_length_640048_cov_86.001177,640048,Pseudomonas_aeruginosa,0,run3,Log
3,3,3,LHS,metaSPAdes,NODE_4_length_596012_cov_186.952841,596012,Enterococcus_faecalis,0,run3,Log
4,4,4,LHS,metaSPAdes,NODE_5_length_551579_cov_88.357805,551579,Bacillus_subtilis,0,run3,Log
...,...,...,...,...,...,...,...,...,...,...
2864,2864,2864,EHS,VelvetOptimizer,NODE_45019_length_26613_cov_1259.098511,26643,Escherichia_coli_plasmid,0,run1,Even
2865,2865,2865,EHS,VelvetOptimizer,NODE_45020_length_24201_cov_1261.641724,24231,Escherichia_coli_plasmid,0,run1,Even
2866,2866,2866,EHS,VelvetOptimizer,NODE_45021_length_12186_cov_1257.072754,12216,Escherichia_coli_plasmid,0,run1,Even
2867,2867,2867,EHS,VelvetOptimizer,NODE_45023_length_3888_cov_1256.017456,3918,Escherichia_coli_plasmid,0,run1,Even


## Filter unique contigs

In [4]:
df_all_runs = contig_size_df.groupby('Contig Len').filter(lambda x: len(x) == 3)
df_all_runs

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
0,0,0,LHS,metaSPAdes,NODE_1_length_780172_cov_40.767116,780172,Listeria_monocytogenes,0,run3,Log
1,1,1,LHS,metaSPAdes,NODE_2_length_768428_cov_88.189521,768428,Bacillus_subtilis,0,run3,Log
3,3,3,LHS,metaSPAdes,NODE_4_length_596012_cov_186.952841,596012,Enterococcus_faecalis,0,run3,Log
8,8,8,LHS,metaSPAdes,NODE_9_length_411441_cov_86.930921,411441,Pseudomonas_aeruginosa,0,run3,Log
11,11,11,LHS,metaSPAdes,NODE_12_length_327114_cov_40.278613,327114,Listeria_monocytogenes,0,run3,Log
...,...,...,...,...,...,...,...,...,...,...
2797,2797,2797,EHS,VelvetOptimizer,NODE_12266_length_33291_cov_50.407619,33321,Staphylococcus_aureus,0,run1,Even
2829,2829,2829,EHS,VelvetOptimizer,NODE_12448_length_5482_cov_49.887997,5512,Staphylococcus_aureus,0,run1,Even
2864,2864,2864,EHS,VelvetOptimizer,NODE_45019_length_26613_cov_1259.098511,26643,Escherichia_coli_plasmid,0,run1,Even
2865,2865,2865,EHS,VelvetOptimizer,NODE_45020_length_24201_cov_1261.641724,24231,Escherichia_coli_plasmid,0,run1,Even


### Contigs present in 2 runs

In [5]:
df_2runs = contig_size_df.groupby('Contig Len').filter(lambda x: len(x) == 2)
df_2runs

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
455,455,455,LHS,MINIA,581,21528,Enterococcus_faecalis,0,run3,Log
1171,1171,1171,ERR2935805,MINIA,2673,14315,Bacillus_subtilis,0,run3,Log
135,135,135,ERR2935805,GATBMiniaPipeline,1722,26925,Bacillus_subtilis,0,run3,Log
352,352,352,ERR2935805,GATBMiniaPipeline,4145,30855,Bacillus_subtilis,0,run3,Log
520,520,520,ERR2935805,GATBMiniaPipeline,5853,94504,Bacillus_subtilis,0,run3,Log
...,...,...,...,...,...,...,...,...,...,...
923,923,923,ERR2984773,MINIA,1096,40316,Pseudomonas_aeruginosa,0,run1,Even
994,994,994,ERR2984773,MINIA,1196,12227,Pseudomonas_aeruginosa,0,run1,Even
1122,1122,1122,ERR2984773,MINIA,1381,50041,Pseudomonas_aeruginosa,0,run1,Even
1242,1242,1242,ERR2984773,MINIA,1543,11589,Pseudomonas_aeruginosa,0,run1,Even


### Contigs present in only 1 run

In [6]:
df_1run = contig_size_df.groupby('Contig Len').filter(lambda x: len(x) == 1)
df_1run

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
11,11,11,ERR2935805,MINIA,14,13472,Pseudomonas_aeruginosa,0,run3,Log
1146,1146,1146,ERR2935805,MINIA,2289,23529,Bacillus_subtilis,0,run3,Log
1189,1189,1189,ERR2935805,MINIA,2901,39793,Bacillus_subtilis,0,run3,Log
111,111,111,ERR2935805,GATBMiniaPipeline,1386,32000,Bacillus_subtilis,0,run3,Log
141,141,141,ERR2935805,GATBMiniaPipeline,1807,15516,Bacillus_subtilis,0,run3,Log
238,238,238,ERR2935805,GATBMiniaPipeline,2674,12943,Bacillus_subtilis,0,run3,Log
316,316,316,ERR2935805,GATBMiniaPipeline,3726,12000,Bacillus_subtilis,0,run3,Log
602,602,602,ERR2935805,GATBMiniaPipeline,6775,92240,Bacillus_subtilis,0,run3,Log
712,712,712,ENN,IDBA-UD,contig-100_698,6879,Escherichia_coli,0,run3,Even
135,135,135,ERR2984773,IDBA-UD,contig-100_134,44837,Salmonella_enterica,0,run3,Even


## Plot data

In [18]:
#Just the different contigs
fig_contigs = go.Figure()
for assembler in sorted(contig_size_df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
    showlegend = True if assembler == "MINIA" else False
    #all
    contigs_df_all=df_all_runs[df_all_runs['Assembler'] == assembler]
    if not contigs_df_all.empty:
        coord_list = list(contigs_df_all['Contig Len'])
        fig_contigs.add_trace(go.Box(x=coord_list,  marker_color="gray",name=assembler,showlegend=False))

    #n=1
    contigs_df= df_1run[df_1run['Assembler'] == assembler]
    if not contigs_df.empty:
        coord_list = list(contigs_df['Contig Len'])
        fig_contigs.add_trace(go.Scattergl(x=coord_list,
                                               y=[assembler]*len(coord_list),
                                               mode='markers',
                                               opacity=0.5,
                                               marker_symbol='line-ns',
                                               marker=dict(color='lightseagreen', size=12,
                                                           line=dict(width=5,
                                                                     color='#930001')),
                                               name="Present in 1 run",
                                               showlegend=showlegend))
    #n=2
    contigs_df_2= df_2runs[df_2runs['Assembler'] == assembler]
    if not contigs_df_2.empty:
        coord_list = list(contigs_df_2['Contig Len'])
        fig_contigs.add_trace(go.Scattergl(x=coord_list,
                                               y=[assembler]*len(coord_list),
                                               mode='markers',
                                               opacity=0.5,
                                               marker_symbol='line-ns',
                                               marker=dict(color='orange', size=12,
                                                           line=dict(width=5,
                                                                     color='#009392')),
                                               name="Present in 2 runs",
                                               showlegend=showlegend))

fig_contigs.update_layout(plot_bgcolor='rgb(255,255,255)', title="Inconsistent contigs per assembler over 3 LMAS runs")
fig_contigs.update_xaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', title="Contig Size in Basepairs")
fig_contigs.show()
plot(fig_contigs,auto_open=False, filename="Plots/Contig Size/Unique contigs in 3 runs")

'Plots/Contig Size/Unique contigs in 3 runs.html'

In [28]:
((df_2runs[df_2runs['Assembler']=='GATBMiniaPipeline'].size + df_1run[df_1run['Assembler']=='GATBMiniaPipeline'].size)/contig_size_df[contig_size_df['Assembler']=='GATBMiniaPipeline'].size)*100

0.22844958879074018

In [29]:
((df_2runs[df_2runs['Assembler']=='IDBA-UD'].size + df_1run[df_1run['Assembler']=='IDBA-UD'].size)/contig_size_df[contig_size_df['Assembler']=='IDBA-UD'].size)*100

0.045624898158709466

In [30]:
((df_2runs[df_2runs['Assembler']=='MINIA'].size + df_1run[df_1run['Assembler']=='MINIA'].size)/contig_size_df[contig_size_df['Assembler']=='MINIA'].size)*100

0.12375188594483719

In [32]:
for assembler in contig_size_df['Assembler'].unique():
    print(assembler, contig_size_df[contig_size_df['Assembler']==assembler].size)

metaSPAdes 196950
SPAdes 280770
SKESA 430980
MEGAHIT 197310
BCALM2 527880
GATBMiniaPipeline 131320
MINIA 589890
IDBA-UD 306850
Unicycler 60600
VelvetOptimizer 181210


In [33]:
for assembler in contig_size_df['Assembler'].unique():
    print(assembler, df_2runs[df_2runs['Assembler']==assembler].size)

metaSPAdes 0
SPAdes 0
SKESA 0
MEGAHIT 0
BCALM2 0
GATBMiniaPipeline 180
MINIA 500
IDBA-UD 80
Unicycler 0
VelvetOptimizer 0


In [34]:
for assembler in contig_size_df['Assembler'].unique():
    print(assembler, df_1run[df_1run['Assembler']==assembler].size)

metaSPAdes 0
SPAdes 0
SKESA 0
MEGAHIT 0
BCALM2 0
GATBMiniaPipeline 120
MINIA 230
IDBA-UD 60
Unicycler 0
VelvetOptimizer 0
