# Assembler rubustness - contigs

## Imports

In [1]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

## Global variables

In [2]:
METRICS_COLUMNS_GLOBAL = ['run','sample','assembler', 'contigs', 'basepairs', 'max_contig', 'Ns', 'n50', 'misassemblies', 'mapped_reads']
ASSEMBLER_PROCESS_LIST = ["BCALM2", "GATBMINIAPIPELINE", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMIZER", "IDBA"]
PROCESS_TO_NAME = {"BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT", 
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMIZER": "VelvetOptimizer",
                   "IDBA": "IDBA-UD"}
log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EHS', 'LNN', 'ERR2984773']


## Load Data

In [7]:
contig_size_files = glob.glob('../Results/*/results/*/stats/*_df.csv')

df_list = []
for f in contig_size_files:
    run = f.split('/')[-5]
    df = pd.read_csv(f)
    df['run'] = run
    df_list.append(df)

contig_size_df = pd.concat(df_list)
contig_size_df['distribution'] = np.where(contig_size_df['Sample'].isin(log_distributed), 'Log', 'Even')

contig_size_df

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
0,0,0,LNN,BCALM2,4489,1087,Salmonella_enterica,0,run1,Log
1,1,1,LNN,BCALM2,4505,1461,Salmonella_enterica,0,run1,Log
2,2,2,LNN,BCALM2,4508,1467,Escherichia_coli,0,run1,Log
3,3,3,LNN,BCALM2,4534,1483,Salmonella_enterica,0,run1,Log
4,4,4,LNN,BCALM2,4549,1024,Pseudomonas_aeruginosa,0,run1,Log
...,...,...,...,...,...,...,...,...,...,...
1460,1460,1460,ERR2984773,MEGAHIT,k141_15489,1584,Unmapped,0,run2,Even
1461,1461,1461,ERR2984773,MEGAHIT,k141_15515,1097,Unmapped,0,run2,Even
1462,1462,1462,ERR2984773,MEGAHIT,k141_15533,32821,Unmapped,0,run2,Even
1463,1463,1463,ERR2984773,MEGAHIT,k141_15535,1533,Unmapped,0,run2,Even


## Filter unique contigs

### Contigs present in 2 runs

In [11]:
df_2runs = contig_size_df.groupby('Contig Len').filter(lambda x: len(x) == 2)
df_2runs

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
46,46,46,ERR2935805,GATBMiniaPipeline,737,7965,Bacillus_subtilis,0,run1,Log
289,289,289,ERR2935805,GATBMiniaPipeline,3250,44007,Bacillus_subtilis,0,run1,Log
340,340,340,ERR2935805,GATBMiniaPipeline,3894,94504,Bacillus_subtilis,0,run1,Log
406,406,406,ERR2935805,GATBMiniaPipeline,4479,5327,Bacillus_subtilis,0,run1,Log
479,479,479,ERR2935805,GATBMiniaPipeline,5389,18494,Bacillus_subtilis,0,run1,Log
...,...,...,...,...,...,...,...,...,...,...
2567,2567,2567,ERR2984773,MINIA,3798,9618,Enterococcus_faecalis,0,run2,Even
3342,3342,3342,ERR2984773,MINIA,6045,6483,Enterococcus_faecalis,0,run2,Even
46,46,46,ERR2984773,IDBA-UD,contig-100_46,74178,Salmonella_enterica,0,run2,Even
196,196,196,ERR2984773,IDBA-UD,contig-100_194,35618,Bacillus_subtilis,0,run2,Even


### Contigs present in only 1 run

In [13]:
df_1run = contig_size_df.groupby('Contig Len').filter(lambda x: len(x) == 1)
df_1run

Unnamed: 0.1,Unnamed: 0,index,Sample,Assembler,Contig,Contig Len,Mapped,#N,run,distribution
59,59,59,ERR2935805,GATBMiniaPipeline,861,43789,Bacillus_subtilis,0,run1,Log
92,92,92,ERR2935805,GATBMiniaPipeline,1310,15099,Bacillus_subtilis,0,run1,Log
99,99,99,ERR2935805,GATBMiniaPipeline,1343,31281,Bacillus_subtilis,0,run1,Log
255,255,255,ERR2935805,MINIA,300,13924,Pseudomonas_aeruginosa,0,run1,Log
1238,1238,1238,ERR2935805,MINIA,4603,25278,Bacillus_subtilis,0,run1,Log
36,36,36,ERR2984773,MINIA,39,67015,Listeria_monocytogenes,0,run1,Even
90,90,90,ERR2984773,MINIA,101,21653,Enterococcus_faecalis,0,run1,Even
315,315,315,ERR2984773,MINIA,368,28107,Enterococcus_faecalis,0,run1,Even
347,347,347,ERR2984773,MINIA,404,14733,Pseudomonas_aeruginosa,0,run1,Even
846,846,846,ERR2984773,MINIA,993,24800,Pseudomonas_aeruginosa,0,run1,Even


## Plot data

In [92]:
#Just the different contigs
fig_contigs = go.Figure()
for assembler in sorted(contig_size_df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
    showlegend = True if assembler == "MINIA" else False
    #n=1
    contigs_df= df_1run[df_1run['Assembler'] == assembler]
    if not contigs_df.empty:
        coord_list = list(contigs_df['Contig Len'])
        fig_contigs.add_trace(go.Scattergl(x=coord_list,
                                               y=[assembler]*len(coord_list),
                                               mode='markers',
                                               opacity=0.7,
                                               marker_symbol='line-ns',
                                               marker=dict(color='lightseagreen', size=12,
                                                           line=dict(width=5,
                                                                     color='lightseagreen')),
                                               name="Present in 1 run",
                                               showlegend=showlegend))
    #n=2
    contigs_df_2= df_2runs[df_2runs['Assembler'] == assembler]
    if not contigs_df_2.empty:
        coord_list = list(contigs_df_2['Contig Len'])
        fig_contigs.add_trace(go.Scattergl(x=coord_list,
                                               y=[assembler]*len(coord_list),
                                               mode='markers',
                                               opacity=0.7,
                                               marker_symbol='line-ns',
                                               marker=dict(color='orange', size=12,
                                                           line=dict(width=5,
                                                                     color='orange')),
                                               name="Present in 2 runs",
                                               showlegend=showlegend))

fig_contigs.update_layout(plot_bgcolor='rgb(255,255,255)', title="Inconsistent contigs per assembler over 3 LMAS runs")
fig_contigs.update_xaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', title="Contig Size in Basepairs")
fig_contigs.show()
plot(fig_contigs,auto_open=False, filename="Plots/Contig Size/Unique contigs in 3 runs")

'Plots/Contig Size/Unique contigs in 3 runs.html'