# LMAS Global Assembly Success Metrics Analysis

The success of an assembly is evaluated by the computation of metrics in two defined ways: globally through statistics inherent to the complete set of sequences that were assembled, and relative to the replicons present in the sample. 

The global metrics are computed with custom Python code from the complete set of assembled sequences. 

The following metrics are computed for the complete and filtered set of assembled sequences, restricted to contigs of length above a specified minimum size: 

- **Contig sizes**
    - **Contigs:** The total number of contigs in the assembly;
    - **Basepairs:** The total number of bases in the assembly;
    - **Maximum sequence length:** The length of the largest contig in the assembly;
    - **Number of ‘N’s:** The number of uncalled bases (N's) in the assembly.
- **Contiguity**
    - **Nx (where 0 < x ⩽ 100):** The length for which the collection of all assembled sequences of that length or longer covers at least x% of the total length of the assembly.
- **Misassembly**
    - **Misassemblies:** The number of aligned contigs that contain a misassembly event.


## Imports

In [4]:
import csv
import fnmatch
import glob
import json
import sys
from itertools import groupby
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import plot
from plotly.subplots import make_subplots

## Global variables

In [5]:
# Column order of the global metrics dataframe built from the pipeline reports.
METRICS_COLUMNS_GLOBAL = [
    'run', 'sample', 'assembler',
    'contigs', 'basepairs', 'max_contig', 'Ns', 'n50',
    'misassemblies', 'mapped_reads',
]

# Pipeline process identifier -> display name of the corresponding assembler.
PROCESS_TO_NAME = dict(
    BCALM2="BCALM2",
    GATBMINIAPIPELINE="GATBMiniaPipeline",
    MINIA="MINIA",
    MEGAHIT="MEGAHIT",
    METASPADES="metaSPAdes",
    UNICYCLER="Unicycler",
    SPADES="SPAdes",
    SKESA="SKESA",
    VELVETOPTIMIZER="VelvetOptimizer",
    IDBA="IDBA-UD",
)

# Process identifiers, in the insertion order of the mapping above.
ASSEMBLER_PROCESS_LIST = list(PROCESS_TO_NAME)

## Load Data

### Create Dataframe from Table Data

In [7]:
# Gather one record per 'GlobalTable' row of every pipeline report and build
# the dataframe once at the end. Growing the frame with DataFrame.append inside
# the loop copies the whole frame on every row and no longer works on
# pandas >= 2.0, where the method was removed. Building from records also lets
# pandas infer numeric dtypes instead of leaving every column as object.
# Sorting makes the row order independent of the filesystem listing order.
report_glob = sorted(glob.glob('../Results/*/report/pipeline_report_tables.json'))
global_metrics_records = []

for pipeline_report_file in report_glob:
    report_path = Path(pipeline_report_file)
    report_file_name = report_path.name
    # Reports are matched as <results>/<run>/report/<file>, so the run name is
    # the third path component from the end.
    stats_run = report_path.parts[-3]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))

    with open(pipeline_report_file) as _fh:
        json_report = json.load(_fh)

    for sample, sample_report in json_report.items():
        for line in sample_report['GlobalTable']:
            # All metrics come from the 'original' entry except the
            # misassembly count, which is read from the 'filtered' entry.
            original = line['original']
            global_metrics_records.append({'run': stats_run,
                                           'sample': sample,
                                           'assembler': line['assembler'],
                                           'contigs': original['contigs'],
                                           'basepairs': original['basepairs'],
                                           'max_contig': original['max_contig_size'],
                                           'n50': original['N50'],
                                           'mapped_reads': original['mapped_reads'],
                                           'Ns': original['Ns'],
                                           'misassemblies': line['filtered']['misassembled_contigs']})

# `columns=` fixes the column order and still yields the expected (empty)
# frame when no report file was found.
global_pipeline_metrics_df = pd.DataFrame(global_metrics_records, columns=METRICS_COLUMNS_GLOBAL)


Processing pipeline_report_tables.json data from run1...


### Classify Data - Log and Even distribution

In [8]:
# Samples grouped by the abundance distribution of their community.
log_distributed = ['LHS', 'LNN', 'ERR2935805']
# NOTE(review): this list originally repeated 'LNN', which already belongs to
# the log-distributed samples above. It is assumed to be a typo for 'ENN'
# (mirroring the EHS/LHS pair) - confirm against the sample sheet.
even_distribution = ['EHS', 'ENN', 'ERR2984773']

# Every sample that is not log-distributed is labelled 'Even', so report any
# sample missing from both lists instead of letting it pass unnoticed.
unclassified_samples = sorted(set(global_pipeline_metrics_df['sample'])
                              - set(log_distributed) - set(even_distribution))
if unclassified_samples:
    print('Warning: samples not listed in either distribution, labelled as Even: {0}'.format(
        ', '.join(str(sample) for sample in unclassified_samples)))

global_pipeline_metrics_df['distribution'] = np.where(global_pipeline_metrics_df['sample'].isin(log_distributed), 'Log', 'Even')
display(global_pipeline_metrics_df)

Unnamed: 0,run,sample,assembler,contigs,basepairs,max_contig,Ns,n50,misassemblies,mapped_reads,distribution
0,run1,EHS,BCALM2,395209,45781136,16662,0,593,0,55.799825,Even
1,run1,EHS,GATBMiniaPipeline,456,30458023,1303074,40477,277593,45,96.583665,Even
2,run1,EHS,IDBA-UD,1808,30395802,585610,0,61823,14,76.598401,Even
3,run1,EHS,MEGAHIT,1076,30628130,768389,0,173935,10,97.558251,Even
4,run1,EHS,metaSPAdes,819,30472093,1032828,0,193712,1,97.046228,Even
5,run1,EHS,MINIA,17451,30676700,161812,0,18179,0,79.521533,Even
6,run1,EHS,SKESA,2009,30081658,474592,0,44029,8,91.59523,Even
7,run1,EHS,SPAdes,7786,30958558,847495,0,196423,2,86.864229,Even
8,run1,EHS,Unicycler,705,30388464,882570,0,242222,1,91.164218,Even
9,run1,EHS,VelvetOptimizer,15622,31184863,846904,25898,27836,25,71.201116,Even


## Plot Statistics

In [19]:
# One entry per panel, in the order the traces are added:
# (dataframe column, subplot title, y-axis title or None, row, col).
# Column 1 holds the size/contiguity metrics, column 2 the remaining ones;
# the 'mapped_reads' panel spans rows 3-4 of column 2 (see `specs` below).
metric_panels = [
    ('contigs', 'Contigs', '#', 1, 1),
    ('basepairs', 'Basepairs', 'Basepairs', 2, 1),
    ('max_contig', 'Largest Contig', 'Basepairs', 3, 1),
    ('n50', 'N50', 'Basepairs', 4, 1),
    ('Ns', 'Ns', '#', 1, 2),
    ('misassemblies', 'misassemblies', '#', 2, 2),
    # NOTE(review): mapped_reads looks like a percentage - confirm before
    # giving this panel a y-axis title.
    ('mapped_reads', 'Mapped Reads', None, 3, 2),
]


def add_metric_traces(fig, metrics_df, column, row, col, showlegend):
    """Add the split Log/Even violins and the overall-mean line of one metric.

    Args:
        fig: figure created by `make_subplots` that receives the traces.
        metrics_df: dataframe with 'assembler', 'distribution' and `column`.
        column: name of the metric column to plot.
        row, col: 1-based grid position of the target subplot.
        showlegend: whether these traces get their own legend entries.
    """
    # Each distribution is drawn as one half of a split violin per assembler.
    for distribution, side, fillcolor in (('Log', 'negative', 'lightseagreen'),
                                          ('Even', 'positive', 'orange')):
        subset = metrics_df[metrics_df['distribution'] == distribution]
        fig.add_trace(go.Violin(y=subset[column], x=subset['assembler'],
                                legendgroup=distribution, scalegroup=distribution, name=distribution,
                                box_visible=True, line_color='black', meanline_visible=True,
                                fillcolor=fillcolor, opacity=0.6, side=side,
                                showlegend=showlegend, spanmode='hard'),
                      row=row, col=col)
    # Horizontal reference line at the mean of the metric over all rows.
    fig.add_trace(go.Scatter(y=[metrics_df[column].mean()] * len(metrics_df),
                             x=metrics_df['assembler'], mode='lines', name='mean',
                             line=dict(color="crimson"), opacity=0.6, showlegend=showlegend),
                  row=row, col=col)


# make_subplots assigns subplot_titles in row-major order, so the titles are
# derived from the panel positions; a hand-written list previously put
# 'Basepairs' on the Ns panel and 'Ns' on the basepairs panel.
fig_global = make_subplots(rows=4, cols=2, shared_xaxes=True, x_title="Assembler",
                           subplot_titles=[title for _, title, _, _, _ in
                                           sorted(metric_panels, key=lambda panel: (panel[3], panel[4]))],
                           specs=[[{}, {}], [{}, {}], [{}, {'rowspan': 2}], [{}, None]])

for column, _, _, row_coord, col_coord in metric_panels:
    # Only the first panel contributes legend entries; the other panels reuse
    # the same legend groups.
    add_metric_traces(fig_global, global_pipeline_metrics_df, column, row_coord, col_coord,
                      showlegend=(row_coord == 1 and col_coord == 1))

fig_global.update_layout(plot_bgcolor='rgb(255,255,255)', title_text="Global Assembly Success Metrics",
                         violingap=0, violinmode='overlay')
# grid: applied to every x-axis instead of only the first four
fig_global.update_xaxes(gridcolor='#DCDCDC')
# y-axis legends: addressed by grid position so each title follows its panel
# (indexing yaxis4 directly labelled the misassemblies panel as 'Basepairs')
for _, _, y_title, row_coord, col_coord in metric_panels:
    if y_title is not None:
        fig_global.update_yaxes(title_text=y_title, row=row_coord, col=col_coord)

fig_global.show()

In [21]:
# Export the figure as a standalone HTML page without opening it in a browser.
# The call is left as the cell's last expression so its result is displayed.
plot(
    fig_global,
    filename='Global Assembly Success Metrics.html',
    auto_open=False,
)

'Global Assembly Success Metrics.html'