# Assembler rubustness - Table Metrics

In [17]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

In [18]:
METRICS_COLUMNS_GLOBAL = ['run','sample','assembler', 'contigs', 'basepairs', 'max_contig', 'Ns', 'n50', 'misassemblies', 'mapped_reads']
ASSEMBLER_PROCESS_LIST = ["BCALM2", "GATBMINIAPIPELINE", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMIZER", "IDBA"]
PROCESS_TO_NAME = {"BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT", 
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMIZER": "VelvetOptimizer",
                   "IDBA": "IDBA-UD"}
log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EHS', 'LNN', 'ERR2984773']
METRICS_COLUMNS_REFERENCE = ['run','sample','assembler', 'reference', 'contiguity', 'breadth_of_coverage', 'multiplicity', 'validity', 
                             'parsimony', 'identity', 'lowest_identity', 'L90', 'contigs', 'NA50', 'NG50', 'basepairs', 'Ns',
                             'misassemblies']

## Global Metrics

In [19]:
report_glob = glob.glob('../Results/*/report/pipeline_report_tables.json')
global_pipeline_metrics_df = pd.DataFrame(columns=METRICS_COLUMNS_GLOBAL)

for pipeline_report_file in report_glob:
    report_file_name = pipeline_report_file.split('/')[-1]
    stats_run = pipeline_report_file.split('/')[-3]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))
    
    with open(pipeline_report_file) as _fh:
        json_report = json.load(_fh)
        for sample in json_report.keys():
            for line in json_report[sample]['GlobalTable']:
                assembler = line['assembler']
                global_pipeline_metrics_df = global_pipeline_metrics_df.append({'run': stats_run,
                                                                                'sample': sample,
                                                                                'assembler': line['assembler'],
                                                                                'contigs': line['original']['contigs'],
                                                                                'basepairs': line['original']['basepairs'],
                                                                                'max_contig': line['original']['max_contig_size'],
                                                                                'n50': line['original']['N50'],
                                                                                'mapped_reads': line['original']['mapped_reads'],
                                                                                'Ns': line['original']['Ns'],
                                                                                'misassemblies': line['filtered']['misassembled_contigs']},
                                                                               ignore_index=True)
global_pipeline_metrics_df['distribution'] = np.where(global_pipeline_metrics_df['sample'].isin(log_distributed), 'Log', 'Even')
display(global_pipeline_metrics_df)

Processing pipeline_report_tables.json data from run1...
Processing pipeline_report_tables.json data from run3...
Processing pipeline_report_tables.json data from run2...


Unnamed: 0,run,sample,assembler,contigs,basepairs,max_contig,Ns,n50,misassemblies,mapped_reads,distribution
0,run1,ERR2984773,BCALM2,3508016,170402348,994,0,49,0,8.149584,Even
1,run1,ERR2984773,GATBMiniaPipeline,5402,32811505,768462,0,102115,2,83.144556,Even
2,run1,ERR2984773,IDBA-UD,20194,39779364,303329,0,20993,4,83.671518,Even
3,run1,ERR2984773,MEGAHIT,20614,41166407,1205839,0,88308,17,84.223697,Even
4,run1,ERR2984773,metaSPAdes,61637,48961158,473277,0,42332,2,84.325334,Even
...,...,...,...,...,...,...,...,...,...,...,...
175,run2,LHS,MINIA,25156,30955684,161812,0,17610,0,65.535796,Log
176,run2,LHS,SKESA,24071,26344064,35242,0,1960,1,82.984681,Log
177,run2,LHS,SPAdes,8910,31037639,847495,0,177625,2,86.861447,Log
178,run2,LHS,Unicycler,571,27485372,1301952,0,242300,5,86.915658,Log


In [20]:
df = pd.DataFrame(columns=['Sample', 'Assembler', 'Contigs', "Basepairs", "Max Contig Size", "N", 'N50', 'Misassemblies', "Mapped Reads"])

for sample in sorted(global_pipeline_metrics_df['sample'].unique(), key=lambda v: v.upper(), reverse=True):
    for assembler in sorted(global_pipeline_metrics_df['assembler'].unique(), key=lambda v: v.upper(), reverse=True):
        
        #contigs
        contigs = global_pipeline_metrics_df['contigs'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler)]
        diff_contigs = np.diff(contigs)
        diff_contigs = (round(min(diff_contigs),2), round(max(diff_contigs),2))
        if len(set(diff_contigs)) == 1:
            diff_contigs = diff_contigs[0]
        
        #basepairs
        basepairs = global_pipeline_metrics_df['basepairs'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler)]
        diff_basepairs = np.diff(basepairs)
        diff_basepairs = (round(min(diff_basepairs),2), round(max(diff_basepairs),2))
        if len(set(diff_basepairs)) == 1:
            diff_basepairs = diff_basepairs[0]
            
        #max contig size
        max_contig = global_pipeline_metrics_df['max_contig'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler)]
        diff_max_contig = np.diff(max_contig)
        diff_max_contig = (round(min(diff_max_contig),2), round(max(diff_max_contig),2))
        if len(set(diff_max_contig)) == 1:
            diff_max_contig = diff_max_contig[0]
        
        #Ns
        N = global_pipeline_metrics_df['Ns'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler)]
        diff_N = np.diff(N)
        diff_N = (round(min(diff_N),2), round(max(diff_N),2))
        if len(set(diff_N)) == 1:
            diff_N = diff_N[0]
        
        #n50
        n50 = global_pipeline_metrics_df['n50'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler)]
        diff_n50 = np.diff(n50)
        diff_n50 = (round(min(diff_n50), 2), round(max(diff_n50),2))
        if len(set(diff_n50)) == 1:
            diff_n50 = diff_n50[0]
        
        #misassembly
        misassembly = global_pipeline_metrics_df['misassemblies'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler)]
        diff_misassembly = np.diff(misassembly)
        diff_misassembly = (round(min(diff_misassembly),2), round(max(diff_misassembly),2))
        if len(set(diff_misassembly)) == 1:
            diff_misassembly = diff_misassembly[0]
        
        #mapped_reads
        mapped_reads = global_pipeline_metrics_df['mapped_reads'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler)]
        diff_mapped_reads = np.diff(mapped_reads)
        diff_mapped_reads = (round(min(diff_mapped_reads),2), round(max(diff_mapped_reads),2))
        if len(set(diff_mapped_reads)) == 1:
            diff_mapped_reads = diff_mapped_reads[0]
        
        #combine
        df = df.append({ 'Assembler': assembler, 'Sample': sample, 'Contigs': diff_contigs, "Basepairs": diff_basepairs, 
                        "Max Contig Size": diff_max_contig, "N": diff_N, 'N50': diff_n50, 'Misassemblies': diff_misassembly, 
                        "Mapped Reads": diff_mapped_reads}, ignore_index=True)

df

Unnamed: 0,Sample,Assembler,Contigs,Basepairs,Max Contig Size,N,N50,Misassemblies,Mapped Reads
0,LNN,VelvetOptimizer,0,0,0,0,0,0,0
1,LNN,Unicycler,0,0,0,0,0,0,0
2,LNN,SPAdes,0,0,0,0,0,0,0
3,LNN,SKESA,0,0,0,0,0,0,0
4,LNN,MINIA,"(-17, 14)","(-1018, 874)",0,0,0,0,-0
5,LNN,metaSPAdes,0,0,0,0,0,0,0
6,LNN,MEGAHIT,0,0,0,0,0,0,-0
7,LNN,IDBA-UD,"(-2, 0)","(-410, 0)",0,0,0,0,-0
8,LNN,GATBMiniaPipeline,0,0,0,0,0,0,-0
9,LNN,BCALM2,0,0,0,0,0,0,"(-0.01, 0.03)"


In [21]:
df.to_csv('Global metrics.csv', index=False)

## Reference Metrics

In [22]:
global_pipeline_metrics_df = pd.DataFrame(columns=METRICS_COLUMNS_REFERENCE)

for pipeline_report_file in report_glob:
    report_file_name = pipeline_report_file.split('/')[-1]
    stats_run = pipeline_report_file.split('/')[-3]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))
    
    with open(pipeline_report_file) as _fh:
        json_report = json.load(_fh)
        for sample in json_report.keys():
            for reference, data in json_report[sample]['ReferenceTables'].items():
                for row in data:
                    for item in row:
                        global_pipeline_metrics_df = global_pipeline_metrics_df.append({'run': stats_run,
                                                                                        'sample': sample,
                                                                                        'assembler': item['assembler'],
                                                                                        'reference': reference,
                                                                                        'contiguity': item['contiguity'],
                                                                                        'breadth_of_coverage': item['breadth_of_coverage'],
                                                                                        'multiplicity': item['multiplicity'],
                                                                                        'validity': item['validity'],
                                                                                        'parsimony': item['parsimony'],
                                                                                        'identity': item['identity'],
                                                                                        'lowest_identity': item['lowest_identity'],
                                                                                        'L90': item['L90'],
                                                                                        'contigs': item['aligned_contigs'],
                                                                                        'NA50': item['NA50'],
                                                                                        'NG50': item['NG50'],
                                                                                        'basepairs': item['aligned_basepairs'],
                                                                                        'Ns': item['Ns'],
                                                                                        'misassemblies': item['misassembled_contigs']},
                                                                                       ignore_index=True)

display(global_pipeline_metrics_df)

Processing pipeline_report_tables.json data from run1...
Processing pipeline_report_tables.json data from run3...
Processing pipeline_report_tables.json data from run2...


Unnamed: 0,run,sample,assembler,reference,contiguity,breadth_of_coverage,multiplicity,validity,parsimony,identity,lowest_identity,L90,contigs,NA50,NG50,basepairs,Ns,misassemblies
0,run1,ERR2984773,BCALM2,Bacillus_subtilis,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0
1,run1,ERR2984773,GATBMiniaPipeline,Bacillus_subtilis,0.189942,0.984241,1.00009,0.999988,1.00011,0.999809,0.995327,15,30,263420,257358,3981922,0,0
2,run1,ERR2984773,IDBA-UD,Bacillus_subtilis,0.031121,0.983005,1.00329,0.99998,1.00331,0.999756,0.964608,120,184,34330,34330,3976919,0,0
3,run1,ERR2984773,MEGAHIT,Bacillus_subtilis,0.103094,0.986119,1.00045,0.988904,1.01168,0.978796,0.420899,14,31,294549,294549,3989518,0,2
4,run1,ERR2984773,metaSPAdes,Bacillus_subtilis,0.105960,0.982677,1.00071,0.999909,1.0008,0.9986,0.965982,23,50,164894,164894,3975594,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,run2,LHS,MINIA,Staphylococcus_aureus_plasmid3,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0
2156,run2,LHS,SKESA,Staphylococcus_aureus_plasmid3,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0
2157,run2,LHS,SPAdes,Staphylococcus_aureus_plasmid3,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0
2158,run2,LHS,Unicycler,Staphylococcus_aureus_plasmid3,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
df = pd.DataFrame(columns=['Sample', 'Assembler', 'Reference', "Contiguity", "Breadth of Coverage", "Multiplicity", 'Validity', 
                           'Parsimony', "Identity", "lowest Identity", "L90", "Contigs", "NA50", "NG50", "Basepairs",
                          "Ns", "Misassemblies"])

for sample in sorted(global_pipeline_metrics_df['sample'].unique(), key=lambda v: v.upper(), reverse=True):
    for assembler in sorted(global_pipeline_metrics_df['assembler'].unique(), key=lambda v: v.upper(), reverse=True):
        for reference in sorted(global_pipeline_metrics_df['reference'].unique(), key=lambda v: v.upper(), reverse=True):
        
            #contiguity
            contiguity = global_pipeline_metrics_df['contiguity'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_contiguity = np.diff(contiguity)
            diff_contiguity = (round(min(diff_contiguity),2), round(max(diff_contiguity),2))
            if len(set(diff_contiguity)) == 1:
                diff_contiguity = diff_contiguity[0]
            
            #boc
            boc = global_pipeline_metrics_df['breadth_of_coverage'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_boc = np.diff(boc)
            diff_boc = (round(min(diff_boc),2), round(max(diff_boc),2))
            if len(set(diff_boc)) == 1:
                diff_boc = diff_boc[0]
            
            #multiplicity
            multiplicity = global_pipeline_metrics_df['multiplicity'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_multiplicity = np.diff(multiplicity)
            diff_multiplicity = (round(min(diff_multiplicity),2), round(max(diff_multiplicity),2))
            if len(set(diff_multiplicity)) == 1:
                diff_multiplicity = diff_multiplicity[0]
            
            #validity
            validity = global_pipeline_metrics_df['validity'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_validity = np.diff(validity)
            diff_validity = (round(min(diff_validity),2), round(max(diff_validity),2))
            if len(set(diff_validity)) == 1:
                diff_validity = diff_validity[0]
            
            #parsimony
            parsimony = global_pipeline_metrics_df['parsimony'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_parsimony = np.diff(parsimony)
            diff_parsimony = (round(min(diff_parsimony),2), round(max(diff_parsimony),2))
            if len(set(diff_parsimony)) == 1:
                diff_parsimony = diff_parsimony[0]
            
            #identity
            identity = global_pipeline_metrics_df['identity'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_identity = np.diff(identity)
            diff_identity = (round(min(diff_identity),2), round(max(diff_identity),2))
            if len(set(diff_identity)) == 1:
                diff_identity = diff_identity[0]
            
            #lowest_identity
            lowest_identity = global_pipeline_metrics_df['lowest_identity'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_lowest_identity = np.diff(lowest_identity)
            diff_lowest_identity = (round(min(diff_lowest_identity),2), round(max(diff_lowest_identity),2))
            if len(set(diff_lowest_identity)) == 1:
                diff_lowest_identity = diff_lowest_identity[0]
            
            #L90
            L90 = global_pipeline_metrics_df['L90'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_L90 = np.diff(L90)
            diff_L90 = (round(min(diff_L90),2), round(max(diff_L90),2))
            if len(set(diff_L90)) == 1:
                diff_L90 = diff_L90[0]
            
            #contigs
            contigs = global_pipeline_metrics_df['contigs'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_contigs = np.diff(contigs)
            diff_contigs = (round(min(diff_contigs),2), round(max(diff_contigs),2))
            if len(set(diff_contigs)) == 1:
                diff_contigs = diff_contigs[0]
            
            #NA50
            NA50 = global_pipeline_metrics_df['NA50'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_NA50 = np.diff(NA50)
            diff_NA50 = (round(min(diff_NA50),2), round(max(diff_NA50),2))
            if len(set(diff_NA50)) == 1:
                diff_NA50 = diff_NA50[0]
            
            #NG50
            NG50 = global_pipeline_metrics_df['NG50'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_NG50 = np.diff(NG50)
            diff_NG50 = (round(min(diff_NG50),2), round(max(diff_NG50),2))
            if len(set(diff_NG50)) == 1:
                diff_NG50 = diff_NG50[0]
            
            #basepairs
            basepairs = global_pipeline_metrics_df['basepairs'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_basepairs = np.diff(basepairs)
            diff_basepairs = (round(min(diff_basepairs),2), round(max(diff_basepairs),2))
            if len(set(diff_basepairs)) == 1:
                diff_basepairs = diff_basepairs[0]
            
            #Ns
            Ns = global_pipeline_metrics_df['Ns'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_Ns = np.diff(Ns)
            diff_Ns = (round(min(diff_Ns),2), round(max(diff_Ns),2))
            if len(set(diff_Ns)) == 1:
                diff_Ns = diff_Ns[0]
            
            #Misassemblies
            Misassemblies = global_pipeline_metrics_df['misassemblies'][(global_pipeline_metrics_df['sample'] == sample) & (global_pipeline_metrics_df['assembler']==assembler) & (global_pipeline_metrics_df['reference'] == reference)]
            diff_Misassemblies = np.diff(Misassemblies)
            diff_Misassemblies = (round(min(diff_Misassemblies),2), round(max(diff_Misassemblies),2))
            if len(set(diff_Misassemblies)) == 1:
                diff_Misassemblies = diff_Misassemblies[0]
        
            #combine,
            df = df.append({ 'Assembler': assembler, 'Sample': sample, 'Reference': reference, 'Contiguity': diff_contiguity, 
                            "Breadth of Coverage": diff_boc, "Multiplicity": diff_multiplicity, 'Validity': diff_validity,
                            'Parsimony': diff_parsimony, "Identity": diff_identity, "lowest Identity": diff_lowest_identity,
                            "L90": diff_L90, "Contigs": diff_contigs, "NA50": diff_NA50, "NG50": diff_NG50,  "Basepairs": diff_basepairs,
                            "Ns": diff_Ns, "Misassemblies": diff_Misassemblies}, ignore_index=True)

df

Unnamed: 0,Sample,Assembler,Reference,Contiguity,Breadth of Coverage,Multiplicity,Validity,Parsimony,Identity,lowest Identity,L90,Contigs,NA50,NG50,Basepairs,Ns,Misassemblies
0,LNN,VelvetOptimizer,Staphylococcus_aureus_plasmid3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,LNN,VelvetOptimizer,Staphylococcus_aureus_plasmid2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,LNN,VelvetOptimizer,Staphylococcus_aureus_plasmid1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,LNN,VelvetOptimizer,Staphylococcus_aureus,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,LNN,VelvetOptimizer,Salmonella_enterica,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,EHS,BCALM2,Lactobacillus_fermentum,0,0,0,0,0,0,0,0,0,0,0,0,0,0
716,EHS,BCALM2,Escherichia_coli_plasmid,0,0,0,0,0,0,0,0,0,0,0,0,0,0
717,EHS,BCALM2,Escherichia_coli,0,0,0,0,0,0,0,0,0,0,0,0,0,0
718,EHS,BCALM2,Enterococcus_faecalis,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
df.to_csv('Reference metrics.csv', index=False)