# 3 Compute Results Overview Tables - By category (algorithm)

The success of an assembly is evaluated by computing metrics in two ways: globally, through statistics inherent to the complete set of assembled sequences, and relative to the replicons present in the sample.

The following metrics are computed for the complete and filtered set of assembled sequences, restricted to contigs of length above a specified minimum size: 

- **Contig sizes**
    - **Contigs:** The total number of contigs in the assembly;
    - **Basepairs:** The total number of bases in the assembly;
    - **Maximum sequence length:** The length of the largest contig in the assembly;
    - **Number of ‘N’s:** The number of uncalled bases (N's) in the assembly.
- **Contiguity**
    - **Nx (where 0 < x ⩽ 100):** The length for which the collection of all assembled sequences of that length or longer covers at least x% of the total length of the assembly.
- **Misassembly**
    - **Misassemblies:** The number of aligned contigs that contain a misassembly event.


## Imports

In [1]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

## Global variables

In [2]:
# Pipeline process names of the assemblers, in upper case.
ASSEMBLER_PROCESS_LIST = ["ABYSS", "BCALM2", "GATBMINIAPIPELINE", "METAHIPMER2", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMISER", "IDBA"]

# Pipeline process name -> assembler name, spelled the same way as in the
# category lists below.
PROCESS_TO_NAME = {"ABYSS": "ABySS",
                   "BCALM2": "BCALM2",
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "METAHIPMER2": "MetaHipMer2",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT",
                   "METASPADES": "metaSPAdes",
                   "UNICYCLER": "Unicycler",
                   "SPADES": "SPAdes",
                   "SKESA": "SKESA",
                   "VELVETOPTIMISER": "VelvetOptimiser",
                   "IDBA": "IDBA-UD"}

# Sample identifiers grouped by distribution. The two groups are disjoint:
# a sample is either log-distributed or evenly distributed.
log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EMS', 'ENN', 'ERR2984773']

# NOTE(review): not referenced by the cells shown here - presumably assemblers
# to leave out of some downstream step; confirm against later cells.
skipped_assemblers = ['ABySS', 'BCALM2', 'MINIA', 'VelvetOptimiser', 'MetaHipMer2']

# Assembler categories, by target (genomic vs. metagenomic) and by algorithm
# (single vs. multiple k-mer De Bruijn graph).
genomic_assemblers = ['ABySS', 'BCALM2', 'MINIA', 'SKESA', 'SPAdes', 'Unicycler', 'VelvetOptimiser']
metagenomic_assemblers = ['MetaHipMer2', 'GATBMiniaPipeline', 'IDBA-UD', 'MEGAHIT', 'metaSPAdes']
single_kmer = ['BCALM2', 'MINIA', 'ABySS']
multiple_kmer = ['SKESA', 'SPAdes', 'Unicycler', 'VelvetOptimiser', 'GATBMiniaPipeline',
                 'IDBA-UD', 'MEGAHIT', 'metaSPAdes', 'MetaHipMer2']

# Metric columns split by optimisation direction - presumably lower is better
# for best_min and higher is better for best_max; confirm against later cells.
best_min = ['Ns', 'contigs', 'filtered_Ns', 'filtered_contigs', 'misassembled contigs', 'misassembly events']
best_max = ['basepairs', 'filtered_basepairs', 'filtered_mapped_reads', 'filtered_n50', 'mapped_reads', 'max_contig', 'n50']

# Colour palette for plots.
COLOURS = ['#5876c8', '#58AEC8', '#009392', '#39B185', '#9CCB86', '#E9E29C', '#EEB479', '#E88471', '#CF597E', '#a54765', '#a42a2a', '#835221', 'darkgray']

## Global metrics

### Load Data

In [3]:
# Gather the 'GlobalTable' entries of every pipeline report into a single
# DataFrame with one row per (run, sample, assembler) entry.
report_glob = glob.glob('../Results/*/*/report/pipeline_report_tables.json')
if not report_glob:
    # Fail with a clear message instead of a KeyError on the empty DataFrame below.
    raise FileNotFoundError(
        "No pipeline report found matching '../Results/*/*/report/pipeline_report_tables.json'")

# Rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas >= 2.0.
global_metric_rows = []

for pipeline_report_file in report_glob:
    report_file_name = pipeline_report_file.split('/')[-1]
    # The run name is the directory two levels above the report file.
    stats_run = pipeline_report_file.split('/')[-3]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))

    with open(pipeline_report_file) as _fh:
        json_report = json.load(_fh)

    for sample, sample_report in json_report.items():
        for line in sample_report['GlobalTable']:
            original = line['original']
            filtered = line['filtered']
            global_metric_rows.append({'run': stats_run,
                                       'sample': sample,
                                       'assembler': line['assembler'],
                                       'contigs': int(original['contigs']),
                                       'basepairs': int(original['basepairs']),
                                       'max_contig': int(original['max_contig_size']),
                                       'n50': int(original['N50']),
                                       'mapped_reads': original['mapped_reads'],
                                       'Ns': int(original['Ns']),
                                       'misassembled contigs': filtered['misassembled_contigs'],
                                       'misassembly events': filtered['misassembly_events'],
                                       'filtered_contigs': filtered['contigs'],
                                       'filtered_basepairs': filtered['basepairs'],
                                       'filtered_n50': filtered['N50'],
                                       'filtered_Ns': filtered['Ns'],
                                       'filtered_mapped_reads': filtered['mapped_reads'],
                                       })

# Keep the alphabetical column order that DataFrame.append used to produce, so
# that tables derived from this frame keep their column layout.
global_pipeline_metrics_df = pd.DataFrame(global_metric_rows).sort_index(axis=1)

# Categorical annotations derived from the sample and assembler names.
global_pipeline_metrics_df['distribution'] = np.where(global_pipeline_metrics_df['sample'].isin(log_distributed), 'Log', 'Even')
global_pipeline_metrics_df['type'] = np.where(global_pipeline_metrics_df['assembler'].isin(genomic_assemblers), 'Genomic', 'Metagenomic')
global_pipeline_metrics_df['algorythm'] = np.where(global_pipeline_metrics_df['assembler'].isin(single_kmer), 'Single k-mer De Bruijn graph', 'Multiple k-mer De Bruijn graph')

# Make sure these metric columns are numeric.
numeric_columns = ['contigs', 'basepairs', 'max_contig', 'Ns', 'n50', 'filtered_n50',
                   'misassembled contigs', 'misassembly events']
global_pipeline_metrics_df[numeric_columns] = global_pipeline_metrics_df[numeric_columns].apply(pd.to_numeric)

Processing pipeline_report_tables.json data from run3...
Processing pipeline_report_tables.json data from run2...
Processing pipeline_report_tables.json data from run1...
Processing pipeline_report_tables.json data from run3...
Processing pipeline_report_tables.json data from run2...


### Global statistics per assembler algorithm

In [4]:
# List the assemblers that were assigned to each De Bruijn graph algorithm class.
for algorithm_label in ("Single k-mer De Bruijn graph", "Multiple k-mer De Bruijn graph"):
    in_class = global_pipeline_metrics_df['algorythm'] == algorithm_label
    print(global_pipeline_metrics_df.loc[in_class, 'assembler'].unique())

['ABySS' 'BCALM2' 'MINIA']
['GATBMiniaPipeline' 'IDBA-UD' 'MEGAHIT' 'MetaHipMer2' 'metaSPAdes'
 'SKESA' 'SPAdes' 'Unicycler' 'VelvetOptimiser']


In [5]:
# For every sample, summarise each numeric metric per assembler algorithm as
# "median [min;max]", display the table and save it as a CSV file.
for sample in global_pipeline_metrics_df['sample'].unique():
    print(sample)
    # Rows are collected in a list and converted once: DataFrame.append no
    # longer exists in pandas >= 2.0.
    summary_rows = []
    for assembly_type in global_pipeline_metrics_df.algorythm.unique():
        in_group = ((global_pipeline_metrics_df['sample'] == sample)
                    & (global_pipeline_metrics_df.algorythm == assembly_type))
        group_metrics = global_pipeline_metrics_df[in_group]
        if group_metrics.empty:
            # No assembly of this algorithm type for this sample: the
            # statistics would be NaN and int() below would fail.
            continue

        # describe() supplies min/max and determines which columns get summarised.
        group_summary = group_metrics.describe()
        row = {'Assembler Type': assembly_type}
        for column in group_summary.columns:
            median = group_metrics[column].median()
            minimum = group_summary.loc['min', column]
            maximum = group_summary.loc['max', column]
            # Values are truncated to integers for display.
            row[column] = "{} [{};{}]".format(int(median), int(minimum), int(maximum))
        summary_rows.append(row)

    # Alphabetical metric columns, matching the layout DataFrame.append produced.
    stats_per_assembler_type = (pd.DataFrame(summary_rows)
                                .set_index('Assembler Type')
                                .sort_index(axis=1))
    display(stats_per_assembler_type)
    stats_per_assembler_type.to_csv("Tables/Results/Global metrics per assembler algorithm - {}.csv".format(sample))

LNN


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Single k-mer De Bruijn graph,0 [0;1347],19635512 [16401028;20132631],44619 [15000;44825],0 [0;400],13501500 [13397815;13573790],784 [762;1212],97 [97;98],33550 [31436;192306],99 [98;99],177038 [163543;851314],0 [0;2],0 [0;4],18682 [9569;26474]
Multiple k-mer De Bruijn graph,0 [0;198470],14324920 [0;19638608],1987 [0;69092],0 [0;27980],13642314 [0;13704539],209 [0;3016],98 [0;99],140561 [0;242687],99 [0;99],632639 [0;1080013],1 [0;12],2 [0;25],122703 [0;242687]


ERR2935805


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Single k-mer De Bruijn graph,0 [0;9],78727717 [20757924;315567452],677853 [37281;8035706],0 [0;0],3150980 [657074;13448666],504 [356;1466],79 [0;84],18288 [1262;18568],87 [1;91],88665 [3093;163543],0 [0;2],0 [0;3],101 [35;7485]
Multiple k-mer De Bruijn graph,0 [0;0],23587252 [8818742;47958628],14156 [205;525998],0 [0;0],13912407 [409222;19595813],2889 [144;5038],87 [0;89],87983 [1180;256884],88 [1;89],622107 [2044;847683],1 [0;572],2 [0;1148],3674 [88;256884]


LHS


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Single k-mer De Bruijn graph,0 [0;0],15335108 [0;364761201],24664 [0;8175890],0 [0;0],11967 [0;12922301],11 [0;2172],0 [0;81],1062 [0;10582],6 [0;91],1212 [0;163543],0 [0;0],0 [0;0],49 [0;7727]
Multiple k-mer De Bruijn graph,0 [0;0],13791670 [0;22242266],988 [0;240521],0 [0;0],13315624 [0;13672885],327 [0;3237],96 [0;97],108024 [0;351027],96 [0;97],632213 [0;917728],1 [0;5],2 [0;10],106347 [0;351027]


EMS


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Single k-mer De Bruijn graph,0 [0;883],34563847 [30648944;39571553],32430 [16748;252182],0 [0;612],28218370 [16407295;30988200],3574 [497;6250],92 [27;98],20782 [3056;138706],95 [65;98],161812 [38021;841100],0 [0;2],0 [0;3],17967 [645;124246]
Multiple k-mer De Bruijn graph,0 [0;4383],30482042 [30249132;30664883],979 [656;11069],0 [0;4383],30262824 [28467054;30418717],657 [334;3035],97 [92;98],122545 [33291;245050],98 [95;99],768640 [468323;1303875],1 [0;13],3 [0;22],121957 [29176;245050]


ERR2984773


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Single k-mer De Bruijn graph,0 [0;149],91697404 [0;170402348],497783 [0;3508016],0 [0;7],28276538 [0;30657287],1196 [0;3983],74 [0;80],15817 [0;52869],80 [0;84],117945 [0;298093],0 [0;2],0 [0;6],139 [0;7106]
Multiple k-mer De Bruijn graph,0 [0;733],39778575 [13174826;57130377],20189 [895;652127],0 [0;0],30564352 [13883;32500826],1309 [13;2654],81 [0;82],95101 [1039;191424],83 [1;84],473277 [1157;1205839],2 [0;15],3 [0;32],53292 [85;191424]


ENN


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Single k-mer De Bruijn graph,0 [0;3884],31047581 [30641333;31513997],16570 [4769;25377],0 [0;2328],28205937 [28174748;30746773],3575 [1822;3589],92 [92;98],20700 [20628;89282],97 [96;99],161812 [161812;952618],0 [0;8],0 [0;14],17820 [17395;86573]
Multiple k-mer De Bruijn graph,0 [0;8788],30440762 [30110758;30545419],1100 [640;10924],0 [0;8283],30264931 [28485306;30420265],719 [359;2994],97 [93;99],102049 [34664;238611],99 [97;99],632274 [468359;1036942],3 [0;18],6 [0;36],99990 [30453;238611]
