### 3 Compute Results Overview Tables - By category

The success of an assembly is evaluated by the computation of metrics in two defined ways: globally through statistics inherent to the complete set of sequences that were assembled, and relative to the replicons present in the sample. 

The following metrics are computed for the complete and filtered set of assembled sequences, restricted to contigs of length above a specified minimum size: 

- **Contig sizes**
    - **Contigs:** The total number of contigs in the assembly;
    - **Basepairs:** The total number of bases in the assembly;
    - **Maximum sequence length:** The length of the largest contig in the assembly;
    - **Number of ‘N’s:** The number of uncalled bases (N's) in the assembly.
- **Contiguity**
    - **Nx (where 0 < x ⩽ 100):** The length for which the collection of all assembled sequences of that length or longer covers at least x% of the total length of the assembly.
- **Misassembly**
    - **Misassemblies:** The number of aligned contigs that contain a misassembly event.


## Imports

In [20]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

## Global variables

In [21]:
# Pipeline process names of every assembler.
ASSEMBLER_PROCESS_LIST = ["ABYSS", "BCALM2", "GATBMINIAPIPELINE", "METAHIPMER2", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMISER", "IDBA"]

# Pipeline process name -> assembler display name. The display names are the
# ones used by every assembler list below ('ABySS' was spelled 'AbYSS' here).
PROCESS_TO_NAME = {"ABYSS": "ABySS",
                   "BCALM2": "BCALM2",
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "METAHIPMER2": "MetaHipMer2",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT",
                   "METASPADES": "metaSPAdes",
                   "UNICYCLER": "Unicycler",
                   "SPADES": "SPAdes",
                   "SKESA": "SKESA",
                   "VELVETOPTIMISER": "VelvetOptimiser",
                   "IDBA": "IDBA-UD"}

# Sample identifiers grouped by distribution. 'ENN' replaces a duplicated
# 'LNN' in the even group: a sample cannot belong to both groups.
log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EMS', 'ENN', 'ERR2984773']

# Assemblers whose results are left out when the reports are loaded.
skipped_assemblers = ['ABySS', 'BCALM2', 'MINIA', 'VelvetOptimiser', 'MetaHipMer2']

# Assembler display names grouped by assembler type and by k-mer strategy.
genomic_assemblers = ['ABySS', 'BCALM2', 'MINIA', 'SKESA', 'SPAdes', 'Unicycler', 'VelvetOptimiser']
metagenomic_assemblers = ['MetaHipMer2', 'GATBMiniaPipeline', 'IDBA-UD', 'MEGAHIT', 'metaSPAdes']
single_kmer = ['BCALM2', 'MINIA', 'ABySS']
multiple_kmer = ['SKESA', 'SPAdes', 'Unicycler', 'VelvetOptimiser', 'GATBMiniaPipeline',
                 'IDBA-UD', 'MEGAHIT', 'metaSPAdes', 'MetaHipMer2']

# Metrics where the lowest value is the best one, and where the highest is.
best_min = ['Ns', 'contigs', 'filtered_Ns', 'filtered_contigs', 'misassembled contigs', 'misassembly events']
best_max = ['basepairs', 'filtered_basepairs', 'filtered_mapped_reads', 'filtered_n50', 'mapped_reads', 'max_contig', 'n50']

# Default plot colour palette.
COLOURS = ['#5876c8', '#58AEC8', '#009392', '#39B185', '#9CCB86', '#E9E29C', '#EEB479', '#E88471', '#CF597E', '#a54765', '#a42a2a', '#835221', 'darkgray']

## Global metrics

### Load Data

In [22]:
# Gather the global assembly metrics of every pipeline report into a single
# DataFrame with one row per run / sample / assembler.
report_glob = glob.glob('../Results/*/*/report/pipeline_report_tables.json')

# Fields stored for every assembler entry of a report.
metric_fields = ['run', 'sample', 'assembler',
                 'contigs', 'basepairs', 'max_contig', 'n50', 'mapped_reads', 'Ns',
                 'misassembled contigs', 'misassembly events',
                 'filtered_contigs', 'filtered_basepairs', 'filtered_n50', 'filtered_Ns',
                 'filtered_mapped_reads']

# Rows are collected in a list and converted once: DataFrame.append copied the
# whole frame on every call and no longer exists in pandas 2.0.
metric_rows = []

for pipeline_report_file in report_glob:
    path_parts = pipeline_report_file.split('/')
    report_file_name = path_parts[-1]
    # The run name is the directory two levels above the report file.
    stats_run = path_parts[-3]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))

    with open(pipeline_report_file) as _fh:
        json_report = json.load(_fh)

    for sample, sample_report in json_report.items():
        for line in sample_report['GlobalTable']:
            assembler = line['assembler']
            if assembler in skipped_assemblers:
                continue
            original = line['original']
            filtered = line['filtered']
            metric_rows.append({'run': stats_run,
                                'sample': sample,
                                'assembler': assembler,
                                'contigs': int(original['contigs']),
                                'basepairs': int(original['basepairs']),
                                'max_contig': int(original['max_contig_size']),
                                'n50': int(original['N50']),
                                'mapped_reads': original['mapped_reads'],
                                'Ns': int(original['Ns']),
                                'misassembled contigs': filtered['misassembled_contigs'],
                                'misassembly events': filtered['misassembly_events'],
                                'filtered_contigs': filtered['contigs'],
                                'filtered_basepairs': filtered['basepairs'],
                                'filtered_n50': filtered['N50'],
                                'filtered_Ns': filtered['Ns'],
                                'filtered_mapped_reads': filtered['mapped_reads'],
                                })

# Passing the column names explicitly keeps the frame usable when no report
# was found. Columns are put in alphabetical order, which is the layout of the
# result tables shown in this notebook.
global_pipeline_metrics_df = pd.DataFrame(metric_rows, columns=sorted(metric_fields))

# Derived grouping columns: any sample / assembler that is not in the tested
# list falls in the second category.
global_pipeline_metrics_df['distribution'] = np.where(
    global_pipeline_metrics_df['sample'].isin(log_distributed), 'Log', 'Even')
global_pipeline_metrics_df['type'] = np.where(
    global_pipeline_metrics_df['assembler'].isin(genomic_assemblers), 'Genomic', 'Metagenomic')
# NOTE: the misspelled column name 'algorythm' is kept because later cells read it.
global_pipeline_metrics_df['algorythm'] = np.where(
    global_pipeline_metrics_df['assembler'].isin(single_kmer),
    'Single k-mer De Bruijn graph', 'Multiple k-mer De Bruijn graph')

numeric_columns = ['contigs', 'basepairs', 'max_contig', 'Ns', 'n50', 'filtered_n50',
                   'misassembled contigs', 'misassembly events']
global_pipeline_metrics_df[numeric_columns] = global_pipeline_metrics_df[numeric_columns].apply(pd.to_numeric)

Processing pipeline_report_tables.json data from run3...
Processing pipeline_report_tables.json data from run2...
Processing pipeline_report_tables.json data from run1...
Processing pipeline_report_tables.json data from run3...
Processing pipeline_report_tables.json data from run2...


### Global statistics per assembler type

In [23]:
# Show which assemblers ended up in each assembler-type group.
for assembler_type in ("Metagenomic", "Genomic"):
    belongs_to_type = global_pipeline_metrics_df['type'] == assembler_type
    print(global_pipeline_metrics_df.loc[belongs_to_type, 'assembler'].unique())

['GATBMiniaPipeline' 'IDBA-UD' 'MEGAHIT' 'metaSPAdes']
['SKESA' 'SPAdes' 'Unicycler']


#### Descriptive statistics

In [24]:
# For every sample, summarise each metric per assembler type as
# "mean [min;max]", restricted to the multiple k-mer assemblers, then show the
# table and save it as CSV.
for sample in global_pipeline_metrics_df['sample'].unique():
    print(sample)

    # Rows are collected in a list and converted once: DataFrame.append no
    # longer exists in pandas 2.0.
    summary_rows = []
    for assembly_type in global_pipeline_metrics_df['type'].unique():
        selection = ((global_pipeline_metrics_df['sample'] == sample)
                     & (global_pipeline_metrics_df['type'] == assembly_type)
                     & (global_pipeline_metrics_df['algorythm'] == "Multiple k-mer De Bruijn graph"))
        # describe() only reports on the numeric columns of the selection.
        summary = global_pipeline_metrics_df[selection].describe()

        row = {'Assembler Type': assembly_type}
        for column in summary.columns:
            mean = summary.loc['mean', column]
            minimum = summary.loc['min', column]
            maximum = summary.loc['max', column]
            # The bounds are truncated to integers, the mean keeps two decimals.
            row[column] = "{} [{};{}]".format(round(mean, 2), int(minimum), int(maximum))
        summary_rows.append(row)

    stats_per_assembler_type = pd.DataFrame(summary_rows).set_index('Assembler Type')
    display(stats_per_assembler_type)
    stats_per_assembler_type.to_csv("Tables/Results/Global metrics per assembler type multiple kmer - {}.csv".format(sample))

LNN


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Metagenomic,0.0 [0;0],14538306.17 [13910838;15514698],2805.5 [693;6700],0.0 [0;0],13674441.5 [13642314;13704539],197.75 [139;261],99.26 [98;99],174695.5 [108043;239185],99.74 [99;99],786984.25 [585610;1080013],1.25 [1;2],2.5 [2;3],166676.5 [105808;220609]
Genomic,0.0 [0;0],11842809.33 [8202941;15606257],5676.67 [185;9514],0.0 [0;0],9933831.0 [4470092;13640340],590.67 [112;1433],96.65 [91;99],128918.67 [7385;242687],98.01 [95;99],513148.33 [59272;847534],4.0 [0;12],7.67 [0;23],122181.67 [1155;242687]


ERR2935805


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Metagenomic,0.0 [0;0],23261978.08 [17126362;27822010],15346.5 [6776;26282],0.0 [0;0],17196398.92 [13891328;18792639],2441.0 [603;3392],89.25 [88;89],125805.5 [87983;210993],89.72 [89;89],641431.75 [542997;768410],1.92 [0;3],3.83 [0;7],48201.58 [3674;82249]
Genomic,0.0 [0;0],16027898.0 [8818742;29517365],15080.33 [205;36214],0.0 [0;0],11707517.67 [5801439;19595813],2677.33 [144;4999],69.37 [57;87],104832.33 [2148;256884],84.65 [78;88],574806.0 [29155;847683],190.67 [0;572],382.67 [0;1148],86762.33 [1431;256884]


LHS


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Metagenomic,0.0 [0;0],13857760.08 [13638386;14157309],1219.83 [767;2099],0.0 [0;0],13613428.08 [13507672;13672885],460.25 [184;762],97.02 [96;97],141878.25 [91722;196786],97.23 [97;97],703601.5 [632213;917728],1.25 [0;2],3.5 [0;6],139935.5 [90987;193256]
Genomic,0.0 [0;0],10831933.33 [2957060;16214591],10446.67 [36;31040],0.0 [0;0],9784598.0 [2950393;13315624],596.67 [21;1526],88.5 [77;96],204425.33 [69361;351027],94.9 [92;96],581076.67 [263645;847572],2.0 [0;5],4.0 [0;10],196797.67 [46478;351027]


EMS


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Metagenomic,0.0 [0;0],30499869.5 [30408932;30599129],1128.0 [761;1829],0.0 [0;0],30327620.0 [30145929;30418717],698.5 [365;1254],98.09 [97;98],141558.0 [63178;208969],98.85 [98;99],730818.5 [468323;917726],6.25 [1;13],11.75 [1;22],141059.42 [61704;208969]
Genomic,0.0 [0;0],30405294.0 [30249132;30664883],1517.67 [656;2628],0.0 [0;0],30202846.67 [30064811;30354786],545.67 [334;891],97.71 [97;98],176059.67 [74138;245050],98.37 [97;98],928688.0 [632013;1303875],3.0 [0;9],6.0 [0;18],175134.67 [73839;245050]


ERR2984773


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Metagenomic,0.0 [0;0],40679477.25 [32809112;48964620],26959.67 [5397;61644],0.0 [0;0],31529139.67 [30564302;32021766],1637.75 [761;2654],82.1 [81;82],96809.5 [29134;149841],83.84 [83;84],687726.75 [303329;1205839],5.5 [1;15],10.92 [2;32],63437.0 [20993;102115]
Genomic,0.0 [0;0],35852992.33 [30430077;46535940],14022.0 [895;39819],0.0 [0;0],31048226.67 [30211740;32500826],1106.67 [425;2049],81.09 [80;81],122864.33 [77870;191424],82.59 [81;84],691238.33 [446163;882182],3.67 [0;9],7.33 [0;18],107213.33 [53292;191424]


ENN


Unnamed: 0_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Metagenomic,0.0 [0;0],30457631.25 [30390566;30545419],1094.0 [640;1822],0.0 [0;0],30301796.0 [30130413;30420265],744.25 [359;1279],98.67 [97;99],143644.25 [65108;238611],99.48 [99;99],721760.25 [585610;1036942],7.83 [2;18],16.17 [6;36],142765.75 [63725;238611]
Genomic,0.0 [0;0],30272311.33 [30110758;30442718],1466.0 [936;2322],0.0 [0;0],30068140.0 [29835112;30276227],871.0 [443;1726],97.97 [97;98],152200.67 [43168;208953],98.94 [98;99],776018.67 [630384;850138],4.33 [0;10],8.33 [0;19],150353.0 [42115;204481]


#### Rank

In [25]:
# For every sample and assembler type, pick the best and the worst value of
# each metric. The per-sample tables are displayed, saved and kept in
# `best_stats_dfs` as [sample, DataFrame] pairs; `all_data` stacks all of them.
best_stats_dfs = []

# Rows are collected in lists and converted once: DataFrame.append no longer
# exists in pandas 2.0.
all_rows = []

# Reference value the basepair metrics are compared against
# (presumably the expected total assembly size - TODO confirm).
target = 30946587


def _distance_to_target(value):
    """Return the absolute difference between *value* and the basepair target."""
    return abs(value - target)


for sample in global_pipeline_metrics_df['sample'].unique():
    print(sample)
    sample_rows = []

    for assembly_type in global_pipeline_metrics_df['type'].unique():
        # The selection is computed once and reused for every metric.
        selection = global_pipeline_metrics_df[(global_pipeline_metrics_df['type'] == assembly_type)
                                               & (global_pipeline_metrics_df['sample'] == sample)]
        describe_df = selection.describe()

        best_row = {'Assembler Type': assembly_type, "Type": "Best"}
        worst_row = {'Assembler Type': assembly_type, "Type": "Worst"}

        for column in describe_df.columns:
            minimum = describe_df.loc['min', column]
            maximum = describe_df.loc['max', column]

            if "basepairs" in column:
                # Basepair metrics are best when closest to the target,
                # not when smallest or largest.
                best_row[column] = min(selection[column], key=_distance_to_target)
                worst_row[column] = max(selection[column], key=_distance_to_target)
            elif column in best_min:
                best_row[column] = minimum
                worst_row[column] = maximum
            else:
                best_row[column] = maximum
                worst_row[column] = minimum

        # Copies are stored so that adding 'Sample' for the combined table
        # does not leak into the per-sample table.
        sample_rows.extend([dict(best_row), dict(worst_row)])
        all_rows.extend([dict(best_row, Sample=sample), dict(worst_row, Sample=sample)])

    best_stats_per_assembler = pd.DataFrame(sample_rows).set_index(['Assembler Type', 'Type'])
    display(best_stats_per_assembler)
    best_stats_per_assembler.to_csv("Tables/Results/Best Global metrics per assembler type multiple kmer - {}.csv".format(sample))
    best_stats_dfs.append([sample, best_stats_per_assembler])

all_data = pd.DataFrame(all_rows).set_index(['Sample', 'Assembler Type', 'Type'])
display(all_data)
# NOTE(review): unlike the per-sample tables this file has no ".csv" extension
# and is written directly under Tables/ - confirm this is intended.
all_data.to_csv("Tables/Best Global metrics per assembler type multiple kmer")

LNN


Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,0.0,15514698.0,693.0,0.0,13704539.0,139.0,99.686868,239185.0,99.798003,1080013.0,1.0,2.0,220609.0
Metagenomic,Worst,0.0,13910838.0,6700.0,0.0,13642314.0,261.0,98.807171,108043.0,99.694877,585610.0,2.0,3.0,105808.0
Genomic,Best,0.0,15606257.0,185.0,0.0,13640340.0,112.0,99.591855,242687.0,99.713909,847534.0,0.0,0.0,242687.0
Genomic,Worst,0.0,8202941.0,9514.0,0.0,4470092.0,1433.0,91.766425,7385.0,95.10133,59272.0,12.0,23.0,1155.0


ERR2935805


Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,0.0,27822010.0,6776.0,0.0,18792639.0,603.0,89.502391,210993.0,89.926254,768410.0,0.0,0.0,82249.0
Metagenomic,Worst,0.0,17126362.0,26282.0,0.0,13891328.0,3392.0,88.896022,87983.0,89.404535,542997.0,3.0,7.0,3674.0
Genomic,Best,0.0,29517365.0,205.0,0.0,19595813.0,144.0,87.475243,256884.0,88.056194,847683.0,0.0,0.0,256884.0
Genomic,Worst,0.0,8818742.0,36214.0,0.0,5801439.0,4999.0,57.929408,2148.0,78.662557,29155.0,572.0,1148.0,1431.0


LHS


Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,0.0,14157309.0,767.0,0.0,13672885.0,184.0,97.182958,196786.0,97.286491,917728.0,0.0,0.0,193256.0
Metagenomic,Worst,0.0,13638386.0,2099.0,0.0,13507672.0,762.0,96.805106,91722.0,97.170882,632213.0,2.0,6.0,90987.0
Genomic,Best,0.0,16214591.0,36.0,0.0,13315624.0,21.0,96.406056,351027.0,96.974535,847572.0,0.0,0.0,351027.0
Genomic,Worst,0.0,2957060.0,31040.0,0.0,2950393.0,1526.0,77.081486,69361.0,92.228441,263645.0,5.0,10.0,46478.0


EMS


Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,0.0,30599129.0,761.0,0.0,30418717.0,365.0,98.647341,208969.0,99.053249,917726.0,1.0,1.0,208969.0
Metagenomic,Worst,0.0,30408932.0,1829.0,0.0,30145929.0,1254.0,97.298997,63178.0,98.642214,468323.0,13.0,22.0,61704.0
Genomic,Best,0.0,30664883.0,656.0,0.0,30354786.0,334.0,98.283354,245050.0,98.700052,1303875.0,0.0,0.0,245050.0
Genomic,Worst,0.0,30249132.0,2628.0,0.0,30064811.0,891.0,97.237351,74138.0,97.952533,632013.0,9.0,18.0,73839.0


ERR2984773


Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,0.0,32809112.0,5397.0,0.0,30564396.0,761.0,82.834929,149841.0,84.326108,1205839.0,1.0,2.0,102115.0
Metagenomic,Worst,0.0,48964620.0,61644.0,0.0,32021766.0,2654.0,81.309188,29134.0,83.144345,303329.0,15.0,32.0,20993.0
Genomic,Best,0.0,30592960.0,895.0,0.0,30432114.0,425.0,81.598094,191424.0,84.204433,882182.0,0.0,0.0,191424.0
Genomic,Worst,0.0,46535940.0,39819.0,0.0,32500826.0,2049.0,80.508554,77870.0,81.254167,446163.0,9.0,18.0,53292.0


ENN


Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,0.0,30545419.0,640.0,0.0,30420265.0,359.0,99.38811,238611.0,99.693845,1036942.0,2.0,6.0,238611.0
Metagenomic,Worst,0.0,30390566.0,1822.0,0.0,30130413.0,1279.0,97.922866,65108.0,99.340277,585610.0,18.0,36.0,63725.0
Genomic,Best,0.0,30442718.0,936.0,0.0,30276227.0,443.0,98.561253,208953.0,99.41944,850138.0,0.0,0.0,204481.0
Genomic,Worst,0.0,30110758.0,2322.0,0.0,29835112.0,1726.0,97.424467,43168.0,98.313391,630384.0,10.0,19.0,42115.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Sample,Assembler Type,Type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
LNN,Metagenomic,Best,0.0,15514698.0,693.0,0.0,13704539.0,139.0,99.686868,239185.0,99.798003,1080013.0,1.0,2.0,220609.0
LNN,Metagenomic,Worst,0.0,13910838.0,6700.0,0.0,13642314.0,261.0,98.807171,108043.0,99.694877,585610.0,2.0,3.0,105808.0
LNN,Genomic,Best,0.0,15606257.0,185.0,0.0,13640340.0,112.0,99.591855,242687.0,99.713909,847534.0,0.0,0.0,242687.0
LNN,Genomic,Worst,0.0,8202941.0,9514.0,0.0,4470092.0,1433.0,91.766425,7385.0,95.10133,59272.0,12.0,23.0,1155.0
ERR2935805,Metagenomic,Best,0.0,27822010.0,6776.0,0.0,18792639.0,603.0,89.502391,210993.0,89.926254,768410.0,0.0,0.0,82249.0
ERR2935805,Metagenomic,Worst,0.0,17126362.0,26282.0,0.0,13891328.0,3392.0,88.896022,87983.0,89.404535,542997.0,3.0,7.0,3674.0
ERR2935805,Genomic,Best,0.0,29517365.0,205.0,0.0,19595813.0,144.0,87.475243,256884.0,88.056194,847683.0,0.0,0.0,256884.0
ERR2935805,Genomic,Worst,0.0,8818742.0,36214.0,0.0,5801439.0,4999.0,57.929408,2148.0,78.662557,29155.0,572.0,1148.0,1431.0
LHS,Metagenomic,Best,0.0,14157309.0,767.0,0.0,13672885.0,184.0,97.182958,196786.0,97.286491,917728.0,0.0,0.0,193256.0
LHS,Metagenomic,Worst,0.0,13638386.0,2099.0,0.0,13507672.0,762.0,96.805106,91722.0,97.170882,632213.0,2.0,6.0,90987.0


In [34]:
# Turn every best/worst table into scores: basepair metrics are scored by
# their relative closeness to the target, the other metrics relative to the
# column maximum (inverted for metrics where lower is better). The score
# tables are displayed, saved and kept in `rank_dfs` as [sample, DataFrame].
rank_dfs = []

# Reference value the basepair metrics are compared against
# (presumably the expected total assembly size - TODO confirm).
target = 30946587

for sample, df in best_stats_dfs:
    print(sample)

    rank_df = pd.DataFrame(index=df.index)

    for column in df.columns:
        values = df[column]

        if "basepairs" in column:
            # 1 at the target, decreasing linearly with the relative
            # deviation and floored at 0; a missing value scores 0.
            closeness = 1 - (values - target).abs() / target
            rank_df[column] = closeness.clip(lower=0).fillna(0)
        else:
            # Series division yields NaN for 0/0 without the scalar
            # "invalid value" runtime warning; those NaN are filled below.
            ratio = values / values.max()
            rank_df[column] = 1 - ratio if column in best_min else ratio

    # in case assembly fails: a row whose metrics sum to 0 scores 0 everywhere.
    # .loc is used because .at is meant for single-cell access only.
    failed_rows = df.sum(axis=1) == 0
    rank_df.loc[failed_rows, :] = 0

    rank_df = rank_df.fillna(1)  # happens when all values are 0
    display(rank_df)
    rank_df.to_csv("Tables/Results/Rank Global metrics per assembler - {}.csv".format(sample))
    rank_dfs.append([sample, rank_df])
            

LNN



invalid value encountered in double_scalars



Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,1.0,0.501338,0.92716,1.0,0.442845,0.903001,1.0,0.98557,1.0,1.0,0.916667,0.913043,0.909027
Metagenomic,Worst,1.0,0.449511,0.295775,1.0,0.440834,0.817865,0.991175,0.445195,0.998967,0.542225,0.833333,0.869565,0.435985
Genomic,Best,1.0,0.504297,0.980555,1.0,0.44077,0.921842,0.999047,1.0,0.999157,0.784744,1.0,1.0,1.0
Genomic,Worst,1.0,0.265068,0.0,1.0,0.144445,0.0,0.920547,0.03043,0.952938,0.054881,0.0,0.0,0.004759


ERR2935805



invalid value encountered in double_scalars



Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,1.0,0.899033,0.81289,1.0,0.60726,0.879376,1.0,0.821355,1.0,0.906483,1.0,1.0,0.32018
Metagenomic,Worst,1.0,0.553417,0.274259,1.0,0.448881,0.321464,0.993225,0.342501,0.994198,0.640566,0.994755,0.993902,0.014302
Genomic,Best,1.0,0.953816,0.994339,1.0,0.633214,0.971194,0.977351,1.0,0.979205,1.0,1.0,1.0,1.0
Genomic,Worst,1.0,0.284967,0.0,1.0,0.187466,0.0,0.647239,0.008362,0.874745,0.034394,0.0,0.0,0.005571


LHS



invalid value encountered in double_scalars



Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,1.0,0.457476,0.97529,1.0,0.441822,0.879423,1.0,0.560601,1.0,1.0,1.0,1.0,0.550545
Metagenomic,Worst,1.0,0.440707,0.932378,1.0,0.436483,0.500655,0.996112,0.261296,0.998812,0.688889,0.6,0.4,0.259202
Genomic,Best,1.0,0.523954,0.99884,1.0,0.430278,0.986239,0.992006,1.0,0.996793,0.923555,1.0,1.0,1.0
Genomic,Worst,1.0,0.095554,0.0,1.0,0.095338,0.0,0.793158,0.197594,0.948009,0.28728,0.0,0.0,0.132406


EMS



invalid value encountered in double_scalars



Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,1.0,0.988772,0.710426,1.0,0.982943,0.708931,1.0,0.852761,1.0,0.703845,0.923077,0.954545,0.852761
Metagenomic,Worst,1.0,0.982626,0.304033,1.0,0.974128,0.0,0.986332,0.257817,0.99585,0.359178,0.0,0.0,0.251802
Genomic,Best,1.0,0.990897,0.750381,1.0,0.980877,0.733652,0.99631,1.0,0.996434,1.0,1.0,1.0,1.0
Genomic,Worst,1.0,0.977463,0.0,1.0,0.971507,0.289474,0.985707,0.302542,0.988888,0.484719,0.307692,0.181818,0.301322


ERR2984773



invalid value encountered in double_scalars



Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,1.0,0.939815,0.912449,1.0,0.98765,0.713263,1.0,0.78277,1.0,1.0,0.933333,0.9375,0.533449
Metagenomic,Worst,1.0,0.41777,0.0,1.0,0.965257,0.0,0.981581,0.152196,0.985986,0.25155,0.0,0.0,0.109668
Genomic,Best,1.0,0.988573,0.985481,1.0,0.983375,0.839864,0.985069,1.0,0.998557,0.731592,1.0,1.0,1.0
Genomic,Worst,1.0,0.49625,0.354049,1.0,0.949777,0.227958,0.971916,0.406793,0.963571,0.370002,0.4,0.4375,0.278398


ENN



invalid value encountered in double_scalars



Unnamed: 0_level_0,Unnamed: 1_level_0,Ns,basepairs,contigs,filtered_Ns,filtered_basepairs,filtered_contigs,filtered_mapped_reads,filtered_n50,mapped_reads,max_contig,misassembled contigs,misassembly events,n50
Assembler Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Metagenomic,Best,1.0,0.987037,0.724376,1.0,0.982993,0.792005,1.0,1.0,1.0,1.0,0.888889,0.833333,1.0
Metagenomic,Worst,1.0,0.982033,0.215332,1.0,0.973626,0.25898,0.985257,0.272863,0.996453,0.564747,0.0,0.0,0.267066
Genomic,Best,1.0,0.983718,0.596899,1.0,0.978338,0.743337,0.991681,0.875706,0.997248,0.819851,1.0,1.0,0.856964
Genomic,Worst,1.0,0.972991,0.0,1.0,0.964084,0.0,0.980243,0.180914,0.986153,0.607926,0.444444,0.472222,0.176501


In [35]:
# One colour per row of a ranking table, assigned in row order.
COLOURS = ['#930001','#C97F80','#009392','#7FC9C8']

# Metric columns of the ranking tables. `categories` is not used by this cell;
# it is kept so that any later cell relying on the name keeps working.
categories = ['contigs','filtered_contigs','basepairs','filtered_basepairs', 'max_contig', 'n50','filtered_n50', 'mapped_reads','filtered_mapped_reads', 'Ns','filtered_Ns','misassembled contigs', 'misassembly events']

# Columns drawn on the "Original" radar chart and the axis labels shown for them.
categories_original =  ['contigs','basepairs', 'max_contig', 'n50', 'mapped_reads', 'Ns']
categories_original_renamed =  ['Contigs','Basepairs', 'Largest contig', 'N50', '% Mapped reads', 'Uncalled bases']

# Columns drawn on the "Filtered" radar chart and the axis labels shown for them.
categories_filtered = ['filtered_contigs', 'filtered_basepairs', 'filtered_n50', 'filtered_mapped_reads','filtered_Ns', 'misassembled contigs', 'misassembly events']
categories_filtered_renamed = ['Contigs', 'Basepairs', 'N50', '% Mapped reads','Uncalled bases', 'Misassembled contigs', 'Misassembly events']


def _polar_layout(axis_labels):
    """Return the polar-axis settings shared by both radar subplots.

    axis_labels: category labels placed around the angular axis, in order.
    """
    return dict(radialaxis=dict(visible=True,
                                range=[0,1],
                                linewidth = 2,
                                linecolor="black",
                                gridcolor = "#DCDCDC"),
                hole=1/12, bgcolor='rgb(255,255,255)',
                angularaxis=dict(linecolor="black"),
                angularaxis_categoryarray = axis_labels,
                radialaxis_angle = -22.5)


# For every (sample, ranking table) pair, draw two side-by-side radar charts
# (original vs. filtered assembly metrics) with one trace per table row, show
# the figure and export it as a standalone HTML file.
for sample, df in rank_dfs:

    fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'polar'}, {'type': 'polar'}]], subplot_titles=('Original', 'Filtered'))

    # (source columns, axis labels, subplot column, show in legend).
    # Only the left-hand trace gets a legend entry so each row is listed once.
    panels = ((categories_original, categories_original_renamed, 1, True),
              (categories_filtered, categories_filtered_renamed, 2, False))

    for i, (index, row) in enumerate(df.iterrows()):
        # The row index is a tuple; its first two levels label the trace.
        assembler = index[0]
        assembly_type = index[1]
        trace_name = "{} - {}".format(assembler, assembly_type)

        # Cycle through the palette so a table with more rows than colours
        # does not raise IndexError.
        colour = COLOURS[i % len(COLOURS)]

        for columns, axis_labels, subplot_col, in_legend in panels:
            fig.add_trace(go.Scatterpolar(r=[row.at[col] for col in columns],
                                          theta=axis_labels, mode='lines+markers',
                                          marker=dict(color=colour, size=12),
                                          marker_line_color="black",
                                          marker_line_width=2,
                                          opacity=0.6,
                                          name=trace_name, line=dict(color=colour), showlegend=in_legend), col=subplot_col, row=1)

    fig.update_layout(polar=_polar_layout(categories_original_renamed),
                      polar2=_polar_layout(categories_filtered_renamed))
    fig.update_layout(title=sample)
    # Horizontal legend placed below the two charts.
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.5,
        xanchor="left",
        x=0
    ))

    fig.show()
    # NOTE(review): assumes the 'Plots/Global Metrics' directory already exists - confirm.
    plot(fig, filename='Plots/Global Metrics/Assembly Type - {}.html'.format(sample), auto_open=False)

