# 9 Plasmid Recovery

## Imports

In [1]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

## Global Variables

In [4]:
ASSEMBLER_PROCESS_LIST = ["ABYSS", "BCALM2", "GATBMINIAPIPELINE", "METAHIPMER2", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMIZER", "IDBA"]
PROCESS_TO_NAME = {"ABYSS": "AbYSS",
                   "BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "METAHIPMER2": "MetaHipMer2",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT", 
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMIZER": "VelvetOptimizer",
                   "IDBA": "IDBA-UD"}

log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EMS', 'LNN', 'ERR2984773']

genomic_assemblers = ['ABySS', 'BCALM2', 'MINIA', 'SKESA', 'SPAdes', 'Unicycler', 'VelvetOptimizer']
metagenomic_assemblers = ['MetaHipMer2','GATBMiniaPipeline', 'IDBA-UD', 'MEGAHIT', 'metaSPAdes']
single_kmer = ['BCALM2', 'MINIA', 'ABySS']
multiple_kmer = ['SKESA', 'SPAdes', 'Unicycler', 'VelvetOptimizer', 'GATBMiniaPipeline', 
                 'IDBA-UD', 'MEGAHIT', 'metaSPAdes', 'MetaHipMer2']

skipped_assemblers = ['ABySS', 'BCALM2', 'MINIA', 'VelvetOptimiser', 'MetaHipMer2']

REFERENCE_TO_NAME = {"Bacillus_subtilis":"<i>Bacillus subtilis</i>",
                    "Enterococcus_faecalis":"<i>Enterococcus faecalis</i>",
                    "Escherichia_coli":"<i>Escherichia coli</i>",
                    "Escherichia_coli_plasmid": "<i>Escherichia coli</i> pasmid",
                    "Lactobacillus_fermentum": "<i>Lactobacillus fermentum</i>",
                    "Listeria_monocytogenes": "<i>Listeria monocytogenes</i>",
                    "Pseudomonas_aeruginosa": "<i>Pseudomonas aeruginosa</i>",
                    "Salmonella_enterica": "<i>Salmonella enterica</i>",
                    "Staphylococcus_aureus": "<i>Staphylococcus aureus</i>",
                    "Staphylococcus_aureus_plasmid1": "<i>Staphylococcus aureus</i> plasmid 1",
                    "Staphylococcus_aureus_plasmid2": "<i>Staphylococcus aureus</i> plasmid 2",
                    "Staphylococcus_aureus_plasmid3": "<i>Staphylococcus aureus</i> pasmid 3"}

plasmid_reference = ["<i>Escherichia coli</i> pasmid",
                     "<i>Staphylococcus aureus</i> plasmid 1",
                    "<i>Staphylococcus aureus</i> plasmid 2",
                    "<i>Staphylococcus aureus</i> pasmid 3"]

best_min = ['Ns','misassembled contigs','misassembly events']
best_min_exept_0 = ['contigs', 'L90'] 
best_max = ['LSA', 'NA50','NG50','breadth_of_coverage','identity','lowest_identity']

target_compass = {'multiplicity': 1,'parsimony':1, 'validity':1}
    
target_dict = {
    'Bacillus_subtilis': 4045677,
    'Enterococcus_faecalis': 2845392,
    'Escherichia_coli': 4765434,
    'Escherichia_coli_plasmid': 110007,
    'Lactobacillus_fermentum': 1905333,
    'Listeria_monocytogenes': 2992342,
    'Pseudomonas_aeruginosa': 6792330,
    'Salmonella_enterica': 4759746,
    'Staphylococcus_aureus': 2718780,
    'Staphylococcus_aureus_plasmid1': 6337,
    'Staphylococcus_aureus_plasmid2': 2216,
    'Staphylococcus_aureus_plasmid3': 2993
}

COLOURS = ['#5876c8', '#58AEC8', '#39B185', '#9CCB86', '#EEB479', '#E88471', '#a54765', '#a42a2a', '#835221']

## Load data

In [5]:
report_glob = glob.glob('../Results/*/*/report/pipeline_report_tables.json')
reference_pipeline_metrics_df = pd.DataFrame()

for pipeline_report_file in report_glob:
    report_file_name = pipeline_report_file.split('/')[-1]
    stats_run = pipeline_report_file.split('/')[-3]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))
    
    with open(pipeline_report_file) as _fh:
        json_report = json.load(_fh)
        for sample in json_report.keys():
            for reference, data in json_report[sample]['ReferenceTables'].items():
                for row in data:
                    for item in row:
                        reference_pipeline_metrics_df = reference_pipeline_metrics_df.append({'run': stats_run,
                                                                                        'sample': sample,
                                                                                        'assembler': item['assembler'],
                                                                                        'reference': REFERENCE_TO_NAME[reference],
                                                                                        'LSA': item['contiguity'],
                                                                                        'breadth_of_coverage': item['breadth_of_coverage'],
                                                                                        'multiplicity': item['multiplicity'],
                                                                                        'validity': item['validity'],
                                                                                        'parsimony': item['parsimony'],
                                                                                        'identity': item['identity'],
                                                                                        'lowest_identity': item['lowest_identity'],
                                                                                        'L90': item['L90'],
                                                                                        'contigs': item['aligned_contigs'],
                                                                                        'NA50': item['NA50'],
                                                                                        'NG50': item['NG50'],
                                                                                        'basepairs': item['aligned_basepairs'],
                                                                                        'Ns': item['Ns'],
                                                                                        'misassembled contigs': item['misassembled_contigs'],
                                                                                        'misassembly events': item['misassembly_events']},
                                                                                       ignore_index=True)

reference_pipeline_metrics_df['distribution'] = np.where(reference_pipeline_metrics_df['sample'].isin(log_distributed), 'Log', 'Even')
reference_pipeline_metrics_df = reference_pipeline_metrics_df[reference_pipeline_metrics_df.distribution != 'Log']
reference_pipeline_metrics_df['type'] = np.where(reference_pipeline_metrics_df['assembler'].isin(genomic_assemblers), 'Genomic', 'Metagenomic')
reference_pipeline_metrics_df['genome'] = np.where(reference_pipeline_metrics_df['reference'].isin(plasmid_reference), 'Plasmid', 'Genome')
reference_pipeline_metrics_df['algorythm'] = np.where(reference_pipeline_metrics_df['assembler'].isin(single_kmer), 'Single k-mer De Bruijn graph', 'Multiple k-mer De Bruijn graph')
reference_pipeline_metrics_df[['contigs','basepairs','L90','Ns','NA50','NG50','misassembled contigs', 'misassembly events', 'multiplicity','validity','parsimony','identity','lowest_identity']] = reference_pipeline_metrics_df[['contigs','basepairs','L90','Ns','NA50','NG50','misassembled contigs', 'misassembly events','multiplicity','validity','parsimony','identity','lowest_identity']].apply(pd.to_numeric)
display(reference_pipeline_metrics_df)

Processing pipeline_report_tables.json data from run3...
Processing pipeline_report_tables.json data from run2...
Processing pipeline_report_tables.json data from run1...
Processing pipeline_report_tables.json data from run3...
Processing pipeline_report_tables.json data from run2...


Unnamed: 0,L90,LSA,NA50,NG50,Ns,assembler,basepairs,breadth_of_coverage,contigs,identity,...,multiplicity,parsimony,reference,run,sample,validity,distribution,type,genome,algorythm
864,13.0,0.106260,295233.0,295233.0,0.0,ABySS,3989374.0,0.986083,22.0,0.999910,...,1.000305,1.000305,<i>Bacillus subtilis</i>,run1,EMS,1.000000,Even,Genomic,Genome,Single k-mer De Bruijn graph
865,0.0,0.002419,2062.0,1589.0,0.0,BCALM2,2844464.0,0.703087,1467.0,1.000000,...,1.004273,1.004273,<i>Bacillus subtilis</i>,run1,EMS,1.000000,Even,Genomic,Genome,Single k-mer De Bruijn graph
866,13.0,0.189977,263445.0,263445.0,0.0,GATBMiniaPipeline,3984103.0,0.984780,27.0,0.999864,...,1.000073,1.000094,<i>Bacillus subtilis</i>,run1,EMS,0.999979,Even,Metagenomic,Genome,Multiple k-mer De Bruijn graph
867,18.0,0.084760,212969.0,212969.0,0.0,IDBA-UD,3977845.0,0.983233,34.0,0.997696,...,1.000061,1.000084,<i>Bacillus subtilis</i>,run1,EMS,0.999977,Even,Metagenomic,Genome,Multiple k-mer De Bruijn graph
868,13.0,0.189990,263586.0,263586.0,0.0,MEGAHIT,3986471.0,0.985366,27.0,0.999421,...,0.999952,1.000269,<i>Bacillus subtilis</i>,run1,EMS,0.999683,Even,Metagenomic,Genome,Multiple k-mer De Bruijn graph
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2587,0.0,0.000000,0.0,0.0,0.0,MINIA,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,<i>Staphylococcus aureus</i> pasmid 3,run2,EMS,0.000000,Even,Genomic,Plasmid,Single k-mer De Bruijn graph
2588,0.0,0.908119,2718.0,2718.0,0.0,SKESA,2718.0,0.908119,1.0,1.000000,...,1.000000,1.000000,<i>Staphylococcus aureus</i> pasmid 3,run2,EMS,1.000000,Even,Genomic,Plasmid,Multiple k-mer De Bruijn graph
2589,0.0,0.999666,2992.0,2992.0,0.0,SPAdes,2992.0,0.999666,1.0,1.000000,...,1.000000,1.000000,<i>Staphylococcus aureus</i> pasmid 3,run2,EMS,1.000000,Even,Genomic,Plasmid,Multiple k-mer De Bruijn graph
2590,0.0,0.964250,2886.0,2886.0,0.0,Unicycler,2886.0,0.964250,1.0,1.000000,...,1.000000,1.000000,<i>Staphylococcus aureus</i> pasmid 3,run2,EMS,1.000000,Even,Genomic,Plasmid,Multiple k-mer De Bruijn graph


## Global statistics per genome type

In [7]:
print(reference_pipeline_metrics_df['reference'][reference_pipeline_metrics_df['genome'] == "Genome"].unique())
print(reference_pipeline_metrics_df['reference'][reference_pipeline_metrics_df['genome'] == "Plasmid"].unique())

['<i>Bacillus subtilis</i>' '<i>Enterococcus faecalis</i>'
 '<i>Escherichia coli</i>' '<i>Lactobacillus fermentum</i>'
 '<i>Listeria monocytogenes</i>' '<i>Pseudomonas aeruginosa</i>'
 '<i>Salmonella enterica</i>' '<i>Staphylococcus aureus</i>']
['<i>Escherichia coli</i> pasmid' '<i>Staphylococcus aureus</i> plasmid 1'
 '<i>Staphylococcus aureus</i> plasmid 2'
 '<i>Staphylococcus aureus</i> pasmid 3']


In [9]:
for to_skip in skipped_assemblers:
    reference_pipeline_metrics_df = reference_pipeline_metrics_df.drop(reference_pipeline_metrics_df.loc[reference_pipeline_metrics_df['assembler']==to_skip].index)

In [19]:
for sample in reference_pipeline_metrics_df['sample'].unique():
    print(sample)
    stats_per_assembler_type = pd.DataFrame()
    for assembly_type in reference_pipeline_metrics_df.genome.unique():
        row={'Genome Type': assembly_type}
        lala=reference_pipeline_metrics_df[(reference_pipeline_metrics_df['sample'] == sample) & (reference_pipeline_metrics_df.genome == assembly_type)].describe()
        for column in lala.columns:
            mean = lala.loc['mean',column]
            minimum = lala.loc['min',column]
            maximum = lala.loc['max',column]
            row[column] = "{} [{};{}]".format(round(mean, 2), round(minimum,2), round(maximum,2))
        stats_per_assembler_type=stats_per_assembler_type.append(row, ignore_index=True)
    stats_per_assembler_type = stats_per_assembler_type.set_index('Genome Type')
    display(stats_per_assembler_type)
    stats_per_assembler_type.to_csv("Tables/Results/Global metrics per genome type - {}.csv".format(sample))

EMS


Unnamed: 0_level_0,L90,LSA,NA50,NG50,Ns,basepairs,breadth_of_coverage,contigs,identity,lowest_identity,misassembled contigs,misassembly events,multiplicity,parsimony,validity
Genome Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Genome,40.88 [5.0;283.0],0.13 [0.02;0.48],183948.05 [19013.0;475510.0],182738.27 [18446.0;475510.0],0.0 [0.0;0.0],3769457.38 [1775427.0;6731626.0],0.98 [0.93;0.99],78.48 [13.0;439.0],1.0 [0.98;1.0],0.92 [0.25;1.0],0.7 [0.0;9.0],1.25 [0.0;14.0],1.0 [1.0;1.01],1.0 [1.0;1.02],1.0 [0.98;1.0]
Plasmid,0.71 [0.0;7.0],0.76 [0.0;1.0],7358.0 [0.0;28353.0],7307.86 [0.0;28353.0],0.0 [0.0;0.0],23248.11 [0.0;107718.0],0.9 [0.0;1.0],2.71 [0.0;26.0],0.96 [0.0;1.0],0.96 [0.0;1.0],0.0 [0.0;0.0],0.0 [0.0;0.0],0.97 [0.0;1.02],0.97 [0.0;1.02],0.96 [0.0;1.0]


ERR2984773


Unnamed: 0_level_0,L90,LSA,NA50,NG50,Ns,basepairs,breadth_of_coverage,contigs,identity,lowest_identity,misassembled contigs,misassembly events,multiplicity,parsimony,validity
Genome Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Genome,54.16 [6.0;215.0],0.1 [0.02;0.4],142136.04 [16131.0;542872.0],136656.2 [15599.0;542872.0],0.0 [0.0;0.0],3767728.38 [1757511.0;6737986.0],0.97 [0.92;0.99],98.11 [17.0;358.0],1.0 [0.98;1.0],0.92 [0.42;1.0],0.62 [0.0;4.0],1.21 [0.0;8.0],1.0 [1.0;1.04],1.0 [1.0;1.04],1.0 [0.99;1.0]
Plasmid,0.54 [0.0;4.0],0.74 [0.0;1.29],12248.83 [0.0;55277.0],12175.08 [0.0;55277.0],0.0 [0.0;0.0],29559.5 [0.0;109516.0],0.89 [0.0;1.0],1.68 [0.0;7.0],0.93 [0.0;1.0],0.92 [0.0;1.0],0.0 [0.0;0.0],0.0 [0.0;0.0],0.95 [0.0;1.29],0.96 [0.0;1.29],0.93 [0.0;1.0]


ENN


Unnamed: 0_level_0,L90,LSA,NA50,NG50,Ns,basepairs,breadth_of_coverage,contigs,identity,lowest_identity,misassembled contigs,misassembly events,multiplicity,parsimony,validity
Genome Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Genome,53.85 [8.0;670.0],0.12 [0.01;0.38],169760.11 [10114.0;447841.0],167684.43 [9897.0;447841.0],0.0 [0.0;0.0],3760224.15 [1774634.0;6730265.0],0.97 [0.93;0.99],99.08 [16.0;933.0],1.0 [0.99;1.0],0.92 [0.22;1.0],0.9 [0.0;8.0],1.71 [0.0;15.0],1.0 [1.0;1.01],1.0 [1.0;1.02],1.0 [0.99;1.0]
Plasmid,0.96 [0.0;8.0],0.68 [0.0;1.0],5003.07 [0.0;15523.0],4629.75 [0.0;12670.0],0.0 [0.0;0.0],21957.96 [0.0;107736.0],0.84 [0.0;1.0],3.39 [0.0;24.0],0.92 [0.0;1.0],0.92 [0.0;1.0],0.0 [0.0;0.0],0.0 [0.0;0.0],0.93 [0.0;1.02],0.95 [0.0;1.46],0.92 [0.0;1.0]


In [34]:
best_stats_dfs = []

all_data = pd.DataFrame()

for sample in reference_pipeline_metrics_df['sample'].unique():
    print(sample)
    best_stats_per_assembler = pd.DataFrame()

    for assembly_type in reference_pipeline_metrics_df.genome.unique():
        
        best_row = {'Genome Type': assembly_type, "Type": "Best"}
        worst_row = {'Genome Type': assembly_type, "Type": "Worst"}
        
        describe_df = reference_pipeline_metrics_df[(reference_pipeline_metrics_df['sample'] == sample) & (reference_pipeline_metrics_df.genome == assembly_type)].describe()

        for column in describe_df.columns:
            mean = describe_df.loc['mean',column]
            minimum = describe_df.loc['min',column]
            maximum = describe_df.loc['max',column]
                        
            if "basepairs" in column:
                    target = target_dict[reference]
                    best_row[column] = min(list(reference_pipeline_metrics_df[column][(reference_pipeline_metrics_df['genome'] == assembly_type) & (reference_pipeline_metrics_df['sample'] == sample)]), key=lambda x:abs(x-target))
                    worst_row[column] = max(list(reference_pipeline_metrics_df[column][(reference_pipeline_metrics_df['genome'] == assembly_type) & (reference_pipeline_metrics_df['sample'] == sample)]), key=lambda x:abs(x-target))

            elif column in target_compass.keys():
                    target=1
                    best_row[column] = min(list(reference_pipeline_metrics_df[column][(reference_pipeline_metrics_df['genome'] == assembly_type) & (reference_pipeline_metrics_df['sample'] == sample)]), key=lambda x:abs(x-target))
                    worst_row[column] = max(list(reference_pipeline_metrics_df[column][(reference_pipeline_metrics_df['genome'] == assembly_type) & (reference_pipeline_metrics_df['sample'] == sample)]), key=lambda x:abs(x-target))

            elif column in best_min_exept_0:
                temp = [x for x in list(reference_pipeline_metrics_df[column][(reference_pipeline_metrics_df['genome'] == assembly_type) & (reference_pipeline_metrics_df['sample'] == sample)]) if x != 0]
                if len(temp) > 0:
                    best_row[column] = min(temp)
                    worst_row[column] = max(temp)
                else:
                    best_row[column] = 0
                    worst_row[column] = 0
            elif column in best_min:
                best_row[column] = minimum
                worst_row[column] = maximum
            elif column in best_max:
                best_row[column] = maximum
                worst_row[column] = minimum
                
        best_stats_per_assembler=best_stats_per_assembler.append(best_row, ignore_index=True)
        best_stats_per_assembler=best_stats_per_assembler.append(worst_row, ignore_index=True)
        
    best_stats_per_assembler = best_stats_per_assembler.set_index(['Genome Type', 'Type'])
    best_stats_dfs.append([sample, best_stats_per_assembler])


EMS
ERR2984773
ENN


In [44]:
rank_dfs = []

for sample, df  in best_stats_dfs:
    
    rank_df = pd.DataFrame(index=df.index)
    
    for column in df.columns:
        rank_list = []
        
        if "basepairs" in column:
            target = target_dict[reference]
            for item in df[column]:
                diff = 1 - abs(item-target) / target
                rank_list.append(diff)
            rank_df[column] = rank_list

        elif column in target_compass:
            target = 1
            for item in df[column]:
                diff = 1- abs(item-target) / target
                rank_list.append(diff)
            rank_df[column] = rank_list

        elif column in best_min_exept_0:
            for item in df[column]:
                if item == 0:
                    diff=0
                else:
                    diff = 1 - (item / df[column].max())
                rank_list.append(diff)

        elif column in best_min:
            for item in df[column]:
                diff = 1 - (item / df[column].max())
                rank_list.append(diff)
        else:
            for item in df[column]:
                diff = (item / df[column].max())
                rank_list.append(diff)

        rank_df[column] = rank_list

    # in case assembly fails
    for assembler, row in df.iterrows():
        if row.sum() == 0:
            rank_df.at[assembler, :]=0

    rank_df = rank_df.fillna(0) #happens when all values are 0
    display(rank_df) 
    rank_dfs.append([sample, rank_df])


invalid value encountered in double_scalars



Unnamed: 0_level_0,Unnamed: 1_level_0,L90,LSA,NA50,NG50,Ns,basepairs,breadth_of_coverage,contigs,identity,lowest_identity,misassembled contigs,misassembly events,multiplicity,parsimony,validity
Genome Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Genome,Best,0.982332,0.479754,1.0,1.0,0.0,-591.193117,0.992228,0.970387,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Genome,Worst,0.0,0.018884,0.039984,0.038792,0.0,-2247.123288,0.931967,0.0,0.984162,0.250734,0.0,0.0,0.993701,0.976641,0.977174
Plasmid,Best,0.996466,1.0,0.059627,0.059627,0.0,0.999666,1.0,0.997722,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Plasmid,Worst,0.975265,0.0,0.0,0.0,0.0,-33.989977,0.0,0.940774,0.0,0.0,1.0,1.0,0.0,0.0,0.0



invalid value encountered in double_scalars



Unnamed: 0_level_0,Unnamed: 1_level_0,L90,LSA,NA50,NG50,Ns,basepairs,breadth_of_coverage,contigs,identity,lowest_identity,misassembled contigs,misassembly events,multiplicity,parsimony,validity
Genome Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Genome,Best,0.972093,0.312936,1.0,1.0,0.0,-585.20715,0.991999,0.952514,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Genome,Worst,0.0,0.013281,0.029714,0.028734,0.0,-2249.248246,0.922417,0.0,0.978636,0.420899,0.0,0.0,0.963093,0.963089,0.986844
Plasmid,Best,0.995349,1.0,0.101823,0.101823,0.0,1.0,1.0,0.997207,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Plasmid,Worst,0.981395,0.0,0.0,0.0,0.0,-34.590712,0.0,0.980447,0.0,0.0,1.0,1.0,0.0,0.0,0.0



invalid value encountered in double_scalars



Unnamed: 0_level_0,Unnamed: 1_level_0,L90,LSA,NA50,NG50,Ns,basepairs,breadth_of_coverage,contigs,identity,lowest_identity,misassembled contigs,misassembly events,multiplicity,parsimony,validity
Genome Type,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Genome,Best,0.98806,0.381399,1.0,1.0,0.0,-590.928166,0.99156,0.982851,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Genome,Worst,0.0,0.006355,0.022584,0.022099,0.0,-2246.66856,0.931404,0.0,0.992404,0.221785,0.0,0.0,0.991311,0.978105,0.987077
Plasmid,Best,0.998507,1.0,0.034662,0.028291,0.0,0.999666,1.0,0.998928,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Plasmid,Worst,0.98806,0.0,0.0,0.0,0.0,-33.995991,0.0,0.974277,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [70]:
COLOURS = ['#930001','#C97F80','#009392','#7FC9C8']

fig = make_subplots(rows=3, cols=1, specs=[[{'type': 'polar'}],[{'type': 'polar'}], [{'type': 'polar'}]], subplot_titles=([x[0] for x in rank_dfs]))

j=0

for sample, df in rank_dfs:
    showlegend=True if j==1 else False
    i= 0
    j+=1

    for index, row in df.iterrows():
        # BEST
        genomt_type = index[0]
        category = index[1]
        
        row_data = []

        for col in df.columns:
            row_data.append(row.loc[:].at[col])
        
        if category == 'Best':
            fig.add_trace(go.Scatterpolar(r=row_data,
                                      theta=list([z.replace('_',' ') for z in df.columns]), mode='lines+markers',
                                      marker=dict(color=COLOURS[i], size=12), 
                                      marker_line_color="black", 
                                      marker_line_width=2,
                                      opacity=0.6,
                                      name="{} - {}".format(genomt_type, category), line=dict(color=COLOURS[i]), showlegend=showlegend),
                         col=1, row=j)

            
        else:
            fig.add_trace(go.Scatterpolar(r=row_data,
                                      theta=list([z.replace('_',' ') for z in df.columns]), mode='lines+markers',
                                      marker=dict(color=COLOURS[i], size=12), 
                                      marker_line_color="black", 
                                      marker_line_width=2,
                                      opacity=0.6,
                                      name="{} - {}".format(genomt_type, category), line=dict(color=COLOURS[i]), showlegend=showlegend),
                         col=1, row=j)

            
        i+=1
            
fig.update_layout(polar=dict(radialaxis=dict(visible=True,
                                             range=[0,1],
                                             linewidth = 2,
                                             linecolor="black",
                                             gridcolor = "#DCDCDC"), 
                             hole=1/12, bgcolor='rgb(255,255,255)',
                            angularaxis=dict(linecolor="black"),
                            radialaxis_angle = -22.5),
                 polar2=dict(radialaxis=dict(visible=True,
                                             range=[0,1],
                                             linewidth = 2,
                                             linecolor="black",
                                             gridcolor = "#DCDCDC"), 
                             hole=1/12, bgcolor='rgb(255,255,255)',
                            angularaxis=dict(linecolor="black"),
                            radialaxis_angle = -22.5),
                 polar3=dict(radialaxis=dict(visible=True,
                                             range=[0,1],
                                             linewidth = 2,
                                             linecolor="black",
                                             gridcolor = "#DCDCDC"), 
                             hole=1/12, bgcolor='rgb(255,255,255)',
                            angularaxis=dict(linecolor="black"),
                            radialaxis_angle = -22.5))

fig.layout.annotations[0].update(y=1.05, font=dict(size=20,color="black"))
fig.layout.annotations[1].update(y=0.65, font=dict(size=20,color="black"))
fig.layout.annotations[2].update(y=0.25, font=dict(size=20,color="black"))


fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=-0.1,
    xanchor="left",
    x=0.2
))

fig.show()
plot(fig, filename='Plots/Reference Metrics/Genomic Type - all.html', auto_open=False)



'Plots/Reference Metrics/Genomic Type - all.html'