# 5. LMAS PLS Metrics Analysis

In [15]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

In [45]:
ASSEMBLER_PROCESS_LIST = ["ABYSS", "BCALM2", "GATBMINIAPIPELINE", "METAHIPMER2", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMISER", "IDBA"]

PROCESS_TO_NAME = {"ABYSS": "AbYSS",
                   "BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "METAHIPMER2": "MetaHipMer2",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT", 
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMISER": "VelvetOptimiser",
                   "IDBA": "IDBA-UD"}

log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EMS', 'LNN', 'ERR2984773']

skipped_assemblers = ['ABySS', 'BCALM2', 'MINIA', 'VelvetOptimiser', 'MetaHipMer2']

REFERENCE_TO_NAME = {"Bacillus_subtilis":"<i>Bacillus subtilis</i>",
                    "Enterococcus_faecalis":"<i>Enterococcus faecalis</i>",
                    "Escherichia_coli":"<i>Escherichia coli</i>",
                    "Escherichia_coli_plasmid": np.nan,
                    "Lactobacillus_fermentum": "<i>Lactobacillus fermentum</i>",
                    "Listeria_monocytogenes": "<i>Listeria monocytogenes</i>",
                    "Pseudomonas_aeruginosa": "<i>Pseudomonas aeruginosa</i>",
                    "Salmonella_enterica": "<i>Salmonella enterica</i>",
                    "Staphylococcus_aureus": "<i>Staphylococcus aureus</i>",
                    "Staphylococcus_aureus_plasmid1": np.nan,
                    "Staphylococcus_aureus_plasmid2": np.nan,
                    "Staphylococcus_aureus_plasmid3": np.nan}

genomic_assemblers = ['ABySS', 'BCALM2', 'MINIA', 'SKESA', 'SPAdes', 'Unicycler', 'VelvetOptimiser']
metagenomic_assemblers = ['MetaHipMer2','GATBMiniaPipeline', 'IDBA-UD', 'MEGAHIT', 'metaSPAdes']
single_kmer = ['BCALM2', 'MINIA', 'ABySS']
multiple_kmer = ['SKESA', 'SPAdes', 'Unicycler', 'VelvetOptimizer', 'GATBMiniaPipeline', 
                 'IDBA-UD', 'MEGAHIT', 'metaSPAdes', 'MetaHipMer2']

best_min = ['Ns', 'contigs', 'filtered_Ns', 'filtered_contigs','misassembled contigs','misassembly events']
best_max = ['basepairs','filtered_basepairs','filtered_mapped_reads','filtered_n50','mapped_reads','max_contig','n50']

COLOURS = ['#5876c8', '#58AEC8', '#39B185', '#9CCB86', '#EEB479', '#E88471', '#a54765', '#a42a2a', '#835221']

In [47]:
PLS_files = glob.glob('../Results/*/*/results/*/stats/*phred.csv')
PLS_df = pd.DataFrame(columns=['run', 'Sample', 'Assembler','Reference','Contig','Contig Length','Phred Quality Score'])

for PLS_file in PLS_files:
    report_file_name = PLS_file.split('/')[-1]
    stats_run = PLS_file.split('/')[3]
    sample_name = report_file_name.split('_')[0]
    assembler_name = report_file_name.split('_')[1]
    if assembler_name not in skipped_assemblers:
        print('Processing {0} data from {1}...'.format(report_file_name, stats_run))

        with open(PLS_file) as tsvfile:
            tsvreader = csv.reader(tsvfile, delimiter=",")
            next(tsvreader, None)
            for row in tsvreader:
                    PLS_df = PLS_df.append({'run': stats_run,
                                            'Assembler': row[1],
                                            'Sample': sample_name,
                                            'Reference': REFERENCE_TO_NAME[row[2]],
                                            'Contig': row[3],
                                            'Contig Length': row[4],
                                            'Phred Quality Score': row[5]}, ignore_index=True)
display(PLS_df)

Processing LNN_Unicycler_phred.csv data from run3...
Processing LNN_GATBMiniaPipeline_phred.csv data from run3...
Processing LNN_SPAdes_phred.csv data from run3...
Processing LNN_SKESA_phred.csv data from run3...
Processing LNN_MEGAHIT_phred.csv data from run3...
Processing LNN_metaSPAdes_phred.csv data from run3...
Processing LNN_IDBA-UD_phred.csv data from run3...
Processing ERR2935805_GATBMiniaPipeline_phred.csv data from run3...
Processing ERR2935805_MEGAHIT_phred.csv data from run3...
Processing ERR2935805_metaSPAdes_phred.csv data from run3...
Processing ERR2935805_Unicycler_phred.csv data from run3...
Processing ERR2935805_IDBA-UD_phred.csv data from run3...
Processing ERR2935805_SPAdes_phred.csv data from run3...
Processing ERR2935805_SKESA_phred.csv data from run3...
Processing LHS_GATBMiniaPipeline_phred.csv data from run3...
Processing LHS_SKESA_phred.csv data from run3...
Processing LHS_Unicycler_phred.csv data from run3...
Processing LHS_SPAdes_phred.csv data from run3...


Unnamed: 0,run,Sample,Assembler,Reference,Contig,Contig Length,Phred Quality Score
0,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,14,242687,60
1,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,17,228676,60
2,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,18,223740,60
3,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,20,212812,60
4,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,30,139230,60
...,...,...,...,...,...,...,...
81615,run2,ERR2984773,GATBMiniaPipeline,<i>Staphylococcus aureus</i>,5377,44765,60
81616,run2,ERR2984773,GATBMiniaPipeline,<i>Staphylococcus aureus</i>,5382,129676,60
81617,run2,ERR2984773,GATBMiniaPipeline,,1920,6515,23.367931653289368
81618,run2,ERR2984773,GATBMiniaPipeline,,681,2369,60


In [49]:
PLS_df.Reference.unique()

array(['<i>Bacillus subtilis</i>', '<i>Listeria monocytogenes</i>',
       '<i>Pseudomonas aeruginosa</i>', '<i>Escherichia coli</i>',
       '<i>Lactobacillus fermentum</i>', '<i>Salmonella enterica</i>',
       '<i>Enterococcus faecalis</i>', nan,
       '<i>Staphylococcus aureus</i>'], dtype=object)

In [50]:
PLS_df = PLS_df.dropna()
PLS_df = PLS_df.drop_duplicates(subset=['Contig Length', 'Phred Quality Score'])
PLS_df.Reference.unique()

array(['<i>Bacillus subtilis</i>', '<i>Listeria monocytogenes</i>',
       '<i>Pseudomonas aeruginosa</i>', '<i>Escherichia coli</i>',
       '<i>Lactobacillus fermentum</i>', '<i>Salmonella enterica</i>',
       '<i>Enterococcus faecalis</i>', '<i>Staphylococcus aureus</i>'],
      dtype=object)

In [51]:
PLS_df

Unnamed: 0,run,Sample,Assembler,Reference,Contig,Contig Length,Phred Quality Score
0,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,14,242687,60
1,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,17,228676,60
2,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,18,223740,60
3,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,20,212812,60
4,run3,LNN,Unicycler,<i>Bacillus subtilis</i>,30,139230,60
...,...,...,...,...,...,...,...
81611,run2,ERR2984773,GATBMiniaPipeline,<i>Staphylococcus aureus</i>,4997,35271,60
81612,run2,ERR2984773,GATBMiniaPipeline,<i>Staphylococcus aureus</i>,5298,143138,60
81613,run2,ERR2984773,GATBMiniaPipeline,<i>Staphylococcus aureus</i>,5372,51929,60
81614,run2,ERR2984773,GATBMiniaPipeline,<i>Staphylococcus aureus</i>,5375,52131,60


In [52]:
PLS_df[['Contig Length', 'Phred Quality Score']] = PLS_df[['Contig Length', 'Phred Quality Score']].apply(pd.to_numeric)

In [53]:
PLS_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19039 entries, 0 to 81616
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   run                  19039 non-null  object 
 1   Sample               19039 non-null  object 
 2   Assembler            19039 non-null  object 
 3   Reference            19039 non-null  object 
 4   Contig               19039 non-null  object 
 5   Contig Length        19039 non-null  int64  
 6   Phred Quality Score  19039 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 1.2+ MB


In [55]:
fig=make_subplots(rows=4, cols=2, subplot_titles=[x.replace('_', ' ') for x in sorted(PLS_df.Reference.unique(), key=lambda v: v.upper(), reverse=False)],
                 shared_yaxes=True, shared_xaxes=True, x_title="Contig size", y_title="Score")
row=1
col=1
showlegend = True if col == 1 and row == 1 else False
for reference in sorted(PLS_df.Reference.unique(), key=lambda v: v.upper(), reverse=False):
    i=0
    print(row, col)
    for assembler in sorted(PLS_df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
        print('---' + assembler)
        showlegend=True if col==1 and row==1 else False

        fig.add_trace(go.Scatter(y=PLS_df['Phred Quality Score'][(PLS_df['Reference'] == reference) &
                                                                         (PLS_df['Assembler'] == assembler)],
                                          x=PLS_df['Contig Length'][(PLS_df['Reference'] == reference) &
                                                                     (PLS_df['Assembler'] == assembler)],
                                          showlegend=showlegend, name=assembler, mode="markers",
                                          marker=dict(color=COLOURS[i], size=12,opacity=0.5,
                                                                     line=dict(color='black',width=1))), 
                              row=row, col=col)
        i+=1
    if row == 4:
        row = 1
        col += 1
    else:
        row += 1

for i in fig['layout']['annotations']:
    i['font']['size'] = 12
fig.update_layout(legend=dict(
    orientation="h",
    y=-0.1,
    x=0
))

fig.update_layout(plot_bgcolor='rgb(255,255,255)')
fig.update_layout(title=dict(text='Pls Metric per contig<br><sup>ZymoBIOMICS Microbial Community Standard bacterial reference replicons</sup><br>',
                             x=0.5,
                             y=0.98,
                             xanchor='center',
                             yanchor='top'),
                 font=dict(size=18))
fig.update_xaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', rangemode='tozero')
fig.update_yaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', range=[0, 70])

for i in fig['layout']['annotations']:
    i['font']['size'] = 18

fig.update_layout(xaxis_showticklabels=True, xaxis2_showticklabels=True, xaxis3_showticklabels=True, xaxis4_showticklabels=True, xaxis5_showticklabels=True,  xaxis6_showticklabels=True,  xaxis7_showticklabels=True,  xaxis8_showticklabels=True)
fig.update_layout(yaxis_showticklabels=True, yaxis2_showticklabels=True, yaxis3_showticklabels=True, yaxis4_showticklabels=True, yaxis5_showticklabels=True,  yaxis6_showticklabels=True,  yaxis7_showticklabels=True,  yaxis8_showticklabels=True)


fig.show()
plot(fig, filename='Plots/PLS/Fig 7 - PLS metric for all samples.html', auto_open=False)

1 1
---Unicycler
---SPAdes
---SKESA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
2 1
---Unicycler
---SPAdes
---SKESA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
3 1
---Unicycler
---SPAdes
---SKESA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
4 1
---Unicycler
---SPAdes
---SKESA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
1 2
---Unicycler
---SPAdes
---SKESA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
2 2
---Unicycler
---SPAdes
---SKESA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
3 2
---Unicycler
---SPAdes
---SKESA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
4 2
---Unicycler
---SPAdes
---SKESA
---metaSPAdes
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline


'Plots/PLS/Fig 7 - PLS metric for all samples.html'