# LMAS PLS Metrics Analysis

In [1]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

In [21]:
PLS_files = glob.glob('../Results/*/results/*/stats/*phred.csv')
PLS_df = pd.DataFrame(columns=['run', 'Sample', 'Assembler','Reference','Contig','Contig Length','Phred Quality Score'])

for PLS_file in PLS_files:
    report_file_name = PLS_file.split('/')[-1]
    stats_run = PLS_file.split('/')[2]
    sample_name = report_file_name.split('_')[0]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))
    
    with open(PLS_file) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter=",")
        next(tsvreader, None)
        for row in tsvreader:
            PLS_df = PLS_df.append({'run': stats_run,
                                    'Assembler': row[1],
                                    'Sample': sample_name,
                                    'Reference': row[2],
                                    'Contig': row[3],
                                    'Contig Length': row[4],
                                    'Phred Quality Score': row[5]}, ignore_index=True)
display(PLS_df)

Processing LNN_SPAdes_phred.csv data from run1...
Processing LNN_BCALM2_phred.csv data from run1...
Processing LNN_MEGAHIT_phred.csv data from run1...
Processing LNN_SKESA_phred.csv data from run1...
Processing LNN_metaSPAdes_phred.csv data from run1...
Processing LNN_Unicycler_phred.csv data from run1...
Processing LNN_IDBA-UD_phred.csv data from run1...
Processing LNN_MINIA_phred.csv data from run1...
Processing ERR2935805_SPAdes_phred.csv data from run1...
Processing ERR2935805_SKESA_phred.csv data from run1...
Processing ERR2935805_BCALM2_phred.csv data from run1...
Processing ERR2935805_metaSPAdes_phred.csv data from run1...
Processing ERR2935805_Unicycler_phred.csv data from run1...
Processing ERR2935805_GATBMiniaPipeline_phred.csv data from run1...
Processing ERR2935805_MINIA_phred.csv data from run1...
Processing ERR2935805_MEGAHIT_phred.csv data from run1...
Processing ERR2935805_IDBA-UD_phred.csv data from run1...
Processing LHS_GATBMiniaPipeline_phred.csv data from run1...
P

Unnamed: 0,run,Sample,Assembler,Reference,Contig,Contig Length,Phred Quality Score
0,run1,LNN,SPAdes,Bacillus_subtilis,NODE_2_length_708415_cov_22.718772,708415,60
1,run1,LNN,SPAdes,Bacillus_subtilis,NODE_10_length_367881_cov_22.637687,367881,60
2,run1,LNN,SPAdes,Bacillus_subtilis,NODE_14_length_315046_cov_22.752745,315046,60
3,run1,LNN,SPAdes,Bacillus_subtilis,NODE_16_length_294381_cov_22.729273,294381,54.68909776243896
4,run1,LNN,SPAdes,Bacillus_subtilis,NODE_17_length_269110_cov_22.891680,269110,60
...,...,...,...,...,...,...,...
76623,run1,ERR2984773,MEGAHIT,Staphylococcus_aureus,k141_14955,77242,41.888835059518506
76624,run1,ERR2984773,MEGAHIT,Staphylococcus_aureus,k141_15530,415771,60.0
76625,run1,ERR2984773,MEGAHIT,Staphylococcus_aureus_plasmid1,k141_15633,6795,12.444784960496019
76626,run1,ERR2984773,MEGAHIT,Staphylococcus_aureus_plasmid2,k141_20715,2357,60.0


In [22]:
PLS_df.Reference.unique()

array(['Bacillus_subtilis', 'Enterococcus_faecalis',
       'Escherichia_coli_plasmid', 'Escherichia_coli',
       'Lactobacillus_fermentum', 'Listeria_monocytogenes',
       'Pseudomonas_aeruginosa', 'Salmonella_enterica',
       'Staphylococcus_aureus', 'Staphylococcus_aureus_plasmid1',
       'Staphylococcus_aureus_plasmid2', 'Staphylococcus_aureus_plasmid3'],
      dtype=object)

In [41]:
COLOURS = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
           '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#ebdb75', '#b15928']

for reference in PLS_df.Reference.unique():
    fig_PLS = make_subplots(rows=3, cols=2, shared_xaxes=True, x_title="Contig size", 
                                shared_yaxes=True, y_title='PLS',
                                subplot_titles=('LNN', 'ENN', 'LHS', 'EHS', 'ERR2935805', 'ERR2984773'))
    row_coord = 1
    for sample in ['LNN', 'LHS', 'ERR2935805']:
        i=0
        for assembler in sorted(PLS_df['Assembler'].unique(), key=lambda v: v.upper()):
            fig_PLS.add_trace(go.Scatter(y=PLS_df['Phred Quality Score'][(PLS_df['Reference'] == reference) &
                                                                         (PLS_df['Assembler'] == assembler) &
                                                                         (PLS_df['Sample'] == sample)],
                                          x=PLS_df['Contig Length'][(PLS_df['Reference'] == reference) &
                                                                     (PLS_df['Assembler'] == assembler) &
                                                                     (PLS_df['Sample'] == sample)],
                                          opacity=0.6, mode='markers', showlegend=False, name=assembler,
                                          marker_color=COLOURS[i]), 
                              row=row_coord, col=1)
            i +=1
        row_coord += 1

    row_coord = 1
    for sample in ['ENN', 'EHS', 'ERR2984773']:
        i=0
        showlegend = True if (row_coord == 1) else False
        for assembler in sorted(PLS_df['Assembler'].unique(), key=lambda v: v.upper()):
            fig_PLS.add_trace(go.Scatter(y=PLS_df['Phred Quality Score'][(PLS_df['Reference'] == reference) &
                                                                         (PLS_df['Assembler'] == assembler) &
                                                                         (PLS_df['Sample'] == sample)],
                                           x=PLS_df['Contig Length'][(PLS_df['Reference'] == reference) &
                                                                     (PLS_df['Assembler'] == assembler) &
                                                                     (PLS_df['Sample'] == sample)],
                                           opacity=0.6, mode='markers', showlegend=showlegend, name=assembler,
                                        marker_color=COLOURS[i]), 
                              row=row_coord, col=2)
            i += 1
        row_coord += 1


    fig_PLS.update_layout(plot_bgcolor='rgb(255,255,255)', title_text="PLS Metrics for {}".format(reference.replace('_', ' ')))
    # grid
    fig_PLS['layout']['xaxis']['gridcolor']='#DCDCDC'
    fig_PLS['layout']['xaxis2']['gridcolor']='#DCDCDC'
    fig_PLS['layout']['xaxis3']['gridcolor']='#DCDCDC'
    fig_PLS['layout']['xaxis4']['gridcolor']='#DCDCDC'

    fig_PLS.show()
    plot(fig_PLS, filename='PLS Metric - {}.html'.format(reference.replace('_', ' ')), auto_open=False)