# LMAS PLS Metrics Analysis

In [4]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np

In [5]:
PLS_files = glob.glob('../Results/*/*/results/*/stats/*phred.csv')
PLS_df = pd.DataFrame(columns=['run', 'Sample', 'Assembler','Reference','Contig','Contig Length','Phred Quality Score'])

for PLS_file in PLS_files:
    report_file_name = PLS_file.split('/')[-1]
    stats_run = PLS_file.split('/')[3]
    sample_name = report_file_name.split('_')[0]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))
    
    with open(PLS_file) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter=",")
        next(tsvreader, None)
        for row in tsvreader:
            PLS_df = PLS_df.append({'run': stats_run,
                                    'Assembler': row[1],
                                    'Sample': sample_name,
                                    'Reference': row[2],
                                    'Contig': row[3],
                                    'Contig Length': row[4],
                                    'Phred Quality Score': row[5]}, ignore_index=True)
display(PLS_df)

Processing ENN_Unicycler_phred.csv data from run2...
Processing ENN_ABySS_phred.csv data from run2...
Processing ENN_MetaHipMer2_phred.csv data from run2...
Processing ENN_MEGAHIT_phred.csv data from run2...
Processing ENN_metaSPAdes_phred.csv data from run2...
Processing ENN_GATBMiniaPipeline_phred.csv data from run2...
Processing ENN_MINIA_phred.csv data from run2...
Processing ENN_SKESA_phred.csv data from run2...
Processing ENN_IDBA-UD_phred.csv data from run2...
Processing ENN_VelvetOptimiser_phred.csv data from run2...
Processing ENN_BCALM2_phred.csv data from run2...
Processing ENN_SPAdes_phred.csv data from run2...
Processing EMS_MetaHipMer2_phred.csv data from run2...
Processing EMS_ABySS_phred.csv data from run2...
Processing EMS_MINIA_phred.csv data from run2...
Processing EMS_BCALM2_phred.csv data from run2...
Processing EMS_metaSPAdes_phred.csv data from run2...
Processing EMS_SPAdes_phred.csv data from run2...
Processing EMS_Unicycler_phred.csv data from run2...
Processin

Unnamed: 0,run,Sample,Assembler,Reference,Contig,Contig Length,Phred Quality Score
0,run2,ENN,Unicycler,Bacillus_subtilis,3,768274,60
1,run2,ENN,Unicycler,Bacillus_subtilis,9,428612,60
2,run2,ENN,Unicycler,Bacillus_subtilis,16,343060,60
3,run2,ENN,Unicycler,Bacillus_subtilis,19,314939,60
4,run2,ENN,Unicycler,Bacillus_subtilis,21,294271,60
...,...,...,...,...,...,...,...
174725,run1,LNN,GATBMiniaPipeline,Pseudomonas_aeruginosa,688,106963,60
174726,run1,LNN,GATBMiniaPipeline,Pseudomonas_aeruginosa,691,462106,60
174727,run1,LNN,GATBMiniaPipeline,Salmonella_enterica,87,1171,60
174728,run1,LNN,GATBMiniaPipeline,Salmonella_enterica,97,1621,29.08753019184531


In [6]:
PLS_df.Reference.unique()

array(['Bacillus_subtilis', 'Enterococcus_faecalis', 'Escherichia_coli',
       'Lactobacillus_fermentum', 'Listeria_monocytogenes',
       'Pseudomonas_aeruginosa', 'Salmonella_enterica',
       'Staphylococcus_aureus', 'Staphylococcus_aureus_plasmid1',
       'Staphylococcus_aureus_plasmid2', 'Staphylococcus_aureus_plasmid3',
       'Escherichia_coli_plasmid'], dtype=object)

In [7]:
PLS_df = PLS_df.drop_duplicates(subset=['Contig Length', 'Phred Quality Score'])

In [8]:
PLS_df

Unnamed: 0,run,Sample,Assembler,Reference,Contig,Contig Length,Phred Quality Score
0,run2,ENN,Unicycler,Bacillus_subtilis,3,768274,60
1,run2,ENN,Unicycler,Bacillus_subtilis,9,428612,60
2,run2,ENN,Unicycler,Bacillus_subtilis,16,343060,60
3,run2,ENN,Unicycler,Bacillus_subtilis,19,314939,60
4,run2,ENN,Unicycler,Bacillus_subtilis,21,294271,60
...,...,...,...,...,...,...,...
174716,run1,LNN,GATBMiniaPipeline,Pseudomonas_aeruginosa,665,204663,60
174721,run1,LNN,GATBMiniaPipeline,Pseudomonas_aeruginosa,680,32608,60
174722,run1,LNN,GATBMiniaPipeline,Pseudomonas_aeruginosa,683,118879,60
174723,run1,LNN,GATBMiniaPipeline,Pseudomonas_aeruginosa,685,82860,60


In [9]:
COLOURS = ['#5876c8', '#58AEC8', '#009392', '#39B185', '#9CCB86', '#E9E29C', '#EEB479', '#E88471', '#CF597E', '#a54765', '#a42a2a', '#835221']

In [15]:
fig=make_subplots(rows=3, cols=4, subplot_titles=[x.replace('_', ' ') for x in sorted(PLS_df.Reference.unique(), key=lambda v: v.upper(), reverse=False)],
                 shared_yaxes=True, shared_xaxes=True, x_title="Contig size", y_title="Score")
row=1
col=1
for reference in sorted(PLS_df.Reference.unique(), key=lambda v: v.upper(), reverse=False):
    i=0
    print(row, col)
    for assembler in sorted(PLS_df['Assembler'].unique(), key=lambda v: v.upper(), reverse=True):
        print('---' + assembler)
        showlegend=True if col==1 and row==1 else False

        fig.add_trace(go.Scatter(y=PLS_df['Phred Quality Score'][(PLS_df['Reference'] == reference) &
                                                                         (PLS_df['Assembler'] == assembler)],
                                          x=PLS_df['Contig Length'][(PLS_df['Reference'] == reference) &
                                                                     (PLS_df['Assembler'] == assembler)],
                                          opacity=0.6, mode='markers', showlegend=False, name=assembler,
                                          marker_color=COLOURS[i]), 
                              row=row, col=col)
        i+=1
    if col == 4:
        col = 1
        row += 1
    else:
        col += 1

for i in fig['layout']['annotations']:
    i['font']['size'] = 12
fig.update_layout(legend=dict(
    orientation="h",
    y=-0.1,
    x=0
))

fig.update_layout(plot_bgcolor='rgb(255,255,255)', title="Pls Metric per reference replicon")
fig.update_xaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', rangemode='tozero')
fig.update_yaxes(showline=True, linewidth=1, linecolor='#DCDCDC', gridcolor='#DCDCDC', range=[0, 70])

fig.show()
plot(fig, filename='Plots/PLS/all_samples.html', auto_open=False)

1 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
1 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
1 3
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
1 4
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
2 1
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
2 2
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---metaSPAdes
---MetaHipMer2
---MEGAHIT
---IDBA-UD
---GATBMiniaPipeline
---BCALM2
---ABySS
2 3
---VelvetOptimiser
---Unicycler
---SPAdes
---SKESA
---MINIA
---met

'Plots/PLS/all_samples.html'

In [13]:
COLOURS = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
           '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#ebdb75', '#b15928']

for reference in PLS_df.Reference.unique():
    fig_PLS = make_subplots(rows=3, cols=2, shared_xaxes=True, x_title="Contig size", 
                                shared_yaxes=True, y_title='PLS',
                                subplot_titles=('LNN', 'ENN', 'LHS', 'EMS', 'ERR2935805', 'ERR2984773'))
    row_coord = 1
    for sample in ['LNN', 'LHS', 'ERR2935805']:
        i=0
        for assembler in sorted(PLS_df['Assembler'].unique(), key=lambda v: v.upper()):
            fig_PLS.add_trace(go.Scatter(y=PLS_df['Phred Quality Score'][(PLS_df['Reference'] == reference) &
                                                                         (PLS_df['Assembler'] == assembler) &
                                                                         (PLS_df['Sample'] == sample)],
                                          x=PLS_df['Contig Length'][(PLS_df['Reference'] == reference) &
                                                                     (PLS_df['Assembler'] == assembler) &
                                                                     (PLS_df['Sample'] == sample)],
                                          opacity=0.6, mode='markers', showlegend=False, name=assembler,
                                          marker_color=COLOURS[i]), 
                              row=row_coord, col=1)
            i +=1
        row_coord += 1

    row_coord = 1
    for sample in ['ENN', 'EMS', 'ERR2984773']:
        i=0
        showlegend = True if (row_coord == 1) else False
        for assembler in sorted(PLS_df['Assembler'].unique(), key=lambda v: v.upper()):
            fig_PLS.add_trace(go.Scatter(y=PLS_df['Phred Quality Score'][(PLS_df['Reference'] == reference) &
                                                                         (PLS_df['Assembler'] == assembler) &
                                                                         (PLS_df['Sample'] == sample)],
                                           x=PLS_df['Contig Length'][(PLS_df['Reference'] == reference) &
                                                                     (PLS_df['Assembler'] == assembler) &
                                                                     (PLS_df['Sample'] == sample)],
                                           opacity=0.6, mode='markers', showlegend=showlegend, name=assembler,
                                        marker_color=COLOURS[i]), 
                              row=row_coord, col=2)
            i += 1
        row_coord += 1


    fig_PLS.update_layout(plot_bgcolor='rgb(255,255,255)', title_text="PLS Metrics for {}".format(reference.replace('_', ' ')))
    # grid
    fig_PLS['layout']['xaxis']['gridcolor']='#DCDCDC'
    fig_PLS['layout']['xaxis2']['gridcolor']='#DCDCDC'
    fig_PLS['layout']['xaxis3']['gridcolor']='#DCDCDC'
    fig_PLS['layout']['xaxis4']['gridcolor']='#DCDCDC'

    fig_PLS.show()
    plot(fig_PLS, filename='PLS Metric - {}.html'.format(reference.replace('_', ' ')), auto_open=False)