# LMAS Assembler Computational Performance Metrics Analysis

Different software, implementing distinct *de novo* assembly algorithms, have distinct computational requirements. As such, computational statistics were registered for each assembler.

The following metrics are analysed for all samples for each assembler:
- **Avg Time:** Average run-time formatted as “hour:minute:second”
- **CPU/Hour:** Average amount of time, in hours, of CPU usage by an assembler. Obtained CPU load from the number of CPUs and their usage percentage. 
- **Max Memory (GB):** Maximum peak memory usage by the assembler.
- **Average Read (GB):** Average data size read from disk by the assembler.
- **Average Write (GB):** Average data size written to disk by the assembler.


## Imports

In [59]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np
import re

## Global variables

In [9]:
METRICS_COLUMNS_GLOBAL = ['run','assembler', 'avgTime', 'cpus', 'max_rss', 'avgRead', 'avgWrite']
METRICS_COLUMNS_PER_SAMPLE = ['sample', 'assembler', 'run', 'cpus', 'realtime', 'rss', 'rchar', 'wchar']
ASSEMBLER_PROCESS_LIST = ["BCALM2", "GATBMINIAPIPELINE", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMIZER", "IDBA"]
PROCESS_TO_NAME = {"BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT", 
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMIZER": "VelvetOptimizer",
                   "IDBA": "IDBA-UD"}

## Util functions

In [70]:
def convert_resource_to_gb(value):
    if 'GB' in value:
        return float(value.replace('GB',''))
    elif 'MB' in value:
        return float(value.replace('MB', ''))/1000
    
def _cpu_load_parser(cpus, cpu_per, t):
    """Parses the cpu load from the number of cpus and its usage
    percentage and returns the cpu/hour measure
    Parameters
    ----------
    cpus : str
        Number of cpus allocated.
    cpu_per : str
        Percentage of cpu load measured (e.g.: 200,5%).
    t : str
        The time string can be something like '20s', '1m30s' or '300ms'.
    """
    try:
        _cpus = float(cpus)
        _cpu_per = float(cpu_per.replace(",", ".").replace("%", ""))
        hours = _hms(t) / 60 / 24

        return ((_cpu_per / (100 * _cpus)) * _cpus) * hours

    except ValueError as e:
        return 0

def _hms(s):
    """Converts a hms string into seconds.
    Parameters
    ----------
    s : str
        The hms string can be something like '20s', '1m30s' or '300ms'.
    Returns
    -------
    float
        Time in seconds.
    """

    if s == "-":
        return 0

    if s.endswith("ms"):
        return float(s.rstrip("ms")) / 1000

    fields = list(map(float, re.split("[dhms]", s)[:-1]))
    if len(fields) == 4:
        return fields[0] * 24 * 3600 + fields[1] * 3600 + fields[2] * 60 + \
            fields[3]
    if len(fields) == 3:
        return fields[0] * 3600 + fields[1] * 60 + fields[2]
    elif len(fields) == 2:
        return fields[0] * 60 + fields[1]
    else:
        return fields[0]

def _size_coverter(s):
    """Converts size string into megabytes
    Parameters
    ----------
    s : str
        The size string can be '30KB', '20MB' or '1GB'
    Returns
    -------
    float
        With the size in bytes
    """

    if s.upper().endswith("KB"):
        return float(s.rstrip("KB")) / 1024

    elif s.upper().endswith(" B"):
        return float(s.rstrip("B")) / 1024 / 1024

    elif s.upper().endswith("MB"):
        return float(s.rstrip("MB"))

    elif s.upper().endswith("GB"):
        return float(s.rstrip("GB")) * 1024

    elif s.upper().endswith("TB"):
        return float(s.rstrip("TB")) * 1024 * 1024

    else:
        return float(0)

def _size_compress(s):
    """Shortens a megabytes string.
    """

    if s / 1024 > 1:
        return "{}GB".format(round(s / 1024, 1))
    else:
        return "{}MB".format(s)

## Global pipeline performance statistics

### Load Data

In [11]:
report_glob = glob.glob('../Results/*/report/performance_metadata.json')
global_pipeline_stats_df = pd.DataFrame(columns=METRICS_COLUMNS_GLOBAL)

for pipeline_report_file in report_glob:
    report_file_name = pipeline_report_file.split('/')[-1]
    stats_run = pipeline_report_file.split('/')[-3]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))
    
    with open(pipeline_report_file) as _fh:
            _json = json.load(_fh)
            for _stats in _json:
                global_pipeline_stats_df = global_pipeline_stats_df.append({'run': stats_run,
                                                                           'assembler': PROCESS_TO_NAME[_stats['assembler']],
                                                                           'avgTime': _stats['avgTime'],
                                                                           'cpus': _stats['cpus'],
                                                                           'max_rss': convert_resource_to_gb(_stats['max_rss']),
                                                                           'avgRead': convert_resource_to_gb(_stats['avgRead']),
                                                                           'avgWrite': convert_resource_to_gb(_stats['avgWrite'])}, 
                                                                           ignore_index=True)

Processing performance_metadata.json data from run1...


### Plot  Mean Statistics

In [12]:
fig_global = make_subplots(rows=2, cols=2, shared_xaxes=True, x_title="Assembler", 
                           subplot_titles=('CPU Usage', 'Max Memory', 'Average Read', 'Average Write'))
row_coord = 1
col_coord = 1

for column in global_pipeline_stats_df.columns:
    if column not in ['run', 'assembler', 'avgTime']:
        fig_global.add_trace(go.Violin(y=global_pipeline_stats_df[column], box_visible=True, line_color='black',
                               meanline_visible=True, fillcolor='lightseagreen', opacity=0.6,
                               x=global_pipeline_stats_df['assembler']),
                      row=row_coord, col=col_coord)
        fig_global.add_trace(go.Scatter(y=[global_pipeline_stats_df[column].mean()]*len(global_pipeline_stats_df['assembler']), 
                                        x=global_pipeline_stats_df['assembler'], mode='lines',
                                        line=dict(color="crimson"), opacity=0.6),row=row_coord, col=col_coord)
        if col_coord == 2:
            col_coord = 1
            row_coord += 1
        else:
            col_coord +=1

fig_global.update_layout(showlegend=False,plot_bgcolor='rgb(255,255,255)', title_text="Global Computational Performance Metrics")
# grid
fig_global['layout']['xaxis']['gridcolor']='#DCDCDC'
fig_global['layout']['xaxis2']['gridcolor']='#DCDCDC'
fig_global['layout']['xaxis3']['gridcolor']='#DCDCDC'
fig_global['layout']['xaxis4']['gridcolor']='#DCDCDC'
# y-axis legends
fig_global['layout']['yaxis']['title']='CPU/hour'
fig_global['layout']['yaxis2']['title']='GB'
fig_global['layout']['yaxis3']['title']='GB'
fig_global['layout']['yaxis4']['title']='GB'

fig_global.show()

In [47]:
timeDF=round(pd.to_timedelta(global_pipeline_stats_df['avgTime'].str.strip())/ np.timedelta64(1, 'h'), 2)

fig_time = go.Figure()
fig_time.add_trace(go.Bar(y=timeDF, x=global_pipeline_stats_df['assembler'], text=timeDF,textposition='outside', 
                          showlegend=False, marker_color='darkgray'))
fig_time.add_trace(go.Scatter(y=[timeDF.mean()]*len(global_pipeline_stats_df['assembler']), 
                              x=global_pipeline_stats_df['assembler'], mode='lines',
                              line=dict(color="crimson"), opacity=0.6, name='Mean'))

fig_time.update_layout(showlegend=True, plot_bgcolor='rgb(255,255,255)', title_text="Average Run Time")
fig_time['layout']['xaxis']['gridcolor']='#DCDCDC'
fig_time['layout']['yaxis']['title']='Hours'

fig_time.show()

In [83]:
pipeline_stats_glob = glob.glob('../Results/*/pipeline_stats.txt')
pipeline_stats_df = pd.DataFrame(columns=METRICS_COLUMNS_PER_SAMPLE)
METRICS_COLUMNS_PER_SAMPLE = ['sample', 'assembler', 'run', 'cpus', 'realtime', 'rss', 'rchar', 'wchar']


for statsfile in pipeline_stats_glob:
    print(statsfile)
    stats_file_name = statsfile.split('/')[-1]
    stats_run = statsfile.split('/')[-2]
    print('Processing {0} data from {1}...'.format(stats_file_name, stats_run))
    with open(statsfile) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        for row in tsvreader:
            if row[2] in ASSEMBLER_PROCESS_LIST:
                pipeline_stats_df = pipeline_stats_df.append({'sample': row[3],
                                                              'assembler': PROCESS_TO_NAME[row[2]],
                                                              'run': stats_run,
                                                              "cpus": _cpu_load_parser(row[8], row[15], row[13]),
                                                              "realtime": _hms(row[13]),
                                                              "rss": convert_resource_to_gb(_size_compress(_size_coverter(row[17]))),
                                                              "rchar": convert_resource_to_gb(_size_compress(_size_coverter(row[19]))),
                                                              "wchar": convert_resource_to_gb(_size_compress(_size_coverter(row[20])))},
                                                             ignore_index=True)

log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EHS', 'LNN', 'ERR2984773']
pipeline_stats_df['distribution'] = np.where(pipeline_stats_df['sample'].isin(log_distributed), 'Log', 'Even')
display(pipeline_stats_df)               


../Results/run1/pipeline_stats.txt
Processing pipeline_stats.txt data from run1...


Unnamed: 0,sample,assembler,run,cpus,realtime,rss,rchar,wchar,distribution
0,LHS,GATBMiniaPipeline,run1,53.933894,16083.0,7.8,207.6,98.7,Log
1,ENN,MEGAHIT,run1,3.291422,1438.0,1.7,16.5,7.0,Even
2,EHS,MINIA,run1,0.749972,245.0,5.0,7.7,3.8,Even
3,EHS,VelvetOptimizer,run1,13.090128,9032.0,16.0,304.5,39.9,Even
4,EHS,GATBMiniaPipeline,run1,23.609687,6450.0,6.9,65.6,26.5,Even
...,...,...,...,...,...,...,...,...,...
65,LNN,GATBMiniaPipeline,run1,33.229962,9211.0,7.6,137.5,81.8,Log
66,ERR2935805,VelvetOptimizer,run1,6.025556,2992.0,48.0,156.3,65.3,Log
67,ENN,GATBMiniaPipeline,run1,19.923322,4466.0,6.9,53.0,23.7,Even
68,LNN,GATBMiniaPipeline,run1,33.476038,9189.0,7.6,139.8,81.7,Log


In [90]:
fig_per_sample = make_subplots(rows=2, cols=2, shared_xaxes=True, x_title="Assembler", 
                           subplot_titles=('CPU', 'Memory', 'Read', 'Write'))
row_coord = 1
col_coord = 1

for column in ['cpus', 'rss', 'rchar', 'wchar']:
    showlegend = True if (row_coord == 1 and col_coord == 1) else False

    #log
    fig_per_sample.add_trace(go.Violin(y=pipeline_stats_df[column][pipeline_stats_df['distribution'] == 'Log'],
                                    x=pipeline_stats_df['assembler'][pipeline_stats_df['distribution'] == 'Log'],
                                    legendgroup='Log', scalegroup='Log', name='Log', box_visible=True, line_color='black',
                                    meanline_visible=True, fillcolor='lightseagreen', opacity=0.6, side='negative',
                                    showlegend=showlegend, spanmode='hard'),
                          row=row_coord, col=col_coord)
    
    #even
    fig_per_sample.add_trace(go.Violin(y=pipeline_stats_df[column][pipeline_stats_df['distribution'] == 'Even'],
                                    x=pipeline_stats_df['assembler'][pipeline_stats_df['distribution'] == 'Even'],
                                    legendgroup='Even', scalegroup='Even', name='Even', box_visible=True, line_color='black',
                                    meanline_visible=True, fillcolor='orange', opacity=0.6, side='positive',
                                    showlegend=showlegend, spanmode='hard'), 
                          row=row_coord, col=col_coord)

    
    fig_per_sample.add_trace(go.Scatter(y=[pipeline_stats_df[column].mean()]*len(pipeline_stats_df['assembler']), 
                                    x=pipeline_stats_df['assembler'], mode='lines', name='mean',
                                    line=dict(color="crimson"), opacity=0.6, showlegend=showlegend),
                          row=row_coord, col=col_coord)
    if col_coord == 2:
        col_coord = 1
        row_coord += 1
    else:
        col_coord +=1

fig_per_sample.update_layout(plot_bgcolor='rgb(255,255,255)', title_text="Global Computational Performance Metrics")
# grid
fig_per_sample['layout']['xaxis']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis2']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis3']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis4']['gridcolor']='#DCDCDC'
# y-axis legends
fig_per_sample['layout']['yaxis']['title']='CPU/hour'
fig_per_sample['layout']['yaxis2']['title']='GB'
fig_per_sample['layout']['yaxis3']['title']='GB'
fig_per_sample['layout']['yaxis4']['title']='GB'

fig_per_sample.update_layout(violingap=0, violinmode='overlay')


fig_per_sample.show()

In [93]:
round(pd.to_timedelta(pipeline_stats_df['realtime'])/ np.timedelta64(1, 'h'), 2)

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
     ... 
65    0.0
66    0.0
67    0.0
68    0.0
69    0.0
Name: realtime, Length: 70, dtype: float64

In [None]:
timeDF=round(pd.to_timedelta(pipeline_stats_df['realtime'].str.strip())/ np.timedelta64(1, 'h'), 2)

fig_time = go.Figure()
fig_time.add_trace(go.Bar(y=timeDF, x=global_pipeline_stats_df['assembler'], text=timeDF,textposition='outside', 
                          showlegend=False, marker_color='darkgray'))
fig_time.add_trace(go.Scatter(y=[timeDF.mean()]*len(global_pipeline_stats_df['assembler']), 
                              x=global_pipeline_stats_df['assembler'], mode='lines',
                              line=dict(color="crimson"), opacity=0.6, name='Mean'))

fig_time.update_layout(showlegend=True, plot_bgcolor='rgb(255,255,255)', title_text="Average Run Time")
fig_time['layout']['xaxis']['gridcolor']='#DCDCDC'
fig_time['layout']['yaxis']['title']='Hours'

fig_time.show()


fig_time_delta = go.Figure()

fig_misassembly.add_trace(go.Violin(y=bacillus_subtillis_df['misassemblies'][bacillus_subtillis_df['distribution'] == 'Log'], 
                                    x=bacillus_subtillis_df['assembler'][bacillus_subtillis_df['distribution'] == 'Log'],
                                    box_visible=True, line_color='black',legendgroup='Log', scalegroup='Log', name='Log',
                                    meanline_visible=True, fillcolor='lightseagreen', opacity=0.6, side='negative',
                                    showlegend=True, spanmode='hard'))
fig_misassembly.add_trace(go.Violin(y=bacillus_subtillis_df['misassemblies'][bacillus_subtillis_df['distribution'] == 'Even'], 
                                    x=bacillus_subtillis_df['assembler'][bacillus_subtillis_df['distribution'] == 'Even'],
                                    box_visible=True, line_color='black',legendgroup='Even', scalegroup='Even', name='Even',
                                    meanline_visible=True, fillcolor='orange', opacity=0.6, side='positive',
                                    showlegend=True, spanmode='hard'))
fig_misassembly.add_trace(go.Scatter(y=[bacillus_subtillis_df['misassemblies'].mean()]*len(bacillus_subtillis_df['assembler']), 
                                     x=bacillus_subtillis_df['assembler'], mode='lines',name='mean',
                                     line=dict(color="crimson"), opacity=0.6))



fig_misassembly.update_layout(showlegend=True,plot_bgcolor='rgb(255,255,255)', title_text="Misassemblies for Bacillus Subtillis")
# grid
fig_misassembly['layout']['xaxis']['gridcolor']='#DCDCDC'

fig_misassembly.update_layout(violingap=0, violinmode='overlay')

fig_misassembly.show()