# LMAS Assembler Computational Performance Metrics Analysis

Different software, implementing distinct *de novo* assembly algorithms, have distinct computational requirements. As such, computational statistics were registered for each assembler.

The following metrics are analysed for all samples for each assembler:
- **Avg Time:** Average run-time formatted as “hour:minute:second”
- **CPU/Hour:** Average amount of time, in hours, of CPU usage by an assembler. Obtained CPU load from the number of CPUs and their usage percentage. 
- **Max Memory (GB):** Maximum peak memory usage by the assembler.
- **Average Read (GB):** Average data size read from disk by the assembler.
- **Average Write (GB):** Average data size written to disk by the assembler.


## Imports

In [1]:
import sys
from plotly.offline import plot
import glob
import fnmatch
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot
import json
import pandas as pd
from itertools import groupby
import csv
import numpy as np
import re

## Global variables

In [8]:
METRICS_COLUMNS_GLOBAL = ['run','assembler', 'avgTime', 'cpus', 'max_rss', 'avgRead', 'avgWrite']
METRICS_COLUMNS_PER_SAMPLE = ['sample', 'assembler', 'run', 'cpus', 'realtime', 'rss', 'rchar', 'wchar']
ASSEMBLER_PROCESS_LIST = ["ABYSS", "BCALM2", "GATBMINIAPIPELINE", "METAHIPMER2", "MINIA", "MEGAHIT", "METASPADES", "UNICYCLER", "SPADES",
                          "SKESA", "VELVETOPTIMISER", "IDBA"]
PROCESS_TO_NAME = {"ABYSS": "AbYSS",
                   "BCALM2": "BCALM2", 
                   "GATBMINIAPIPELINE": "GATBMiniaPipeline",
                   "MINIA": "MINIA",
                   "MEGAHIT": "MEGAHIT",
                   "METAHIPMER2": "MetaHipMer2",
                   "METASPADES": "metaSPAdes", 
                   "UNICYCLER": "Unicycler", 
                   "SPADES": "SPAdes",
                    "SKESA": "SKESA",
                   "VELVETOPTIMISER": "VelvetOptimiser",
                   "IDBA": "IDBA-UD"}
COLOURS = ['#004B93', "#009392", "#39B185", "#9CCB86", "#E9E29C", "#EEB479", "#E88471","#CF597E", 'darkgray']

## Util functions

In [5]:
def convert_resource_to_gb(value):
    if 'GB' in value:
        return float(value.replace('GB',''))
    elif 'MB' in value:
        return float(value.replace('MB', ''))/1000
    
def _cpu_load_parser(cpus, cpu_per, t):
    """Parses the cpu load from the number of cpus and its usage
    percentage and returns the cpu/hour measure
    Parameters
    ----------
    cpus : str
        Number of cpus allocated.
    cpu_per : str
        Percentage of cpu load measured (e.g.: 200,5%).
    t : str
        The time string can be something like '20s', '1m30s' or '300ms'.
    """
    try:
        _cpus = float(cpus)
        _cpu_per = float(cpu_per.replace(",", ".").replace("%", ""))
        hours = _hms(t) / 60 / 24

        return ((_cpu_per / (100 * _cpus)) * _cpus) * hours

    except ValueError as e:
        return 0

def _hms(s):
    """Converts a hms string into seconds.
    Parameters
    ----------
    s : str
        The hms string can be something like '20s', '1m30s' or '300ms'.
    Returns
    -------
    float
        Time in seconds.
    """

    if s == "-":
        return 0

    if s.endswith("ms"):
        return float(s.rstrip("ms")) / 1000

    fields = list(map(float, re.split("[dhms]", s)[:-1]))
    if len(fields) == 4:
        return fields[0] * 24 * 3600 + fields[1] * 3600 + fields[2] * 60 + \
            fields[3]
    if len(fields) == 3:
        return fields[0] * 3600 + fields[1] * 60 + fields[2]
    elif len(fields) == 2:
        return fields[0] * 60 + fields[1]
    else:
        return fields[0]

def _size_coverter(s):
    """Converts size string into megabytes
    Parameters
    ----------
    s : str
        The size string can be '30KB', '20MB' or '1GB'
    Returns
    -------
    float
        With the size in bytes
    """

    if s.upper().endswith("KB"):
        return float(s.rstrip("KB")) / 1024

    elif s.upper().endswith(" B"):
        return float(s.rstrip("B")) / 1024 / 1024

    elif s.upper().endswith("MB"):
        return float(s.rstrip("MB"))

    elif s.upper().endswith("GB"):
        return float(s.rstrip("GB")) * 1024

    elif s.upper().endswith("TB"):
        return float(s.rstrip("TB")) * 1024 * 1024

    else:
        return float(0)

def _size_compress(s):
    """Shortens a megabytes string.
    """

    if s / 1024 > 1:
        return "{}GB".format(round(s / 1024, 1))
    else:
        return "{}MB".format(s)

## Average pipeline performance statistics

### Load Data

In [9]:
report_glob = glob.glob('../Results/*/*/report/performance_metadata.json')
global_pipeline_stats_df = pd.DataFrame(columns=METRICS_COLUMNS_GLOBAL)

for pipeline_report_file in report_glob:
    report_file_name = pipeline_report_file.split('/')[-1]
    stats_run = pipeline_report_file.split('/')[-3]
    print('Processing {0} data from {1}...'.format(report_file_name, stats_run))
    
    with open(pipeline_report_file) as _fh:
            _json = json.load(_fh)
            for _stats in _json:
                global_pipeline_stats_df = global_pipeline_stats_df.append({'run': stats_run,
                                                                           'assembler': PROCESS_TO_NAME[_stats['assembler']],
                                                                           'avgTime': _stats['avgTime'],
                                                                           'cpus': _stats['cpus'],
                                                                           'max_rss': convert_resource_to_gb(_stats['max_rss']),
                                                                           'avgRead': convert_resource_to_gb(_stats['avgRead']),
                                                                           'avgWrite': convert_resource_to_gb(_stats['avgWrite'])}, 
                                                                           ignore_index=True)

Processing performance_metadata.json data from run1...
Processing performance_metadata.json data from run2...
Processing performance_metadata.json data from run3...


###  Plot Data

In [12]:

fig_global = make_subplots(rows=3, cols=2, shared_xaxes=True, x_title="Assembler", 
                           subplot_titles=('CPU Usage', 'Max Memory', 'Average Read', 'Average Write', 'Average Run Time'),
                           specs=[[{}, {}],[{}, {}],[{"colspan": 2}, None]])
row_coord = 1
col_coord = 1

for column in ['cpus', 'max_rss', 'avgRead', 'avgWrite']:
    fig_global.add_trace(go.Violin(y=global_pipeline_stats_df[column], box_visible=True, line_color='black',
                           meanline_visible=True, fillcolor='lightseagreen', opacity=0.6,
                           x=global_pipeline_stats_df['assembler']),
                  row=row_coord, col=col_coord)
    fig_global.add_trace(go.Scatter(y=[global_pipeline_stats_df[column].mean()]*len(global_pipeline_stats_df['assembler']), 
                                    x=global_pipeline_stats_df['assembler'], mode='lines',
                                    line=dict(color="crimson"), opacity=0.6),row=row_coord, col=col_coord)
    if col_coord == 2:
        col_coord = 1
        row_coord += 1
    else:
        col_coord +=1

TimeDFmean = pd.Series()
for run in global_pipeline_stats_df['run'].unique():
    print(run)
    timeDF=round(pd.to_timedelta(global_pipeline_stats_df['avgTime'][global_pipeline_stats_df['run'] == run].str.strip())/ np.timedelta64(1, 'h'), 2)

    fig_global.add_trace(go.Bar(y=timeDF, x=global_pipeline_stats_df['assembler'], text=timeDF,textposition='outside', 
                              showlegend=False, marker_color='darkgray', name=run),  row=3, col=1)
    TimeDFmean = pd.concat([TimeDFmean, timeDF], ignore_index=True)
    
fig_global.add_trace(go.Scatter(y=[TimeDFmean.mean()]*len(global_pipeline_stats_df['assembler']), 
                              x=global_pipeline_stats_df['assembler'], mode='lines',
                              line=dict(color="crimson"), opacity=0.6, name='Mean'), row=3, col=1)

fig_global.update_layout(showlegend=False,plot_bgcolor='rgb(255,255,255)', title_text="Average Computational Performance Metrics per Assembler",
                        barmode='group')
# grid
fig_global['layout']['xaxis']['gridcolor']='#DCDCDC'
fig_global['layout']['xaxis2']['gridcolor']='#DCDCDC'
fig_global['layout']['xaxis3']['gridcolor']='#DCDCDC'
fig_global['layout']['xaxis4']['gridcolor']='#DCDCDC'
fig_global['layout']['xaxis5']['gridcolor']='#DCDCDC'
# y-axis legends
fig_global['layout']['yaxis']['title']='CPU/hour'
fig_global['layout']['yaxis2']['title']='GB'
fig_global['layout']['yaxis3']['title']='GB'
fig_global['layout']['yaxis4']['title']='GB'
fig_global['layout']['yaxis5']['title']='Hours'
fig_global.update_xaxes(categoryorder='category ascending')

fig_global.show()

run1
run2
run3






In [13]:
plot(fig_global, filename='Plots/Performance/Average Computational Performance Metrics per Assembler.html', auto_open=False)

'Plots/Performance/Average Computational Performance Metrics per Assembler.html'

## Pipeline performance statistics per Sample

### Load Data

In [14]:
pipeline_stats_glob = glob.glob('../Results/*/*/pipeline_stats.txt')
pipeline_stats_df = pd.DataFrame(columns=METRICS_COLUMNS_PER_SAMPLE)
METRICS_COLUMNS_PER_SAMPLE = ['sample', 'assembler', 'run', 'cpus', 'realtime', 'rss', 'rchar', 'wchar']


for statsfile in pipeline_stats_glob:
    print(statsfile)
    stats_file_name = statsfile.split('/')[-1]
    stats_run = statsfile.split('/')[-2]
    print('Processing {0} data from {1}...'.format(stats_file_name, stats_run))
    with open(statsfile) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        for row in tsvreader:
            if row[2] in ASSEMBLER_PROCESS_LIST:
                pipeline_stats_df = pipeline_stats_df.append({'sample': row[3],
                                                              'assembler': PROCESS_TO_NAME[row[2]],
                                                              'run': stats_run,
                                                              "cpus": _cpu_load_parser(row[8], row[15], row[13]),
                                                              "realtime": _hms(row[13]),
                                                              "rss": convert_resource_to_gb(_size_compress(_size_coverter(row[17]))),
                                                              "rchar": convert_resource_to_gb(_size_compress(_size_coverter(row[19]))),
                                                              "wchar": convert_resource_to_gb(_size_compress(_size_coverter(row[20])))},
                                                             ignore_index=True)

log_distributed = ['LHS', 'LNN', 'ERR2935805']
even_distribution = ['EMS', 'LNN', 'ERR2984773']
pipeline_stats_df['distribution'] = np.where(pipeline_stats_df['sample'].isin(log_distributed), 'Log', 'Even')
pipeline_stats_df['time_hours']=round(pd.to_timedelta(pipeline_stats_df['realtime'], unit='S')/ np.timedelta64(1, 'h'), 2)
display(pipeline_stats_df)               


../Results/Even/run1/pipeline_stats.txt
Processing pipeline_stats.txt data from run1...
../Results/Even/run2/pipeline_stats.txt
Processing pipeline_stats.txt data from run2...
../Results/Even/run3/pipeline_stats.txt
Processing pipeline_stats.txt data from run3...


Unnamed: 0,sample,assembler,run,cpus,realtime,rss,rchar,wchar,distribution,time_hours
0,ENN,BCALM2,run1,0.560800,144.0,5.1,5.9,3.4,Even,0.04
1,ENN,MINIA,run1,0.616594,164.0,5.1,6.5,3.5,Even,0.05
2,EMS,BCALM2,run1,0.734722,230.0,6.7,8.5,4.3,Even,0.06
3,EMS,MINIA,run1,0.789094,206.0,6.6,9.1,4.4,Even,0.06
4,ERR2984773,BCALM2,run1,1.016652,327.0,4.7,16.2,6.8,Even,0.09
...,...,...,...,...,...,...,...,...,...,...
118,EMS,Unicycler,run3,35.119713,37797.0,6.3,46.8,281.9,Even,10.50
119,ERR2984773,Unicycler,run3,60.384424,68738.0,14.1,48.0,555.6,Even,19.09
120,ERR2984773,AbYSS,run3,0.000000,3955.0,0.0,0.0,0.0,Even,1.10
121,ERR2984773,AbYSS,run3,0.000000,3933.0,0.0,0.0,0.0,Even,1.09


In [15]:
pipeline_stats_df.to_csv("Tables/Resources per sample per run")

###  Plot Data

In [16]:
fig_per_sample = make_subplots(rows=3, cols=2, shared_xaxes=True, 
                               subplot_titles=('CPU', 'Memory', 'Read', 'Write', 'Run Time'),
                               specs=[[{}, {}],[{}, {}],[{"colspan": 2}, None]])
row_coord = 1
col_coord = 1

for column in ['cpus', 'rss', 'rchar', 'wchar']:
    showlegend = True if (row_coord == 1 and col_coord == 1) else False

    #log
    fig_per_sample.add_trace(go.Violin(y=pipeline_stats_df[column][pipeline_stats_df['distribution'] == 'Log'],
                                    x=pipeline_stats_df['assembler'][pipeline_stats_df['distribution'] == 'Log'],
                                    legendgroup='Log', scalegroup='Log', name='Log', box_visible=True, line_color='black',
                                    meanline_visible=True, fillcolor='lightseagreen', opacity=0.6, side='negative',
                                    showlegend=showlegend, spanmode='hard'),
                          row=row_coord, col=col_coord)
    
    #even
    fig_per_sample.add_trace(go.Violin(y=pipeline_stats_df[column][pipeline_stats_df['distribution'] == 'Even'],
                                    x=pipeline_stats_df['assembler'][pipeline_stats_df['distribution'] == 'Even'],
                                    legendgroup='Even', scalegroup='Even', name='Even', box_visible=True, line_color='black',
                                    meanline_visible=True, fillcolor='orange', opacity=0.6, side='positive',
                                    showlegend=showlegend, spanmode='hard'), 
                          row=row_coord, col=col_coord)

    
    fig_per_sample.add_trace(go.Scatter(y=[pipeline_stats_df[column].mean()]*len(pipeline_stats_df['assembler']), 
                                    x=pipeline_stats_df['assembler'], mode='lines', name='mean',
                                    line=dict(color="crimson"), opacity=0.6, showlegend=showlegend),
                          row=row_coord, col=col_coord)
    if col_coord == 2:
        col_coord = 1
        row_coord += 1
    else:
        col_coord +=1

fig_per_sample.add_trace(go.Violin(y=pipeline_stats_df['time_hours'][pipeline_stats_df['distribution'] == 'Log'],
                             x=pipeline_stats_df['assembler'][pipeline_stats_df['distribution'] == 'Log'],
                                legendgroup='Log', scalegroup='Log', name='Log', box_visible=True, line_color='black',
                                meanline_visible=True, fillcolor='lightseagreen', opacity=0.6, side='negative',
                                showlegend=False, spanmode='hard'),  row=3, col=1)
fig_per_sample.add_trace(go.Violin(y=pipeline_stats_df['time_hours'][pipeline_stats_df['distribution'] == 'Even'],
                                x=pipeline_stats_df['assembler'][pipeline_stats_df['distribution'] == 'Even'],
                                legendgroup='Even', scalegroup='Even', name='Even', box_visible=True, line_color='black',
                                meanline_visible=True, fillcolor='orange', opacity=0.6, side='positive',
                                showlegend=False, spanmode='hard'),  row=3, col=1)
fig_per_sample.add_trace(go.Scatter(y=[pipeline_stats_df['time_hours'].mean()]*len(global_pipeline_stats_df['assembler']), 
                              x=global_pipeline_stats_df['assembler'], mode='lines',
                              line=dict(color="crimson"), opacity=0.6, name='Mean', showlegend=False),  row=3, col=1)

fig_per_sample.update_layout(plot_bgcolor='rgb(255,255,255)', title_text="Computational Performance Metrics Distribution")
# grid
fig_per_sample['layout']['xaxis']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis2']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis3']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis4']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis5']['gridcolor']='#DCDCDC'
# y-axis legends
fig_per_sample['layout']['yaxis']['title']='CPU/hour'
fig_per_sample['layout']['yaxis2']['title']='GB'
fig_per_sample['layout']['yaxis3']['title']='GB'
fig_per_sample['layout']['yaxis4']['title']='GB'
fig_per_sample['layout']['yaxis5']['title']='Hours'

fig_per_sample.update_layout(violingap=0, violinmode='overlay')
fig_per_sample.update_xaxes(categoryorder='category ascending')


fig_per_sample.show()

In [17]:
plot(fig_per_sample, filename='Plots/Performance/Computational Performance Metrics Distribution.html', auto_open=False)

'Plots/Performance/Computational Performance Metrics Distribution.html'

In [18]:
fig_per_sample = make_subplots(rows=3, cols=2, shared_xaxes=True, 
                               subplot_titles=('CPU', 'Memory', 'Read', 'Write', 'Run Time'),
                               specs=[[{}, {}],[{}, {}],[{"colspan": 2}, None]])
row_coord = 1
col_coord = 1

for column in ['cpus', 'rss', 'rchar', 'wchar']:
    showlegend = True if (row_coord == 1 and col_coord == 1) else False
    i=0
    for sample in pipeline_stats_df['sample'].unique():

        fig_per_sample.add_trace(go.Violin(y=pipeline_stats_df[column][pipeline_stats_df['sample'] == sample],
                                        x=pipeline_stats_df['assembler'][pipeline_stats_df['sample'] == sample],
                                        legendgroup='Log', scalegroup='Log', name=sample, box_visible=True, line_color='black',
                                        meanline_visible=True, fillcolor=COLOURS[i], opacity=0.5,
                                        showlegend=showlegend, spanmode='hard'),
                              row=row_coord, col=col_coord)
        i+=1
    
    fig_per_sample.add_trace(go.Scatter(y=[pipeline_stats_df[column].mean()]*len(pipeline_stats_df['assembler']), 
                                    x=pipeline_stats_df['assembler'], mode='lines', name='mean',
                                    line=dict(color="crimson"), opacity=0.8, showlegend=showlegend),
                          row=row_coord, col=col_coord)
    if col_coord == 2:
        col_coord = 1
        row_coord += 1
    else:
        col_coord +=1

i=0
for sample in pipeline_stats_df['sample'].unique():
    fig_per_sample.add_trace(go.Violin(y=pipeline_stats_df['time_hours'][pipeline_stats_df['sample'] == sample],
                 x=pipeline_stats_df['assembler'][pipeline_stats_df['sample'] == sample],
                    box_visible=True, line_color='black',
                    meanline_visible=True, fillcolor=COLOURS[i], opacity=0.5,
                    showlegend=False, spanmode='hard'),  row=3, col=1)
    i+=1

fig_per_sample.add_trace(go.Scatter(y=[pipeline_stats_df['time_hours'].mean()]*len(global_pipeline_stats_df['assembler']), 
                              x=global_pipeline_stats_df['assembler'], mode='lines',
                              line=dict(color="crimson"), opacity=0.8, name='Mean', showlegend=False),  row=3, col=1)

fig_per_sample.update_layout(plot_bgcolor='rgb(255,255,255)', title_text="Computational Performance Metrics Distribution")
# grid
fig_per_sample['layout']['xaxis']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis2']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis3']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis4']['gridcolor']='#DCDCDC'
fig_per_sample['layout']['xaxis5']['gridcolor']='#DCDCDC'
# y-axis legends
fig_per_sample['layout']['yaxis']['title']='CPU/hour'
fig_per_sample['layout']['yaxis2']['title']='GB'
fig_per_sample['layout']['yaxis3']['title']='GB'
fig_per_sample['layout']['yaxis4']['title']='GB'
fig_per_sample['layout']['yaxis5']['title']='Hours'

fig_per_sample.update_layout(violingap=0, violinmode='overlay')
fig_per_sample.update_xaxes(categoryorder='category ascending')

fig_per_sample.show()
plot(fig_per_sample, filename='Plots/Performance/Per Sample Computational Performance Metrics Distribution.html', auto_open=False)

'Plots/Performance/Per Sample Computational Performance Metrics Distribution.html'