# Benchmarks of different BAM to FastQ tools


## Imports / Constants


In [271]:
import re
from math import ceil, floor

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import pyarrow.feather as feather
from hurry.filesize import alternative, size, verbose

# Fixed palette of seven hex colours; one entry is assigned to each tool type
# and reused as the bar colour in the plots.
# Presumably the "Color Universal Design" (CUD) palette, going by the name —
# TODO confirm the source of the hex values.
CUD_COLORS = (
    "#e69f00",  # orange
    "#56b4e9",  # sky-blue
    "#009e73",  # bluish-green
    "#f0e442",  # yellow
    "#0072b2",  # blue
    "#d55e00",  # vermilion
    "#cc79a7",  # reddish-purple
)


def readable_size(input) -> str:
    r"""Format a byte count as a siunitx ``\SI`` expression for LaTeX output.

    The value is first rendered by ``size`` with the ``verbose`` system
    (text such as ``"2 kilobytes"``) and then rewritten to the form
    ``\SI{2}{\kilo\byte}``.

    Args:
        input: Number of bytes; passed unchanged to ``size``.

    Returns:
        The rewritten string. Any part of the rendered text that does not
        match ``<digits> <prefix>byte(s)`` is left as ``size`` produced it.
    """

    def _to_siunitx(match):
        amount, prefix = match.groups()
        # Plain bytes carry no prefix. Always emitting "\" + prefix would
        # yield "\\byte" here, i.e. a LaTeX line break followed by the word
        # "byte" rather than the \byte unit macro.
        prefix_macro = f"\\{prefix}" if prefix else ""
        return f"\\SI{{{amount}}}{{{prefix_macro}\\byte}}"

    return re.sub(
        r"(\d+)\s+(\w*)bytes*", _to_siunitx, size(input, system=verbose)
    )


# Per-column formatters for ``describe()`` output on byte-valued columns,
# handed to ``Styler.format`` when the LaTeX tables are written: the count is
# printed as a whole number, every other statistic goes through
# ``readable_size``.
BYTE_DESCRIBE_MAP = {
    "count": "{:.0f}",
    "mean": readable_size,
    "std": readable_size,
    "min": readable_size,
    "25%": readable_size,
    "50%": readable_size,
    "75%": readable_size,
    "max": readable_size,
}


## Initial data parsing


In [272]:
# Load the raw benchmark records.
df = feather.read_feather("bam2fastqbenchmarks/results.df")

# Collapse the records to one row per run in a single aggregation pass:
# earliest start, latest completion, peak RSS, tool type and total I/O volume.
df_grouped = (
    df.groupby("run")
    .agg(
        start=("start", "min"),
        complete=("complete", "max"),
        rss=("rss", "max"),
        type=("type", "first"),
        read_bytes=("read_bytes", "sum"),
        write_bytes=("write_bytes", "sum"),
    )
    .reset_index()
)

# Span between first start and last completion, reported in minutes by the
# tables and plots; the / 60 / 1000 presumes millisecond timestamps —
# TODO confirm against the benchmark trace format.
df_grouped["duration"] = (df_grouped["complete"] - df_grouped["start"]) / 60 / 1000
df_grouped["rss_readable"] = df_grouped["rss"].apply(
    lambda x: size(x, system=alternative)
)

# Fix the column layout explicitly so the frame reads the same when displayed.
df_grouped = df_grouped[
    [
        "run",
        "start",
        "complete",
        "duration",
        "rss",
        "rss_readable",
        "type",
        "read_bytes",
        "write_bytes",
    ]
]

# Human-friendly tool names for tables and plot labels.
df_grouped = df_grouped.replace(
    {
        "samtoolsmultithread": "samtools multithread",
        "samtoolssinglethread": "samtools singlethread",
    }
)

# Tool types ordered from slowest to fastest run.
type_order = df_grouped.sort_values("duration", ascending=False)["type"].unique()

# Walk the palette backwards so the slowest type gets the highest palette index
# in use and the fastest type gets CUD_COLORS[0]. The modulo wraps around
# instead of raising IndexError when there are more types than colours.
type_to_color = {
    tooltype: CUD_COLORS[(len(type_order) - 1 - position) % len(CUD_COLORS)]
    for position, tooltype in enumerate(type_order)
}


## Write stats to file


In [273]:
from glob import escape


def _write_describe_table(statsfile, frame, column, caption, label, **format_kwargs):
    """Append one LaTeX table of per-tool summary statistics to *statsfile*.

    Args:
        statsfile: Open, writable text file the table is written to.
        frame: Per-run data frame containing a ``type`` column and *column*.
        column: Name of the column that is summarised with ``describe()``.
        caption: LaTeX caption of the table.
        label: LaTeX label of the table.
        **format_kwargs: Forwarded to ``Styler.format`` to control how the
            statistic cells are rendered.
    """
    # One row per tool type; the run count is not shown in the tables.
    summary = (
        frame.groupby("type")[column]
        .describe()
        .reset_index()
        .drop(columns=["count"])
    )
    # Escape the column headers for LaTeX (e.g. the "%" in "25%") and hide the
    # numeric row index.
    styler = (
        summary.style.format_index(axis=1, escape="latex")
        .format(**format_kwargs)
        .hide()
    )
    styler.to_latex(
        statsfile,
        position="hp",
        position_float="centering",
        hrules=True,
        caption=caption,
        label=label,
    )


# All four tables are written one after another into the same file.
with open("stats.txt", "w", encoding="utf-8") as statsfile:
    # NOTE(review): this label lacks the "q" the other labels have
    # ("bam2fastq..."); left unchanged because the LaTeX document may
    # reference it.
    _write_describe_table(
        statsfile,
        df_grouped,
        "duration",
        caption="Duration of BAM to FastQ conversion by tool in minutes",
        label="table:bam2fastduration",
        precision=2,
    )
    _write_describe_table(
        statsfile,
        df_grouped,
        "rss",
        caption="Memory usage of BAM to FastQ conversion by tool",
        label="table:bam2fastqmemory",
        formatter=BYTE_DESCRIBE_MAP,
    )
    _write_describe_table(
        statsfile,
        df_grouped,
        "read_bytes",
        caption="Amount of data read by BAM to FastQ conversion by tool",
        label="table:bam2fastqioread",
        formatter=BYTE_DESCRIBE_MAP,
    )
    _write_describe_table(
        statsfile,
        df_grouped,
        "write_bytes",
        caption="Amount of data written by BAM to FastQ conversion by tool",
        label="table:bam2fastqiowrite",
        formatter=BYTE_DESCRIBE_MAP,
    )


## Plot


In [274]:
# Horizontal bar chart of the median runtime per tool type.
figure_runtime = go.Figure()

# One single-bar trace per tool type, added in type_order; each bar is labelled
# with the tool name because the y tick labels are hidden below.
for run_type in type_order:
    durations = df_grouped.loc[df_grouped["type"] == run_type, "duration"]
    figure_runtime.add_trace(
        go.Bar(
            name=run_type,
            text=run_type,
            textposition="outside",
            cliponaxis=False,
            y=[run_type],
            x=[durations.median()],
            orientation="h",
            marker_color=type_to_color[run_type],
        )
    )

# Compact layout used for the PDF export.
figure_runtime.update_layout(
    width=570,
    height=215,
    margin=dict(l=20, r=20, t=20, b=20),
    template="plotly_white",
    font=dict(family="Arial", color="#000000", size=10),
    showlegend=False,
    xaxis_title="Runtime in minutes",
)
figure_runtime.update_xaxes(
    zeroline=True,
    # Extend the axis past the longest run so the outside labels have room.
    range=[
        0,
        ceil(df_grouped["duration"].max()) + 4,
    ],
    title_font=dict(family="Arial", color="#000000", size=12),
    minor_ticks="outside",
    minor_tickcolor="darkgrey",
    showline=True,
    linecolor="darkgrey",
    gridcolor="darkgrey",
)
figure_runtime.update_yaxes(
    tickangle=270,
    showticklabels=False,
    showline=True,
    linecolor="darkgrey",
)
figure_runtime.write_image("bam2fastqbenchmark_runtime.pdf")

# Enlarge the figure for on-screen display only; the PDF is already written.
figure_runtime.update_layout(
    width=1024,
    height=384,
)
figure_runtime.show()


In [275]:
# Horizontal bar chart of the median peak memory (RSS) per tool type, on a
# logarithmic x axis.
figure_memory = go.Figure()

# One single-bar trace per tool type, added in type_order; each bar is labelled
# with the tool name because the y tick labels are hidden below.
for run_type in type_order:
    peak_rss = df_grouped.loc[df_grouped["type"] == run_type, "rss"]
    figure_memory.add_trace(
        go.Bar(
            name=run_type,
            text=run_type,
            textposition="outside",
            cliponaxis=False,
            y=[run_type],
            x=[peak_rss.median()],
            orientation="h",
            marker_color=type_to_color[run_type],
        )
    )

# Compact layout used for the PDF export.
figure_memory.update_layout(
    width=570,
    height=215,
    margin=dict(l=20, r=20, t=20, b=20),
    template="plotly_white",
    font=dict(family="Arial", color="#000000", size=10),
    showlegend=False,
    xaxis_title="Memory usage (log)",
)
figure_memory.update_xaxes(
    zeroline=True,
    # On a log axis plotly expects the range in log10 units: the axis starts
    # at 10**6 and ends 5 % (in log space) beyond the largest RSS value.
    range=[
        6,
        np.log10(ceil(df_grouped["rss"].max())) * 1.05,
    ],
    tickformat="~s",
    title_font=dict(family="Arial", color="#000000", size=12),
    type="log",
    showline=True,
    linecolor="darkgrey",
    gridcolor="darkgrey",
    minor_ticks="outside",
    minor_tickcolor="darkgrey",
)
figure_memory.update_yaxes(
    tickangle=270,
    showticklabels=False,
)
figure_memory.write_image("bam2fastqbenchmark_memory.pdf")

# Enlarge the figure for on-screen display only; the PDF is already written.
figure_memory.update_layout(
    width=1024,
    height=384,
)
figure_memory.show()


In [276]:
# Horizontal bar chart of the median bytes written and read per tool type.
figure_io = go.Figure()

# One trace per tool type with two bars each ("write" and "read"), added in
# type_order; each bar is labelled with the tool name.
for run_type in type_order:
    runs_of_type = df_grouped.loc[df_grouped["type"] == run_type]
    figure_io.add_trace(
        go.Bar(
            name=run_type,
            text=run_type,
            textposition="outside",
            cliponaxis=False,
            y=["write", "read"],
            x=[
                runs_of_type["write_bytes"].median(),
                runs_of_type["read_bytes"].median(),
            ],
            orientation="h",
            marker_color=type_to_color[run_type],
        )
    )

# Compact layout used for the PDF export.
figure_io.update_layout(
    width=570,
    height=215,
    margin=dict(l=20, r=40, t=20, b=20),
    template="plotly_white",
    font=dict(family="Arial", color="#000000", size=10),
    showlegend=False,
    xaxis_title="Disk usage",
)
figure_io.update_xaxes(
    zeroline=True,
    tickformat="~s",
    title_font=dict(family="Arial", color="#000000", size=12),
    showline=True,
    linecolor="darkgrey",
    gridcolor="darkgrey",
    minor_ticks="outside",
    minor_tickcolor="darkgrey",
)
figure_io.update_yaxes(
    tickangle=270,
    showline=True,
    linecolor="darkgrey",
)
figure_io.write_image("bam2fastqbenchmark_io.pdf")

# Enlarge the figure for on-screen display only; the PDF is already written.
figure_io.update_layout(
    width=1024,
    height=384,
)
figure_io.show()
