# Block Aligner Benchmark Analysis and Visualizations

This notebook contains code for collecting, cleaning, and analyzing data produced by block aligner's experiments.

To run this, you will need to install all the libraries imported below, along with [altair-data-server](https://github.com/altair-viz/altair_data_server) and [altair-saver](https://github.com/altair-viz/altair_saver); the latter has some extra dependencies for saving PDFs.

Run each cell one by one to reproduce the experiments. This may take a while. For accurate benchmarking, it is recommended to run the entire notebook from the command line with `nbconvert`.

In [6]:
import altair as alt
from altair_saver import save
from altair import datum
import pandas as pd
from io import StringIO

# Select Altair's "data_server" data transformer. It is registered by the
# altair-data-server package listed in the setup notes above; presumably it
# serves chart data to the renderer instead of embedding it — confirm in its docs.
alt.data_transformers.enable("data_server")

DataTransformerRegistry.enable('data_server')

In [7]:
def csv_to_pandas(csv, d = r"\s*,\s*", t = None):
    """Parse captured benchmark output lines into a pandas DataFrame.

    Parameters
    ----------
    csv : iterable of str
        Lines of delimited text; the first non-comment line is the header.
    d : str
        Separator handed to ``pd.read_csv``. The default is a regex matching a
        comma with optional surrounding whitespace.
    t : str or None
        Thousands separator handed to ``pd.read_csv`` (e.g. ``","``).

    Returns
    -------
    pandas.DataFrame
        The parsed table. Text following a ``#`` is treated as a comment.
    """
    # Join the lines into one in-memory file. The python engine is requested
    # explicitly because the default separator is a regular expression.
    buffer = StringIO("\n".join(csv))
    return pd.read_csv(buffer, sep = d, thousands = t, comment = "#", engine = "python")

## Prefix Scan Benchmark

In [8]:
# Run the prefix_scan benchmarks from the parent directory with the simd_avx2
# feature. Only lines containing 'bench:' are kept, and awk reduces each one to
# "<field 2>\t<field 5>" (benchmark name and time, per the output below).
output = !cd .. && cargo bench --features simd_avx2 --quiet -- prefix_scan | grep 'bench:' | awk '{print $2"\t"$5}'
# Prepend a header row so the captured lines can be parsed as a tab-separated table.
output.insert(0, "algorithm\ttime")
output

['algorithm\ttime', 'bench_naive_prefix_scan\t27', 'bench_opt_prefix_scan\t10']

In [9]:
# Parse the captured lines as tab-separated values; "," is treated as a
# thousands separator when reading numbers.
data = csv_to_pandas(output, d = "\t", t = ",")
data

Unnamed: 0,algorithm,time
0,bench_naive_prefix_scan,27
1,bench_opt_prefix_scan,10


In [10]:
# Shorten the raw benchmark names to the labels used in the chart.
# `replace` (rather than `map`) leaves values without an entry untouched, so
# re-running this cell is a no-op instead of turning every label into NaN, and
# an unexpected benchmark name stays visible instead of silently becoming NaN.
data["algorithm"] = data["algorithm"].replace({
    "bench_naive_prefix_scan": "naive",
    "bench_opt_prefix_scan": "ours"
})
data

Unnamed: 0,algorithm,time
0,naive,27
1,ours,10


Prefix Scan Benchmark (AVX2)

In [11]:
# Horizontal bar per algorithm; the color only distinguishes the bars, so its
# legend is suppressed.
c = (
    alt.Chart(data, width = 150)
    .mark_bar()
    .encode(
        alt.X("time", axis = alt.Axis(title = "time (ns)")),
        alt.Y("algorithm"),
        alt.Color("algorithm", legend = None),
    )
)
save(c, "prefix_scan_bench.pdf")
c

## Random Data Benchmark

In [12]:
# Run the benchmarks selected by the `bench_` filter from the parent directory
# (simd_avx2 feature). Lines containing 'bench:' are kept, prefix_scan results
# are dropped, and awk reduces each line to "<field 2>\t<field 5>".
output = !cd .. && cargo bench --features simd_avx2 --quiet -- bench_ | grep 'bench:' | grep -v 'prefix_scan' | awk '{print $2"\t"$5}'
output

['bench_parasailors_aa_1000_10000\t45,210,361',
 'bench_parasailors_aa_100_1000\t486,281',
 'bench_parasailors_aa_10_100\t17,251',
 'bench_rustbio_aa_100_1000\t13,950,310',
 'bench_rustbio_aa_10_100\t142,445',
 'bench_scan_aa_1000_10000\t241,951',
 'bench_scan_aa_1000_10000_insert\t2,044,812',
 'bench_scan_aa_1000_10000_small\t214,650',
 'bench_scan_aa_1000_10000_trace\t333,265',
 'bench_scan_aa_100_1000\t24,002',
 'bench_scan_aa_100_1000_insert\t43,690',
 'bench_scan_aa_100_1000_small\t22,602',
 'bench_scan_aa_100_1000_trace\t37,233',
 'bench_scan_aa_10_100\t3,716',
 'bench_scan_aa_10_100_insert\t3,814',
 'bench_scan_aa_10_100_small\t3,281',
 'bench_scan_aa_10_100_trace\t5,695',
 'bench_scan_nuc_1000_10000\t209,985',
 'bench_scan_nuc_100_1000\t22,294',
 'bench_triple_accel_1000_10000\t7,503,465',
 'bench_triple_accel_100_1000\t23,812']

In [13]:
# Turn raw names such as "bench_scan_aa_100_1000_trace\t37,233" into
# tab-separated rows matching the header below.
cleaned = ["algorithm\talphabet\tk\tlength\tproperty\ttime"]
names = ["parasailors_aa", "rustbio_aa", "scan_aa", "scan_nuc", "triple_accel"]
new_names = ["parasailors\tprotein", "rust bio\tprotein", "ours\tprotein", "ours\tnucleotide", "triple accel\tnucleotide"]

# Number of columns a complete row must have.
n_columns = cleaned[0].count("\t") + 1

for line in output:
    row = line[len("bench_"):]
    # First known prefix (in list order) that this benchmark name starts with.
    hit = next(((old, new) for old, new in zip(names, new_names) if row.startswith(old)), None)
    if hit is not None:
        old, new = hit
        # Swap the prefix for "algorithm\talphabet" and split the remaining
        # underscore-separated parts into their own columns.
        row = new + row[len(old):].replace("_", "\t")
    if row.count("\t") + 1 < n_columns:
        # No property suffix on the benchmark name: insert "default" just
        # before the final (time) column.
        head, tail = row.rsplit("\t", 1)
        row = head + "\tdefault\t" + tail
    cleaned.append(row)

cleaned

['algorithm\talphabet\tk\tlength\tproperty\ttime',
 'parasailors\tprotein\t1000\t10000\tdefault\t45,210,361',
 'parasailors\tprotein\t100\t1000\tdefault\t486,281',
 'parasailors\tprotein\t10\t100\tdefault\t17,251',
 'rust bio\tprotein\t100\t1000\tdefault\t13,950,310',
 'rust bio\tprotein\t10\t100\tdefault\t142,445',
 'ours\tprotein\t1000\t10000\tdefault\t241,951',
 'ours\tprotein\t1000\t10000\tinsert\t2,044,812',
 'ours\tprotein\t1000\t10000\tsmall\t214,650',
 'ours\tprotein\t1000\t10000\ttrace\t333,265',
 'ours\tprotein\t100\t1000\tdefault\t24,002',
 'ours\tprotein\t100\t1000\tinsert\t43,690',
 'ours\tprotein\t100\t1000\tsmall\t22,602',
 'ours\tprotein\t100\t1000\ttrace\t37,233',
 'ours\tprotein\t10\t100\tdefault\t3,716',
 'ours\tprotein\t10\t100\tinsert\t3,814',
 'ours\tprotein\t10\t100\tsmall\t3,281',
 'ours\tprotein\t10\t100\ttrace\t5,695',
 'ours\tnucleotide\t1000\t10000\tdefault\t209,985',
 'ours\tnucleotide\t100\t1000\tdefault\t22,294',
 'triple accel\tnucleotide\t1000\t10000\td

In [14]:
# Parse the cleaned rows as tab-separated values; "," is treated as a
# thousands separator when reading numbers.
data = csv_to_pandas(cleaned, d = "\t", t = ",")
data

Unnamed: 0,algorithm,alphabet,k,length,property,time
0,parasailors,protein,1000,10000,default,45210361
1,parasailors,protein,100,1000,default,486281
2,parasailors,protein,10,100,default,17251
3,rust bio,protein,100,1000,default,13950310
4,rust bio,protein,10,100,default,142445
5,ours,protein,1000,10000,default,241951
6,ours,protein,1000,10000,insert,2044812
7,ours,protein,1000,10000,small,214650
8,ours,protein,1000,10000,trace,333265
9,ours,protein,100,1000,default,24002


In [15]:
# Combined "<algorithm> <property>" label used on the charts' y axis.
data["algorithm property"] = data["algorithm"].str.cat(data["property"], sep = " ")
# Scale times down by 1000 (the charts below label the axis in us).
# NOTE: this overwrites the column, so re-running the cell rescales again.
data["time"] = data["time"] / 1000

Random Protein Sequences Benchmark (AVX2)

In [16]:
# One point per (algorithm, property, length) for the protein benchmarks, on a
# log time axis, with rows ordered by time.
c = (
    alt.Chart(data, width = 200, height = 150)
    .transform_filter(datum.alphabet == "protein")
    .mark_point(opacity = 1, filled = True)
    .encode(
        alt.X("time", axis = alt.Axis(title = "time (us)"), scale = alt.Scale(type = "log", domain = [1, 50000])),
        alt.Y("algorithm property", axis = alt.Axis(title = "algorithm", grid = True), sort = alt.EncodingSortField(field = "time")),
        alt.Color("length:N"),
        alt.Shape("length:N"),
    )
)
save(c, "random_protein_bench.pdf")
c

Random DNA Sequences Benchmark (AVX2)

In [17]:
# Same layout as the protein chart, restricted to the nucleotide benchmarks.
# The explicit [100, 1000, 10000] domains fix which color/shape each length
# gets — presumably to stay consistent with the protein chart, since the
# nucleotide runs shown above only cover some of the lengths.
c = alt.Chart(data).mark_point(opacity = 1, filled = True).encode(
    x = alt.X("time", axis = alt.Axis(title = "time (us)"), scale = alt.Scale(type = "log", domain = [1, 50000])),
    y = alt.Y("algorithm property", axis = alt.Axis(title = "algorithm", grid = True), sort = alt.EncodingSortField(field = "time")),
    color = alt.Color("length:N", scale = alt.Scale(domain = [100, 1000, 10000])),
    # The shape channel must use alt.Shape; the previous alt.Color here was a
    # copy-paste slip that described the shape encoding with the wrong channel class.
    shape = alt.Shape("length:N", scale = alt.Scale(domain = [100, 1000, 10000]))
).transform_filter(
    datum.alphabet == "nucleotide"
).properties(
    width = 200,
    height = 50
)
save(c, "random_dna_bench.pdf")
c

## Uniclust 30 Data Benchmark

In [25]:
# Run the uc_bench example from the parent directory (release build, simd_avx2
# feature) and capture its stdout lines: a '#' comment line, a comma-separated
# header, then one row per measurement (see the output below).
output = !cd .. && cargo run --example uc_bench --release --features simd_avx2 --quiet
output

['# time (s)',
 'algorithm, dataset, size, time',
 'ours (no trace), uc30, 32-32, 0.055815687',
 'ours (no trace), uc30 0.95, 32-32, 0.058619571',
 'ours (no trace), uc30, 32-256, 0.10054278',
 'ours (no trace), uc30 0.95, 32-256, 0.085375531',
 'ours (no trace), uc30, 256-256, 0.201964079',
 'ours (no trace), uc30 0.95, 256-256, 0.221768819',
 'ours (trace), uc30, 32-256, 0.152252701',
 'ours (trace), uc30 0.95, 32-256, 0.132771375',
 'parasail, uc30, full, 0.753212736',
 'parasail, uc30 0.95, full, 0.886418494']

In [26]:
# Parse with csv_to_pandas' default separator (comma with optional surrounding
# whitespace); '#' comment lines are ignored.
data = csv_to_pandas(output)
data

Unnamed: 0,algorithm,dataset,size,time
0,ours (no trace),uc30,32-32,0.055816
1,ours (no trace),uc30 0.95,32-32,0.05862
2,ours (no trace),uc30,32-256,0.100543
3,ours (no trace),uc30 0.95,32-256,0.085376
4,ours (no trace),uc30,256-256,0.201964
5,ours (no trace),uc30 0.95,256-256,0.221769
6,ours (trace),uc30,32-256,0.152253
7,ours (trace),uc30 0.95,32-256,0.132771
8,parasail,uc30,full,0.753213
9,parasail,uc30 0.95,full,0.886418


Uniclust30 Benchmark (AVX2)

In [27]:
# Bars for the rows with size "32-256" plus the parasail rows, one facet
# column per dataset, colored with the "dark2" category scheme.
c = (
    alt.Chart(data, width = 50, height = 100)
    .transform_filter((datum.size == "32-256") | (datum.algorithm == "parasail"))
    .mark_bar()
    .encode(
        alt.X("algorithm", axis = None),
        alt.Y("time", axis = alt.Axis(title = "time (s)"), scale = alt.Scale(domain = [0.0, 1.0])),
        alt.Column("dataset", header = alt.Header(orient = "bottom")),
        alt.Color("algorithm"),
    )
    .configure_range(category = {"scheme": "dark2"})
)
save(c, "uniclust30_bench.pdf")
c

Uniclust30 Block Size Benchmark (AVX2)

In [28]:
# Block-size comparison for the "ours (no trace)" rows only, one facet column
# per dataset; sizes are listed explicitly so bars and legend keep this order.
c = (
    alt.Chart(data, width = 50, height = 100)
    .transform_filter(datum.algorithm == "ours (no trace)")
    .mark_bar()
    .encode(
        alt.X("size", axis = None, sort = ["32-32", "32-256", "256-256"]),
        alt.Y("time", axis = alt.Axis(title = "time (s)"), scale = alt.Scale(domain = [0.0, 1.0])),
        alt.Column("dataset", header = alt.Header(orient = "bottom")),
        alt.Color("size", sort = ["32-32", "32-256", "256-256"]),
    )
)
save(c, "uniclust30_size_bench.pdf")
c

## Nanopore Data Benchmark Setup

To run the benchmarks below, you need to clone the following repos, place them in the same directory where this repo (block aligner) is located, and follow their setup instructions:
* [diff-bench-paper](https://github.com/Daniel-Liu-c0deb0t/diff-bench-paper)
* [adaptivebandbench](https://github.com/Daniel-Liu-c0deb0t/adaptivebandbench)

## Nanopore Data Benchmark

In [46]:
# Run the nanopore_bench example from the parent directory (release build,
# simd_avx2 feature) and capture its stdout lines: a '#' comment line, a
# comma-separated header, then one row per measurement.
output = !cd .. && cargo run --example nanopore_bench --release --features simd_avx2 --quiet
output

['# time (s)',
 'algorithm, dataset, time',
 'ours (no trace 32-32), nanopore 25kbp, 0.944833438',
 'ours (no trace 32-32), random, 2.3201892920000002',
 'ours (trace 32-32), nanopore 25kbp, 1.13281743',
 'ours (trace 32-32), random, 2.779769449',
 'ours (trace 32-64), nanopore 25kbp, 1.9827694980000001',
 'ours (trace 32-64), random, 2.862142031']

In [47]:
# Parse with csv_to_pandas' default separator (comma with optional surrounding
# whitespace); '#' comment lines are ignored.
data = csv_to_pandas(output)
data

Unnamed: 0,algorithm,dataset,time
0,ours (no trace 32-32),nanopore 25kbp,0.944833
1,ours (no trace 32-32),random,2.320189
2,ours (trace 32-32),nanopore 25kbp,1.132817
3,ours (trace 32-32),random,2.779769
4,ours (trace 32-64),nanopore 25kbp,1.982769
5,ours (trace 32-64),random,2.862142


In [48]:
output2 = !cd ../../diff-bench-paper/supplementary_data/benchmark_codes && ./custom_bench.sh

for i, o in enumerate(output2):
    if o.startswith("cells("):
        break
output2 = output2[i + 1:]

output2.insert(0, "algorithm\tfill time\ttrace time\tconvert time\ttotal time\tscore\tfail")
output2

['algorithm\tfill time\ttrace time\tconvert time\ttotal time\tscore\tfail',
 'editdist\t470003000\t170070000\t66831000\t706904000\t6880489\t0',
 'non-diff\t691593000\t269894000\t60201000\t1021688000\t27124786\t52',
 'diff-raw\t609982000\t211468000\t62309000\t883759000\t27291141\t32',
 'libgaba\t433162000\t150139000\t31460000\t614761000\t27121546\t53',
 'edlib\t28082718000\t19235004000\t106382000\t47424104000\t37\t0',
 'seqan\t90278758000\t0\t0\t90278758000\t0\t0']

In [49]:
# Parse the script's result rows as tab-separated values.
data2 = csv_to_pandas(output2, d = "\t")
data2

Unnamed: 0,algorithm,fill time,trace time,convert time,total time,score,fail
0,editdist,470003000,170070000,66831000,706904000,6880489,0
1,non-diff,691593000,269894000,60201000,1021688000,27124786,52
2,diff-raw,609982000,211468000,62309000,883759000,27291141,32
3,libgaba,433162000,150139000,31460000,614761000,27121546,53
4,edlib,28082718000,19235004000,106382000,47424104000,37,0
5,seqan,90278758000,0,0,90278758000,0,0


In [50]:
# Keep only the algorithm name and its fill time, rename that column to "time",
# and divide by 1e9 so the values line up with the "time (s)" axis used below.
cleaned2 = (
    data2
    .drop(columns = ["trace time", "convert time", "total time", "score", "fail"])
    .rename(columns = {"fill time": "time"})
    .assign(time = lambda frame: frame["time"] / 1e9)
)
cleaned2

Unnamed: 0,algorithm,time
0,editdist,0.470003
1,non-diff,0.691593
2,diff-raw,0.609982
3,libgaba,0.433162
4,edlib,28.082718
5,seqan,90.278758


In [51]:
# Keep only the nanopore measurements from our benchmark. Filtering on the
# dataset name replaces the previous hard-coded row indices [1, 3, 5], which
# dropped the same "random" rows but depended on the output row order.
cleaned = data[data["dataset"] == "nanopore 25kbp"]
cleaned = cleaned.drop(columns = ["dataset"])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack the two frames. ignore_index renumbers the rows 0..n-1 as before.
cleaned = pd.concat([cleaned, cleaned2], ignore_index = True)
cleaned

Unnamed: 0,algorithm,time
0,ours (no trace 32-32),0.944833
1,ours (trace 32-32),1.132817
2,ours (trace 32-64),1.982769
3,editdist,0.470003
4,non-diff,0.691593
5,diff-raw,0.609982
6,libgaba,0.433162
7,edlib,28.082718
8,seqan,90.278758


25kbp Nanopore Reads Benchmark (AVX2)

In [52]:
# Layer 1: every algorithm except our three configurations, in the default color.
chart1 = (
    alt.Chart(cleaned)
    .transform_filter(
        (datum.algorithm != "ours (trace 32-32)") & (datum.algorithm != "ours (no trace 32-32)") & (datum.algorithm != "ours (trace 32-64)")
    )
    .mark_point(opacity = 1, filled = True)
    .encode(
        alt.X("time", axis = alt.Axis(title = "time (s)", grid = True), scale = alt.Scale(type = "log")),
        alt.Y("algorithm", axis = alt.Axis(grid = True), sort = alt.EncodingSortField(field = "time")),
    )
)

# Layer 2: our three configurations, highlighted in red.
chart2 = (
    alt.Chart(cleaned)
    .transform_filter(
        (datum.algorithm == "ours (trace 32-32)") | (datum.algorithm == "ours (no trace 32-32)") | (datum.algorithm == "ours (trace 32-64)")
    )
    .mark_point(color = "red", filled = True)
    .encode(
        alt.X("time", axis = alt.Axis(title = "time (s)", grid = True), scale = alt.Scale(type = "log")),
        alt.Y("algorithm", axis = alt.Axis(grid = True), sort = alt.EncodingSortField(field = "time")),
    )
)

# Overlay both layers on shared axes.
c = alt.layer(chart1, chart2).properties(width = 150, height = 150)
save(c, "nanopore_bench.pdf")
c

## Sequence-to-Profile Alignment Benchmark

In [3]:
# Run the pssm_bench example from the parent directory (release build,
# simd_avx2 feature) and capture its stdout lines: a comma-separated header,
# one row per block size, and a trailing '#' comment line.
output = !cd .. && cargo run --example pssm_bench --release --features simd_avx2 --quiet
output

['size, time',
 '32-32, 0.16301056',
 '32-64, 0.184093758',
 '32-128, 0.216618957',
 '2048-2048, 4.707142767',
 '# Done!']

In [4]:
# Parse with csv_to_pandas' default separator (comma with optional surrounding
# whitespace); the trailing '#' comment line is ignored.
data = csv_to_pandas(output)
data

Unnamed: 0,size,time
0,32-32,0.163011
1,32-64,0.184094
2,32-128,0.216619
3,2048-2048,4.707143


SCOP Sequence-to-Profile Alignment Benchmark (AVX2)

In [5]:
# One bar per block size, in the listed order; the color repeats the x axis,
# so its legend is suppressed.
c = (
    alt.Chart(data, width = 60, height = 100)
    .mark_bar()
    .encode(
        alt.X("size", sort = ["32-32", "32-64", "32-128", "2048-2048"]),
        alt.Y("time", axis = alt.Axis(title = "time (s)")),
        alt.Color("size", sort = ["32-32", "32-64", "32-128", "2048-2048"], legend = None),
    )
)
save(c, "pssm_size_bench.pdf")
c

## WASM SIMD

[Wasmtime](https://wasmtime.dev/) is needed to run the WebAssembly code.

In [53]:
# Run the benchmarks for the wasm32-wasi target with the simd_wasm feature,
# using wasmtime (SIMD enabled) as the cargo runner. Lines containing 'bench:'
# are kept and awk reduces each one to "<field 2>\t<field 5>".
# NOTE(review): unlike the other cargo cells there is no `cd ..` here, so this
# runs from the notebook's working directory — confirm that is intended.
output = !CARGO_TARGET_WASM32_WASI_RUNNER="wasmtime --wasm-features simd --" cargo bench --target=wasm32-wasi --features simd_wasm --quiet -- --nocapture | grep 'bench:' | awk '{print $2"\t"$5}'
output

['bench_rustbio_aa_100_1000\t22,736,334',
 'bench_rustbio_aa_10_100\t234,868',
 'bench_scan_aa_1000_10000\t1,758,351',
 'bench_scan_aa_1000_10000_insert\t20,722,116',
 'bench_scan_aa_1000_10000_small\t664,311',
 'bench_scan_aa_1000_10000_trace\t2,313,101',
 'bench_scan_aa_100_1000\t108,170',
 'bench_scan_aa_100_1000_insert\t244,786',
 'bench_scan_aa_100_1000_small\t66,446',
 'bench_scan_aa_100_1000_trace\t145,701',
 'bench_scan_aa_10_100\t8,089',
 'bench_scan_aa_10_100_insert\t8,571',
 'bench_scan_aa_10_100_small\t5,874',
 'bench_scan_aa_10_100_trace\t10,429',
 'bench_scan_nuc_1000_10000\t619,599',
 'bench_scan_nuc_100_1000\t62,778']

In [54]:
# Turn raw names such as "bench_scan_aa_100_1000_trace\t145,701" into
# tab-separated rows matching the header below.
cleaned = ["algorithm\talphabet\tk\tlength\tproperty\ttime"]
names = ["rustbio_aa", "scan_aa", "scan_nuc"]
new_names = ["rust bio\tprotein", "ours\tprotein", "ours\tnucleotide"]

# Number of columns a complete row must have.
n_columns = cleaned[0].count("\t") + 1

for line in output:
    row = line[len("bench_"):]
    # First known prefix (in list order) that this benchmark name starts with.
    hit = next(((old, new) for old, new in zip(names, new_names) if row.startswith(old)), None)
    if hit is not None:
        old, new = hit
        # Swap the prefix for "algorithm\talphabet" and split the remaining
        # underscore-separated parts into their own columns.
        row = new + row[len(old):].replace("_", "\t")
    if row.count("\t") + 1 < n_columns:
        # No property suffix on the benchmark name: insert "default" just
        # before the final (time) column.
        head, tail = row.rsplit("\t", 1)
        row = head + "\tdefault\t" + tail
    cleaned.append(row)

cleaned

['algorithm\talphabet\tk\tlength\tproperty\ttime',
 'rust bio\tprotein\t100\t1000\tdefault\t22,736,334',
 'rust bio\tprotein\t10\t100\tdefault\t234,868',
 'ours\tprotein\t1000\t10000\tdefault\t1,758,351',
 'ours\tprotein\t1000\t10000\tinsert\t20,722,116',
 'ours\tprotein\t1000\t10000\tsmall\t664,311',
 'ours\tprotein\t1000\t10000\ttrace\t2,313,101',
 'ours\tprotein\t100\t1000\tdefault\t108,170',
 'ours\tprotein\t100\t1000\tinsert\t244,786',
 'ours\tprotein\t100\t1000\tsmall\t66,446',
 'ours\tprotein\t100\t1000\ttrace\t145,701',
 'ours\tprotein\t10\t100\tdefault\t8,089',
 'ours\tprotein\t10\t100\tinsert\t8,571',
 'ours\tprotein\t10\t100\tsmall\t5,874',
 'ours\tprotein\t10\t100\ttrace\t10,429',
 'ours\tnucleotide\t1000\t10000\tdefault\t619,599',
 'ours\tnucleotide\t100\t1000\tdefault\t62,778']

In [55]:
# Parse the cleaned rows as tab-separated values; "," is treated as a
# thousands separator when reading numbers.
data = csv_to_pandas(cleaned, d = "\t", t = ",")
data

Unnamed: 0,algorithm,alphabet,k,length,property,time
0,rust bio,protein,100,1000,default,22736334
1,rust bio,protein,10,100,default,234868
2,ours,protein,1000,10000,default,1758351
3,ours,protein,1000,10000,insert,20722116
4,ours,protein,1000,10000,small,664311
5,ours,protein,1000,10000,trace,2313101
6,ours,protein,100,1000,default,108170
7,ours,protein,100,1000,insert,244786
8,ours,protein,100,1000,small,66446
9,ours,protein,100,1000,trace,145701


In [56]:
# Combined "<algorithm> <property>" label used on the chart's y axis.
data["algorithm property"] = data["algorithm"].str.cat(data["property"], sep = " ")
# Scale times down by 1000 (the chart below labels the axis in us).
# NOTE: this overwrites the column, so re-running the cell rescales again.
data["time"] = data["time"] / 1000

Random Protein Sequences Benchmark (WASM SIMD)

In [57]:
# One point per (algorithm, property, length) for the protein benchmarks under
# WASM SIMD, on a log time axis, with rows ordered by time.
c = (
    alt.Chart(data, width = 200, height = 150)
    .transform_filter(datum.alphabet == "protein")
    .mark_point(opacity = 1, filled = True)
    .encode(
        alt.X("time", axis = alt.Axis(title = "time (us)"), scale = alt.Scale(type = "log")),
        alt.Y("algorithm property", axis = alt.Axis(title = "algorithm", grid = True), sort = alt.EncodingSortField(field = "time")),
        alt.Color("length:N"),
        alt.Shape("length:N"),
    )
)
save(c, "random_protein_bench_wasm.pdf")
c