# Block Aligner Data Analysis and Visualizations

This notebook contains code for collecting, cleaning, and analyzing data produced by block aligner's experiments. Run each cell one by one to reproduce the experiments.

In [3]:
import altair as alt
from altair import datum
import pandas as pd
from io import StringIO

In [4]:
def csv_to_pandas(csv, d = "\\s*,\\s*", t = None):
    """
    Parse a list of captured text lines into a pandas DataFrame.

    csv: iterable of strings, one per output line; they are joined with
         newlines and handed to `pd.read_csv`.
    d:   separator passed to pandas as `sep`. The default is a regex that
         matches a comma together with any surrounding whitespace.
    t:   thousands separator passed to pandas (None disables it).

    Anything after a "#" on a line is treated as a comment. The Python
    parsing engine is used because the default separator is a regex.
    """
    buffer = StringIO("\n".join(csv))
    return pd.read_csv(
        buffer,
        sep = d,
        thousands = t,
        comment = "#",
        engine = "python"
    )

## Block Aligner Image

In [3]:
# Run the `block_img` example (release build, native CPU features) from the
# parent directory; the two trailing arguments are the image paths handed to it.
!cd .. && RUSTFLAGS="-C target-cpu=native" cargo run --example block_img --release --quiet -- vis/block_img1.png vis/block_img2.png

path: vis/block_img1.png, img size: 1980 x 1647
path: vis/block_img2.png, img size: 1695 x 1746


<img src = "block_img1.png" />

<img src = "block_img2.png" />

## Random Accuracy

In [66]:
# Run the `accuracy` example from the parent directory and capture its stdout
# as a list of lines (IPython `!` capture), then display the raw lines.
output = !cd .. && RUSTFLAGS="-C target-cpu=native" cargo run --example accuracy --release --quiet
output

['',
 'len, k, insert, iter, max size, wrong, wrong % error, wrong min, wrong max',
 '',
 '',
 '100, 10, false, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 10, false, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 10, true, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 10, true, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 20, false, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 20, false, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 20, true, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 20, true, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 50, false, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 50, false, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 50, true, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 50, true, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '1000, 100, false, 100, 32, 0, NaN, 2147483647,

In [67]:
# Parse the captured comma-separated lines into a DataFrame and display it.
data = csv_to_pandas(output)
data

Unnamed: 0,len,k,insert,iter,max size,wrong,wrong % error,wrong min,wrong max
0,100,10,False,100,32,0,,2147483647,-2147483648
1,100,10,False,100,2048,0,,2147483647,-2147483648
2,100,10,True,100,32,0,,2147483647,-2147483648
3,100,10,True,100,2048,0,,2147483647,-2147483648
4,100,20,False,100,32,0,,2147483647,-2147483648
5,100,20,False,100,2048,0,,2147483647,-2147483648
6,100,20,True,100,32,0,,2147483647,-2147483648
7,100,20,True,100,2048,0,,2147483647,-2147483648
8,100,50,False,100,32,0,,2147483647,-2147483648
9,100,50,False,100,2048,0,,2147483647,-2147483648


In [68]:
# Express the raw counts as fractions so the chart can format them as percentages:
# wrong results per iteration, and k relative to the sequence length.
data["% wrong"] = data["wrong"].div(data["iter"])
data["k %"] = data["k"].div(data["len"])

In [72]:
# Fraction of wrong results against k %, one row of the chart per sequence length,
# colored by max size; only the runs with `insert == True` are shown.
alt.Chart(data).transform_filter(
    datum.insert == True
).mark_point(opacity = 1).encode(
    row = alt.Row("len:N", header = alt.Header(title = "length")),
    color = "max size:N",
    x = alt.X("% wrong", axis = alt.Axis(format = "%")),
    y = alt.Y("k %:N", axis = alt.Axis(format = "~%", grid = True))
).properties(
    title = "Accuracy on Random DNA Sequences with 10% Insert",
    width = 100,
    height = 50
)

## Prefix Scan Benchmark

In [14]:
# Run only the prefix_scan benchmarks, keep the `bench:` result lines, and have
# awk print fields 2 and 5 of each line separated by a tab.
output = !cd .. && RUSTFLAGS="-C target-cpu=native" cargo bench --quiet -- prefix_scan | grep 'bench:' | awk '{print $2"\t"$5}'
# Prepend a header row so the lines can be parsed as a tab-separated table.
output.insert(0, "algorithm\ttime")
output

['algorithm\ttime', 'bench_naive_prefix_scan\t10', 'bench_opt_prefix_scan\t1']

In [15]:
# Tab-separated input; "," is passed as the thousands separator.
data = csv_to_pandas(output, d = "\t", t = ",")
data

Unnamed: 0,algorithm,time
0,bench_naive_prefix_scan,10
1,bench_opt_prefix_scan,1


In [16]:
# Swap the raw benchmark identifiers for short display labels.
# Note: `Series.map` turns any identifier missing from the mapping into NaN.
display_names = dict(
    bench_naive_prefix_scan = "naive",
    bench_opt_prefix_scan = "ours"
)
data["algorithm"] = data["algorithm"].map(display_names)
data

Unnamed: 0,algorithm,time
0,naive,10
1,ours,1


In [17]:
# Horizontal bar per algorithm showing its benchmark time.
alt.Chart(data).mark_bar().encode(
    y = "algorithm",
    x = alt.X("time", axis = alt.Axis(title = "time (ns)"))
).properties(
    title = "Prefix Scan Benchmark",
    width = 150
)

## Random Benchmark

In [52]:
# Run the benchmarks matching `bench_`, keep the `bench:` result lines except the
# prefix_scan ones, and have awk print fields 2 and 5 separated by a tab.
output = !cd .. && RUSTFLAGS="-C target-cpu=native" cargo bench --quiet -- bench_ | grep 'bench:' | grep -v 'prefix_scan' | awk '{print $2"\t"$5}'
output

['bench_parasailors_aa_1000_10000\t45,440,605',
 'bench_parasailors_aa_100_1000\t505,330',
 'bench_parasailors_aa_10_100\t17,233',
 'bench_rustbio_aa_100_1000\t14,146,805',
 'bench_rustbio_aa_10_100\t142,802',
 'bench_scan_aa_1000_10000\t216,572',
 'bench_scan_aa_1000_10000_insert\t1,988,339',
 'bench_scan_aa_1000_10000_small\t212,003',
 'bench_scan_aa_1000_10000_trace\t336,675',
 'bench_scan_aa_100_1000\t22,497',
 'bench_scan_aa_100_1000_insert\t42,169',
 'bench_scan_aa_100_1000_small\t21,133',
 'bench_scan_aa_100_1000_trace\t38,474',
 'bench_scan_aa_10_100\t3,889',
 'bench_scan_aa_10_100_insert\t3,999',
 'bench_scan_aa_10_100_small\t3,087',
 'bench_scan_aa_10_100_trace\t5,566',
 'bench_scan_nuc_1000_10000\t211,609',
 'bench_scan_nuc_100_1000\t21,569',
 'bench_triple_accel_1000_10000\t6,926,855',
 'bench_triple_accel_100_1000\t22,149']

In [53]:
# Rewrite each "bench_<name>_<k>_<length>[_<property>]\t<time>" line as a
# tab-separated row that matches the header below.
header = "algorithm\talphabet\tk\tlength\tproperty\ttime"
cleaned = [header]
names = ["parasailors_aa", "rustbio_aa", "scan_aa", "scan_nuc", "triple_accel"]
new_names = ["parasailors\tprotein", "rust bio\tprotein", "ours\tprotein", "ours\tnucleotide", "triple accel\tnucleotide"]
num_columns = len(header.split("\t"))

for line in output:
    row = line[len("bench_"):]
    for old, new in zip(names, new_names):
        if not row.startswith(old):
            continue
        # Replace the known prefix with its "algorithm\talphabet" pair and turn
        # the remaining underscores into column separators.
        row = new + row[len(old):].replace("_", "\t")
        break
    if len(row.split("\t")) < num_columns:
        # No property suffix: insert "default" just before the last field.
        last_tab = row.rindex("\t")
        row = row[:last_tab] + "\tdefault" + row[last_tab:]
    cleaned.append(row)

cleaned

['algorithm\talphabet\tk\tlength\tproperty\ttime',
 'parasailors\tprotein\t1000\t10000\tdefault\t45,440,605',
 'parasailors\tprotein\t100\t1000\tdefault\t505,330',
 'parasailors\tprotein\t10\t100\tdefault\t17,233',
 'rust bio\tprotein\t100\t1000\tdefault\t14,146,805',
 'rust bio\tprotein\t10\t100\tdefault\t142,802',
 'ours\tprotein\t1000\t10000\tdefault\t216,572',
 'ours\tprotein\t1000\t10000\tinsert\t1,988,339',
 'ours\tprotein\t1000\t10000\tsmall\t212,003',
 'ours\tprotein\t1000\t10000\ttrace\t336,675',
 'ours\tprotein\t100\t1000\tdefault\t22,497',
 'ours\tprotein\t100\t1000\tinsert\t42,169',
 'ours\tprotein\t100\t1000\tsmall\t21,133',
 'ours\tprotein\t100\t1000\ttrace\t38,474',
 'ours\tprotein\t10\t100\tdefault\t3,889',
 'ours\tprotein\t10\t100\tinsert\t3,999',
 'ours\tprotein\t10\t100\tsmall\t3,087',
 'ours\tprotein\t10\t100\ttrace\t5,566',
 'ours\tnucleotide\t1000\t10000\tdefault\t211,609',
 'ours\tnucleotide\t100\t1000\tdefault\t21,569',
 'triple accel\tnucleotide\t1000\t10000\td

In [54]:
# Parse the cleaned tab-separated rows; "," is passed as the thousands separator.
data = csv_to_pandas(cleaned, d = "\t", t = ",")
data

Unnamed: 0,algorithm,alphabet,k,length,property,time
0,parasailors,protein,1000,10000,default,45440605
1,parasailors,protein,100,1000,default,505330
2,parasailors,protein,10,100,default,17233
3,rust bio,protein,100,1000,default,14146805
4,rust bio,protein,10,100,default,142802
5,ours,protein,1000,10000,default,216572
6,ours,protein,1000,10000,insert,1988339
7,ours,protein,1000,10000,small,212003
8,ours,protein,1000,10000,trace,336675
9,ours,protein,100,1000,default,22497


In [55]:
# Combined "<algorithm> <property>" label used as the y axis of the charts below.
data["algorithm property"] = data["algorithm"].str.cat(data["property"], sep = " ")
# Scale the time column by 1/1000; the charts label it "time (us)"
# (presumably the benchmark reports nanoseconds -- confirm).
# NOTE: this mutates `data` in place, so re-running the cell scales again.
data["time"] = data["time"] / 1000

In [64]:
# Protein rows only: log-scaled time per "algorithm property", sorted by time
# and colored by sequence length.
alt.Chart(data).transform_filter(
    datum.alphabet == "protein"
).mark_point(opacity = 1).encode(
    color = "length:N",
    x = alt.X("time", axis = alt.Axis(title = "time (us)"), scale = alt.Scale(type = "log")),
    y = alt.Y("algorithm property", axis = alt.Axis(title = "algorithm", grid = True), sort = alt.EncodingSortField(field = "time"))
).properties(
    title = "Random Protein Sequences Benchmark",
    width = 200,
    height = 150
)

In [65]:
# Nucleotide rows only: log-scaled time per "algorithm property", sorted by time
# and colored by sequence length.
alt.Chart(data).transform_filter(
    datum.alphabet == "nucleotide"
).mark_point(opacity = 1).encode(
    color = "length:N",
    x = alt.X("time", axis = alt.Axis(title = "time (us)"), scale = alt.Scale(type = "log")),
    y = alt.Y("algorithm property", axis = alt.Axis(title = "algorithm", grid = True), sort = alt.EncodingSortField(field = "time"))
).properties(
    title = "Random DNA Sequences Benchmark",
    width = 200,
    height = 50
)

## Uniclust 30 Accuracy

In [3]:
# Run the `uc_accuracy` example from the parent directory and capture its stdout lines.
output = !cd .. && RUSTFLAGS="-C target-cpu=native" cargo run --example uc_accuracy --release --quiet
output

['# seq identity is lower bound (inclusive)',
 'dataset, max size, seq identity, count, wrong, wrong % error',
 'uc30_0.95, 32, 0, 0, 0, NaN',
 'uc30_0.95, 32, 0.1, 0, 0, NaN',
 'uc30_0.95, 32, 0.2, 14, 0, NaN',
 'uc30_0.95, 32, 0.3, 873, 48, 0.23644643580689115',
 'uc30_0.95, 32, 0.4, 1166, 72, 0.2588941794696058',
 'uc30_0.95, 32, 0.5, 951, 38, 0.26989686669054164',
 'uc30_0.95, 32, 0.6, 923, 29, 0.26600576925717145',
 'uc30_0.95, 32, 0.7, 789, 21, 0.2507672376509289',
 'uc30_0.95, 32, 0.8, 747, 10, 0.21092575017184195',
 'uc30_0.95, 32, 0.9, 1537, 20, 0.13902065073668682',
 '',
 '# total: 7000, wrong: 238, wrong % error: 0.24418420416118747, length avg: 329.554, length min: 22, length max: 8881',
 '',
 'uc30_0.95, 256, 0, 0, 0, NaN',
 'uc30_0.95, 256, 0.1, 0, 0, NaN',
 'uc30_0.95, 256, 0.2, 14, 0, NaN',
 'uc30_0.95, 256, 0.3, 873, 6, 0.022628943599678163',
 'uc30_0.95, 256, 0.4, 1166, 8, 0.07534822871256201',
 'uc30_0.95, 256, 0.5, 951, 4, 0.0327230834375036',
 'uc30_0.95, 256, 0.6,

In [4]:
# Parse the captured lines; "#" comment lines in the output are ignored by the parser.
data = csv_to_pandas(output)
data

Unnamed: 0,dataset,max size,seq identity,count,wrong,wrong % error
0,uc30_0.95,32,0.0,0,0,
1,uc30_0.95,32,0.1,0,0,
2,uc30_0.95,32,0.2,14,0,
3,uc30_0.95,32,0.3,873,48,0.236446
4,uc30_0.95,32,0.4,1166,72,0.258894
5,uc30_0.95,32,0.5,951,38,0.269897
6,uc30_0.95,32,0.6,923,29,0.266006
7,uc30_0.95,32,0.7,789,21,0.250767
8,uc30_0.95,32,0.8,747,10,0.210926
9,uc30_0.95,32,0.9,1537,20,0.139021


In [5]:
# Fraction of wrong results per bucket (NaN where the bucket's count is 0).
data["% wrong"] = data["wrong"].div(data["count"])

In [6]:
# Fraction wrong per sequence-identity bucket, faceted by dataset (rows)
# and max size (columns).
alt.Chart(data).mark_bar().encode(
    row = "dataset:N",
    column = "max size:N",
    x = alt.X("seq identity:N", axis = alt.Axis(format = "~%")),
    y = alt.Y("% wrong", axis = alt.Axis(format = "%"))
).properties(
    title = "Uniclust 30 Accuracy",
    width = 150,
    height = 150
)

In [7]:
# "wrong % error" per sequence-identity bucket, faceted by dataset (rows)
# and max size (columns).
alt.Chart(data).mark_bar().encode(
    row = "dataset",
    column = "max size:N",
    x = alt.X("seq identity:N", axis = alt.Axis(format = "~%")),
    y = alt.Y("wrong % error", axis = alt.Axis(format = "%"))
).properties(
    title = "Uniclust 30 Wrong % Error",
    width = 150,
    height = 150
)

In [19]:
# Total the per-bucket counts for every (dataset, max size) pair, then
# recompute the overall fraction of wrong results from those totals.
agg_data = (
    data.groupby(["dataset", "max size"])[["count", "wrong"]]
    .sum()
    .reset_index()
)
agg_data["% wrong"] = agg_data["wrong"] / agg_data["count"]
agg_data

Unnamed: 0,dataset,max size,count,wrong,% wrong
0,uc30,32,7000,1294,0.184857
1,uc30,256,7000,182,0.026
2,uc30_0.95,32,7000,238,0.034
3,uc30_0.95,256,7000,34,0.004857


In [20]:
# One bar per max size inside each dataset column; the x axis is hidden because
# the color legend already identifies the max size.
alt.Chart(agg_data).mark_bar().encode(
    column = alt.Column("dataset", header = alt.Header(orient = "bottom")),
    color = alt.Color("max size:N"),
    x = alt.X("max size:N", axis = None),
    y = alt.Y("% wrong", axis = alt.Axis(format = "%"))
).properties(
    title = "Overall Uniclust 30 Accuracy",
    width = 50,
    height = 100
)

## Uniclust 30 Benchmark

In [21]:
# Run the `uc_bench` example from the parent directory and capture its stdout lines.
output = !cd .. && RUSTFLAGS="-C target-cpu=native" cargo run --example uc_bench --release --quiet
output

['# time (s)',
 'algorithm, dataset, time',
 'ours, uc30, 0.148956412',
 'ours, uc30 0.95, 0.128219909',
 'parasail, uc30, 1.145408316',
 'parasail, uc30 0.95, 1.331269761']

In [22]:
# Parse the captured comma-separated lines into a DataFrame and display it.
data = csv_to_pandas(output)
data

Unnamed: 0,algorithm,dataset,time
0,ours,uc30,0.148956
1,ours,uc30 0.95,0.12822
2,parasail,uc30,1.145408
3,parasail,uc30 0.95,1.33127


In [23]:
# One bar per algorithm inside each dataset column; the x axis is hidden because
# the color legend already identifies the algorithm.
alt.Chart(data).mark_bar().encode(
    column = alt.Column("dataset", header = alt.Header(orient = "bottom")),
    color = "algorithm",
    x = alt.X("algorithm", axis = None),
    y = alt.Y("time", axis = alt.Axis(title = "time (s)"))
).properties(
    title = "Overall Uniclust 30 Speed",
    width = 50,
    height = 100
)

## Setup

To run the comparisons and benchmarks below, you need to clone the following repos, place them in the same directory where this repo (block aligner) is located, and follow their setup instructions:
* [diff-bench-paper](https://github.com/Daniel-Liu-c0deb0t/diff-bench-paper)
* [adaptivebandbench](https://github.com/Daniel-Liu-c0deb0t/adaptivebandbench)

## Nanopore Compare

In [80]:
# Run custom_scores.sh inside diff-bench-paper's benchmark_codes directory, merge
# stderr into stdout, and keep only the lines that mention a ".tsv" file.
output = !cd ../../diff-bench-paper/supplementary_data/benchmark_codes && ./custom_scores.sh 2>&1 | grep '\.tsv'
output

['scores_l1000.tsv',
 'scores_l10000.tsv',
 'scores_l25000.tsv',
 'scores_default.tsv']

In [81]:
# Derive a length label from each file name: the text between "scores_" and the
# first ".", with a leading "l" stripped when present.
tags = [f[len("scores_"):f.index(".")] for f in output]
lengths = [tag[1:] if tag[0] == "l" else tag for tag in tags]
lengths

['1000', '10000', '25000', 'default']

In [83]:
# The score files are addressed relative to the parent directory that the
# command below changes into.
path_prefix = "../diff-bench-paper/"
# Run the `compare` example once per score file and collect each run's captured
# stdout lines.
outputs = []
for f in output:
    o = !cd .. && RUSTFLAGS="-C target-cpu=native" cargo run --example compare --release --quiet -- {path_prefix + f}
    outputs.append(o)
outputs

[['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 16, 0.1168083944994962, 1649, 0.014457400441640959',
  '',
  '64, 1734, 2, 0.0035435779816513765, 1674, 0.01808772210169801',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 112, 0.11158480489223006, 1558, 0.00231192171488459',
  '',
  '64, 1734, 10, 0.02004804219719002, 1681, 0.01227283972235607',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 196, 0.12252339253065558, 906, 0.0033348114113708398',
  '',
  '64, 1734, 6, 0.06846053563562754, 1113, 0.028906498881623614',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 220, 0.11254060955901618, 50, 0.050686760651400126',
  '',
  '64, 1734, 10, 0.04866354196777517, 241, 0.13470260660734062',
  '# Done!']]

In [100]:
# Parse every compare run and stack the frames, keyed by the matching length
# label; the outer index level becomes the "length" column and the inner
# (per-frame row number) level is dropped.
frames = [csv_to_pandas(o) for o in outputs]
data = (
    pd.concat(frames, keys = lengths)
    .reset_index()
    .drop(columns = ["level_1"])
    .rename(columns = {"level_0": "length"})
)
# NOTE(review): every row gets the constant band width 32 -- presumably the band
# width of the compared tool; confirm against the diff-bench-paper setup.
data["band width"] = 32
data

Unnamed: 0,length,max size,total,other better,other % better,us better,us % better,band width
0,1000,32,1734,16,0.116808,1649,0.014457,32
1,1000,64,1734,2,0.003544,1674,0.018088,32
2,10000,32,1734,112,0.111585,1558,0.002312,32
3,10000,64,1734,10,0.020048,1681,0.012273,32
4,25000,32,1734,196,0.122523,906,0.003335,32
5,25000,64,1734,6,0.068461,1113,0.028906,32
6,default,32,1734,220,0.112541,50,0.050687,32
7,default,64,1734,10,0.048664,241,0.134703,32


In [101]:
# Run custom_scores.sh inside the adaptivebandbench checkout, merge stderr into
# stdout, and keep only the lines that mention a ".tsv" file.
output = !cd ../../adaptivebandbench && ./custom_scores.sh 2>&1 | grep '\.tsv'
output

['scores_l1000_b256.tsv',
 'scores_l10000_b256.tsv',
 'scores_l10000_b2048.tsv',
 'scores_b256.tsv',
 'scores_b2048.tsv']

In [103]:
# Split each "scores_[l<length>_]b<band>.tsv" file name into a length label
# ("default" when there is no "l" part) and a band width label.
lengths = []
band_widths = []
for fname in output:
    stem = fname[len("scores_"):fname.index(".")]
    if stem[0] == "l":
        sep = stem.index("_")
        lengths.append(stem[1:sep])
        # Continue with whatever follows the length part.
        stem = stem[sep + 1:]
    else:
        lengths.append("default")
    if stem[0] == "b":
        band_widths.append(stem[1:])
print(lengths)
print(band_widths)

['1000', '10000', '10000', 'default', 'default']
['256', '256', '2048', '256', '2048']


In [104]:
# The score files are addressed relative to the parent directory that the
# command below changes into.
path_prefix = "../adaptivebandbench/"
# Run the `compare` example once per score file and collect each run's captured
# stdout lines.
outputs = []
for f in output:
    o = !cd .. && RUSTFLAGS="-C target-cpu=native" cargo run --example compare --release --quiet -- {path_prefix + f}
    outputs.append(o)
outputs

[['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 46, 0.16641503565322546, 0, NaN',
  '',
  '64, 1734, 4, 0.032013001111673656, 0, NaN',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 135, 0.15257955216457833, 1589, 0.618058455288836',
  '',
  '64, 1734, 104, 0.046379245670845254, 1614, 0.619612581289139',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 202, 0.17373604648395774, 0, NaN',
  '',
  '64, 1734, 117, 0.05988456581435015, 0, NaN',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 137, 0.1687138353530413, 1593, 0.8412070978900152',
  '',
  '64, 1734, 110, 0.06935021417881979, 1622, 0.8435908523540999',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 269, 0.21942860528456848, 382, 0.1471431574003461'

In [107]:
# Parse every compare run and stack the frames, keyed by (length, band width);
# the two outer index levels become columns and the innermost (per-frame row
# number) level is dropped.
frames = [csv_to_pandas(o) for o in outputs]
index = list(zip(lengths, band_widths))
data2 = (
    pd.concat(frames, keys = index)
    .reset_index()
    .drop(columns = ["level_2"])
    .rename(columns = {"level_0": "length", "level_1": "band width"})
)
data2

Unnamed: 0,length,band width,max size,total,other better,other % better,us better,us % better
0,1000,256,32,1734,46,0.166415,0,
1,1000,256,64,1734,4,0.032013,0,
2,10000,256,32,1734,135,0.15258,1589,0.618058
3,10000,256,64,1734,104,0.046379,1614,0.619613
4,10000,2048,32,1734,202,0.173736,0,
5,10000,2048,64,1734,117,0.059885,0,
6,default,256,32,1734,137,0.168714,1593,0.841207
7,default,256,64,1734,110,0.06935,1622,0.843591
8,default,2048,32,1734,269,0.219429,382,0.147143
9,default,2048,64,1734,136,0.089012,420,0.15565


## Nanopore Benchmark

In [1]:
# Run the `nanopore_bench` example from the parent directory and capture its stdout lines.
output = !cd .. && RUSTFLAGS="-C target-cpu=native" cargo run --example nanopore_bench --release --quiet
output

['# time (s)',
 'algorithm, dataset, time',
 'ours, nanopore, 1.155175678',
 'ours, random, 2.742632555']

In [17]:
# Parse the captured comma-separated lines into a DataFrame and display it.
data = csv_to_pandas(output)
data

Unnamed: 0,algorithm,dataset,time
0,ours,nanopore,1.155176
1,ours,random,2.742633


In [7]:
output2 = !cd ../../diff-bench-paper/supplementary_data/benchmark_codes && ./custom_bench.sh

for i, o in enumerate(output2):
    if o.startswith("cells("):
        break
output2 = output2[i + 1:]

output2.insert(0, "algorithm\tfill time\ttrace time\tconvert time\ttotal time\tscore\tfail")
output2

['algorithm\tfill time\ttrace time\tconvert time\ttotal time\tscore\tfail',
 'editdist\t469288000\t168382000\t66304000\t703974000\t6880489\t0',
 'non-diff\t666401000\t285294000\t60148000\t1011843000\t27124786\t52',
 'diff-raw\t610380000\t221439000\t62283000\t894102000\t27291141\t32',
 'libgaba\t434269000\t150610000\t31431000\t616310000\t27121546\t53',
 'edlib\t28158783000\t19313600000\t106577000\t47578960000\t37\t0',
 'seqan\t90648249000\t0\t0\t90648249000\t0\t0']

In [18]:
# Parse the tab-separated benchmark rows (no thousands separator) and display them.
data2 = csv_to_pandas(output2, d = "\t")
data2

Unnamed: 0,algorithm,fill time,trace time,convert time,total time,score,fail
0,editdist,469288000,168382000,66304000,703974000,6880489,0
1,non-diff,666401000,285294000,60148000,1011843000,27124786,52
2,diff-raw,610380000,221439000,62283000,894102000,27291141,32
3,libgaba,434269000,150610000,31431000,616310000,27121546,53
4,edlib,28158783000,19313600000,106577000,47578960000,37,0
5,seqan,90648249000,0,0,90648249000,0,0


In [19]:
# Keep only the algorithm name and its fill time, rename that column to "time",
# and divide by 1e9 (presumably nanoseconds -> seconds, matching the "time (s)"
# axis of the chart below -- confirm against custom_bench.sh).
cleaned2 = (
    data2
    .drop(columns = ["trace time", "convert time", "total time", "score", "fail"])
    .rename(columns = {"fill time": "time"})
    .assign(time = lambda frame: frame["time"] / 1e9)
)
cleaned2

Unnamed: 0,algorithm,time
0,editdist,0.469288
1,non-diff,0.666401
2,diff-raw,0.61038
3,libgaba,0.434269
4,edlib,28.158783
5,seqan,90.648249


In [20]:
# Keep only our timing on the nanopore dataset. The row is selected by dataset
# name rather than by position, so the result does not depend on the order in
# which the benchmark prints its rows.
cleaned = data[data["dataset"] == "nanopore"].drop(columns = ["dataset"])
# Stack our timing on top of the other algorithms' timings. `DataFrame.append`
# was removed in pandas 2.0; `pd.concat` with `ignore_index` gives the same frame.
cleaned = pd.concat([cleaned, cleaned2], ignore_index = True)
cleaned

Unnamed: 0,algorithm,time
0,ours,1.155176
1,editdist,0.469288
2,non-diff,0.666401
3,diff-raw,0.61038
4,libgaba,0.434269
5,edlib,28.158783
6,seqan,90.648249


In [51]:
def _time_by_algorithm():
    """Return fresh x/y encodings shared by both layers: log-scaled time against algorithm, sorted by time."""
    return dict(
        x = alt.X("time", axis = alt.Axis(title = "time (s)", grid = True), scale = alt.Scale(type = "log")),
        y = alt.Y("algorithm", axis = alt.Axis(grid = True), sort = alt.EncodingSortField(field = "time"))
    )

# Layer 1: every algorithm except ours, drawn with the default point color.
chart1 = alt.Chart(cleaned, title = "Nanopore Data Benchmark").mark_point(opacity = 1).encode(
    **_time_by_algorithm()
).transform_filter(datum.algorithm != "ours")

# Layer 2: our algorithm only, highlighted in red.
chart2 = alt.Chart(cleaned).mark_point(color = "red").encode(
    **_time_by_algorithm()
).transform_filter(datum.algorithm == "ours")

(chart1 + chart2).properties(
    width = 150,
    height = 150
)