# Block Aligner Accuracy Analysis and Visualizations

This notebook contains code for collecting, cleaning, and analyzing data produced by block aligner's experiments.

To run this, you will need to install all the libraries imported below, along with [altair-data-server](https://github.com/altair-viz/altair_data_server) and [altair-saver](https://github.com/altair-viz/altair_saver), which has some extra dependencies for PDF saving.

Run each cell one by one to reproduce the experiments.

In [17]:
import altair as alt
from altair_saver import save
from altair import datum
import pandas as pd
from io import StringIO

# Select the "data_server" data transformer for all charts built below
# (presumably registered by the altair-data-server package — TODO confirm).
alt.data_transformers.enable("data_server")

DataTransformerRegistry.enable('data_server')

In [18]:
def csv_to_pandas(csv, d = "\\s*,\\s*", t = None):
    """Parse an iterable of CSV text lines into a DataFrame.

    Parameters
    ----------
    csv : iterable of str
        Lines of CSV text; they are joined with newlines before parsing.
    d : str
        Separator handed to ``pd.read_csv`` as ``sep``. The default is a regex
        matching a comma with optional whitespace on either side.
    t : str or None
        Thousands separator handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. ``#`` starts a comment that runs to the end of the line.
    """
    text = "\n".join(csv)
    # A regex separator needs the Python parsing engine.
    read_options = {"sep": d, "thousands": t, "comment": "#", "engine": "python"}
    return pd.read_csv(StringIO(text), **read_options)

def file_to_pandas(path):
    """Read the CSV file at ``path`` into a DataFrame.

    Fields are split on a comma with optional surrounding whitespace, and ``#``
    starts a comment that runs to the end of the line.
    """
    comma_with_padding = "\\s*,\\s*"
    # A regex separator needs the Python parsing engine.
    table = pd.read_csv(path, sep = comma_with_padding, comment = "#", engine = "python")
    return table

## Block Aligner Image

In [3]:
# Run the `block_img` example from the parent directory, passing it the two image
# paths under vis/ (presumably the files it writes; they are displayed below).
!cd .. && cargo run --example block_img --release --features simd_avx2 --quiet -- vis/block_img1.png vis/block_img2.png

path: vis/block_img1.png, img size: 660 x 549
path: vis/block_img2.png, img size: 384 x 428


<img src = "block_img1.png" width = "300px" />
<img src = "block_img2.png" width = "300px" />

## Random Data Accuracy

In [4]:
# Run the `accuracy` example from the parent directory and capture its output as a list of lines.
output = !cd .. && cargo run --example accuracy --release --features simd_avx2 --quiet
output

['',
 'len, k, insert, iter, max size, wrong, wrong % error, wrong min, wrong max',
 '',
 '',
 '100, 10, false, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 10, false, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 10, true, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 10, true, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 20, false, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 20, false, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 20, true, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 20, true, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 50, false, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 50, false, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 50, true, 100, 32, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '100, 50, true, 100, 2048, 0, NaN, 2147483647, -2147483648',
 '',
 '',
 '1000, 100, false, 100, 32, 0, NaN, 2147483647,

In [5]:
# Parse the captured output lines into a DataFrame and display it.
data = csv_to_pandas(output)
data

Unnamed: 0,len,k,insert,iter,max size,wrong,wrong % error,wrong min,wrong max
0,100,10,False,100,32,0,,2147483647,-2147483648
1,100,10,False,100,2048,0,,2147483647,-2147483648
2,100,10,True,100,32,0,,2147483647,-2147483648
3,100,10,True,100,2048,0,,2147483647,-2147483648
4,100,20,False,100,32,0,,2147483647,-2147483648
5,100,20,False,100,2048,0,,2147483647,-2147483648
6,100,20,True,100,32,0,,2147483647,-2147483648
7,100,20,True,100,2048,0,,2147483647,-2147483648
8,100,50,False,100,32,0,,2147483647,-2147483648
9,100,50,False,100,2048,0,,2147483647,-2147483648


In [6]:
# Ratio columns used by the chart below: `wrong` per `iter`, and `k` per `len`.
data["% wrong"] = data["wrong"].div(data["iter"])
data["k %"] = data["k"].div(data["len"])

Error Rate on Random DNA Sequences with 10% Insert

In [7]:
# Point chart of "% wrong" against "k %" for the rows with insert == True, one facet
# row per length; the chart is also written to random_dna_accuracy.pdf.
error_axis = alt.Axis(format = "%")
k_axis = alt.Axis(format = "~%", grid = True)
length_header = alt.Header(title = "length")

c = (
    alt.Chart(data)
    # `datum.insert == True` builds an Altair filter expression, not a Python boolean.
    .transform_filter(datum.insert == True)
    .mark_point(opacity = 1, filled = True)
    .encode(
        x = alt.X("% wrong", axis = error_axis),
        y = alt.Y("k %:N", axis = k_axis),
        color = "max size:N",
        shape = "max size:N",
        row = alt.Row("len:N", header = length_header),
    )
    .properties(width = 100, height = 50)
)
save(c, "random_dna_accuracy.pdf")
c

## Uniclust 30 Data Accuracy

In [18]:
# Run the `uc_accuracy` example from the parent directory and capture its output as a list of lines.
output = !cd .. && cargo run --example uc_accuracy --release --features simd_avx2 --quiet
output

['# seq identity is lower bound (inclusive)',
 'dataset, size, seq identity, count, wrong, wrong % error',
 'uc30_0.95, 32-32, 0, 0, 0, NaN',
 'uc30_0.95, 32-32, 0.1, 0, 0, NaN',
 'uc30_0.95, 32-32, 0.2, 14, 0, NaN',
 'uc30_0.95, 32-32, 0.3, 873, 48, 0.23644643580689115',
 'uc30_0.95, 32-32, 0.4, 1166, 72, 0.2588941794696058',
 'uc30_0.95, 32-32, 0.5, 951, 38, 0.26989686669054164',
 'uc30_0.95, 32-32, 0.6, 923, 29, 0.26600576925717145',
 'uc30_0.95, 32-32, 0.7, 789, 21, 0.2507672376509289',
 'uc30_0.95, 32-32, 0.8, 747, 10, 0.21092575017184195',
 'uc30_0.95, 32-32, 0.9, 1537, 20, 0.13902065073668682',
 '',
 '# total: 7000, wrong: 238, wrong % error: 0.24418420416118747, length avg: 329.554, length min: 22, length max: 8881, dp fraction: 0.3389349961353417',
 '',
 'uc30_0.95, 32-256, 0, 0, 0, NaN',
 'uc30_0.95, 32-256, 0.1, 0, 0, NaN',
 'uc30_0.95, 32-256, 0.2, 14, 0, NaN',
 'uc30_0.95, 32-256, 0.3, 873, 6, 0.022628943599678163',
 'uc30_0.95, 32-256, 0.4, 1166, 8, 0.07534822871256201',


In [19]:
# Parse the captured output lines into a DataFrame ("#" comment lines are dropped) and display it.
data = csv_to_pandas(output)
data

Unnamed: 0,dataset,size,seq identity,count,wrong,wrong % error
0,uc30_0.95,32-32,0.0,0,0,
1,uc30_0.95,32-32,0.1,0,0,
2,uc30_0.95,32-32,0.2,14,0,
3,uc30_0.95,32-32,0.3,873,48,0.236446
4,uc30_0.95,32-32,0.4,1166,72,0.258894
5,uc30_0.95,32-32,0.5,951,38,0.269897
6,uc30_0.95,32-32,0.6,923,29,0.266006
7,uc30_0.95,32-32,0.7,789,21,0.250767
8,uc30_0.95,32-32,0.8,747,10,0.210926
9,uc30_0.95,32-32,0.9,1537,20,0.139021


In [20]:
# Ratio of wrong results to the number of entries in each sequence identity bin.
data["% wrong"] = data["wrong"] / data["count"]

# Range labels keyed by each bin's inclusive lower bound.
seq_identity_labels = {
    0.0: "0-10%",
    0.1: "10-20%",
    0.2: "20-30%",
    0.3: "30-40%",
    0.4: "40-50%",
    0.5: "50-60%",
    0.6: "60-70%",
    0.7: "70-80%",
    0.8: "80-90%",
    0.9: "90-100%"
}
# Values without a label pass through unchanged, so re-running this cell keeps the
# already-labelled column intact instead of turning every entry into NaN.
data["seq identity"] = data["seq identity"].map(lambda v: seq_identity_labels.get(v, v))
data

Unnamed: 0,dataset,size,seq identity,count,wrong,wrong % error,% wrong
0,uc30_0.95,32-32,0-10%,0,0,,
1,uc30_0.95,32-32,10-20%,0,0,,
2,uc30_0.95,32-32,20-30%,14,0,,0.0
3,uc30_0.95,32-32,30-40%,873,48,0.236446,0.054983
4,uc30_0.95,32-32,40-50%,1166,72,0.258894,0.06175
5,uc30_0.95,32-32,50-60%,951,38,0.269897,0.039958
6,uc30_0.95,32-32,60-70%,923,29,0.266006,0.031419
7,uc30_0.95,32-32,70-80%,789,21,0.250767,0.026616
8,uc30_0.95,32-32,80-90%,747,10,0.210926,0.013387
9,uc30_0.95,32-32,90-100%,1537,20,0.139021,0.013012


Uniclust30 Error Rate

In [21]:
# Bar chart of "% wrong" per sequence identity bin, faceted by dataset (rows) and
# size (columns); the chart is also written to uniclust30_accuracy.pdf.
size_order = ["32-32", "32-256", "256-256"]

c = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x = "seq identity",
        y = alt.Y("% wrong", axis = alt.Axis(format = "%")),
        column = alt.Column("size", sort = size_order),
        row = "dataset",
        color = alt.Color("size", legend = None, sort = size_order),
    )
    .properties(width = 100, height = 100)
)
save(c, "uniclust30_accuracy.pdf")
c

Uniclust30 % Error

In [22]:
# Bar chart of "wrong % error" per sequence identity bin, faceted by dataset (rows) and
# size (columns); the chart is also written to uniclust30_percent_error.pdf.
size_order = ["32-32", "32-256", "256-256"]

c = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x = "seq identity",
        y = alt.Y("wrong % error", axis = alt.Axis(format = "%")),
        column = alt.Column("size", sort = size_order),
        row = "dataset",
        color = alt.Color("size", legend = None, sort = size_order),
    )
    .properties(width = 100, height = 100)
)
save(c, "uniclust30_percent_error.pdf")
c

In [23]:
# Sum `count` and `wrong` over all identity bins of each (dataset, size) pair, then
# compute the overall ratio of wrong results per pair.
agg_data = (
    data.groupby(["dataset", "size"])[["count", "wrong"]]
    .sum()
    .reset_index()
)
agg_data["% wrong"] = agg_data["wrong"].div(agg_data["count"])
agg_data

Unnamed: 0,dataset,size,count,wrong,% wrong
0,uc30,256-256,7000,28,0.004
1,uc30,32-256,7000,182,0.026
2,uc30,32-32,7000,1294,0.184857
3,uc30_0.95,256-256,7000,4,0.000571
4,uc30_0.95,32-256,7000,34,0.004857
5,uc30_0.95,32-32,7000,238,0.034


Overall Uniclust30 Error Rate

In [24]:
# Bar chart of the aggregated "% wrong" per size, one facet column per dataset;
# the chart is also written to uniclust30_overall_accuracy.pdf.
size_order = ["32-32", "32-256", "256-256"]

c = (
    alt.Chart(agg_data)
    .mark_bar()
    .encode(
        # The x axis is hidden; the color legend identifies each size.
        x = alt.X("size", axis = None, sort = size_order),
        y = alt.Y("% wrong", axis = alt.Axis(format = "%")),
        column = alt.Column("dataset", header = alt.Header(orient = "bottom")),
        color = alt.Color("size", sort = size_order),
    )
    .properties(width = 50, height = 100)
)
save(c, "uniclust30_overall_accuracy.pdf")
c

In [78]:
# Load ../data/uc_accuracy.csv into a DataFrame and display it.
data = file_to_pandas("../data/uc_accuracy.csv")
data

Unnamed: 0,dataset,size,query len,reference len,seq id,pred score,true score
0,uc30_0.95,32-32,1944,1871,0.362095,1656,2909
1,uc30_0.95,32-32,804,808,0.363745,1184,1184
2,uc30_0.95,32-32,4242,4122,0.427032,6701,7639
3,uc30_0.95,32-32,230,232,0.400000,405,405
4,uc30_0.95,32-32,264,259,0.324528,390,390
...,...,...,...,...,...,...,...
41995,uc30,256-256,542,542,0.996310,2862,2862
41996,uc30,256-256,277,303,0.762376,1103,1103
41997,uc30,256-256,65,65,0.907692,307,307
41998,uc30,256-256,45,56,0.732143,195,195


Uniclust30 Our Score vs True Score (AVX2)

In [79]:
# Binned scatter of "pred score" against "true score", colored by the number of rows
# in each bin and faceted by dataset (rows) and size (columns); the chart is also
# written to uniclust30_scores.pdf.
score_bins = alt.Bin(maxbins = 50)
size_order = ["32-32", "32-256", "256-256"]

c = (
    alt.Chart(data)
    .mark_circle()
    .encode(
        x = alt.X("true score", bin = score_bins),
        y = alt.Y("pred score", bin = score_bins),
        row = "dataset",
        column = alt.Column("size", header = alt.Header(orient = "bottom"), sort = size_order),
        color = alt.Color("count():Q", scale = alt.Scale(scheme = "greenblue")),
    )
    .properties(width = 200, height = 200)
)
save(c, "uniclust30_scores.pdf")
c

## DNA Reads Global Alignment

In [29]:
# Run the `nanopore_accuracy` example from the parent directory and capture its output as a list of lines.
output = !cd .. && cargo run --example nanopore_accuracy --release --features simd_avx2 --quiet
output

['',
 'dataset, size, total, wrong, wrong % error',
 '',
 'illumina, 32-32, 100000, 0, NaN',
 '',
 'nanopore 1kbp, 32-64, 12477, 377, 0.2014298964717687',
 '',
 'nanopore 25kbp, 32-256, 1734, 100, 0.08195990950830861',
 '# Done!']

In [30]:
# Parse the captured output lines into a DataFrame ("#" comment lines are dropped) and display it.
data = csv_to_pandas(output)
data

Unnamed: 0,dataset,size,total,wrong,wrong % error
0,illumina,32-32,100000,0,
1,nanopore 1kbp,32-64,12477,377,0.20143
2,nanopore 25kbp,32-256,1734,100,0.08196


In [31]:
# Build the formatted summary in its own variable so that `data` is left untouched and
# this cell can be re-run (renaming "total" on `data` itself makes a second run fail
# with a KeyError).
summary = (
    data.assign(**{"error rate": data["wrong"] / data["total"]})
    .loc[:, ["dataset", "size", "total", "wrong", "error rate", "wrong % error"]]
    .fillna(0)  # a NaN "wrong % error" is reported as 0
    .rename(columns = {"total": "reads"})
)
# Render both ratio columns as percentages with one decimal place.
for ratio_column in ["error rate", "wrong % error"]:
    summary[ratio_column] = summary[ratio_column].map("{:.1%}".format)
print(summary)

          dataset    size   reads  wrong error rate wrong % error
0        illumina   32-32  100000      0       0.0%          0.0%
1   nanopore 1kbp   32-64   12477    377       3.0%         20.1%
2  nanopore 25kbp  32-256    1734    100       5.8%          8.2%


In [80]:
# Load ../data/nanopore_accuracy.csv into a DataFrame and display it.
data = file_to_pandas("../data/nanopore_accuracy.csv")
data

Unnamed: 0,dataset,size,query len,reference len,pred score,true score
0,illumina,32-32,101,101,97,97
1,illumina,32-32,101,101,99,99
2,illumina,32-32,101,101,99,99
3,illumina,32-32,101,101,99,99
4,illumina,32-32,101,101,99,99
...,...,...,...,...,...,...
24886,illumina,32-32,101,101,101,101
24887,illumina,32-32,101,101,101,101
24888,illumina,32-32,101,101,101,101
24889,illumina,32-32,101,101,101,101


DNA Global Alignment Our Score vs True Score (AVX2)

In [81]:
# Binned scatter of "pred score" against "true score", colored by the number of rows
# in each bin and faceted by dataset (rows) and size (columns); the chart is also
# written to dna_scores.pdf.
#
# The DNA read datasets use the sizes 32-32, 32-64 and 32-256 (see the summary table
# above), so the facet columns are ordered by that list.
size_order = ["32-32", "32-64", "32-256"]

c = alt.Chart(data).mark_circle().encode(
    x = alt.X("true score", bin = alt.Bin(maxbins = 50)),
    y = alt.Y("pred score", bin = alt.Bin(maxbins = 50)),
    row = "dataset",
    column = alt.Column("size", header = alt.Header(orient = "bottom"), sort = size_order),
    color = alt.Color("count():Q", scale = alt.Scale(scheme = "greenblue"))
).properties(
    width = 200,
    height = 200
)
save(c, "dna_scores.pdf")
c

## Nanopore Data Compare Setup

To run the comparisons below, you need to clone the following repos, place them in the same directory where this repo (block aligner) is located, and follow their setup instructions:
* [diff-bench-paper](https://github.com/Daniel-Liu-c0deb0t/diff-bench-paper)
* [adaptivebandbench](https://github.com/Daniel-Liu-c0deb0t/adaptivebandbench)

## Nanopore Data Compare

In [32]:
# Run custom_scores.sh inside the diff-bench-paper checkout, merge stderr into stdout,
# and keep only the lines that mention a .tsv file.
output = !cd ../../diff-bench-paper/supplementary_data/benchmark_codes && ./custom_scores.sh 2>&1 | grep '\.tsv'
output

['scores_l1000.tsv',
 'scores_l10000.tsv',
 'scores_l25000.tsv',
 'scores_default.tsv']

In [33]:
# Turn each file name into a length label: the text between "scores_" and the first
# "." with a leading "l" removed, e.g. "scores_l1000.tsv" -> "1000" and
# "scores_default.tsv" -> "default".
prefix_len = len("scores_")
stems = [name[prefix_len:name.index(".")] for name in output]
lengths = [stem[1:] if stem[0] == "l" else stem for stem in stems]
lengths

['1000', '10000', '25000', 'default']

In [34]:
# Run the `compare` example once per score file and collect each run's output lines.
# The file path is given relative to the parent directory, where cargo is invoked.
# NOTE(review): the meaning of the trailing argument (50) is defined by the example
# and is not visible here.
path_prefix = "../diff-bench-paper/"
outputs = []
for f in output:
    o = !cd .. && cargo run --example compare --release --features simd_avx2 --quiet -- {path_prefix + f} 50
    outputs.append(o)
outputs

[['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 16, 0.1168083944994962, 1649, 0.014457400441640959',
  '',
  '64, 1734, 2, 0.0035435779816513765, 1674, 0.01808772210169801',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 112, 0.11158480489223006, 1558, 0.00231192171488459',
  '',
  '64, 1734, 10, 0.02004804219719002, 1681, 0.012649624148278275',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 196, 0.12252339253065558, 906, 0.0033348114113708398',
  '',
  '64, 1734, 6, 0.06846053563562754, 1113, 0.029475568621044303',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 220, 0.11254060955901618, 50, 0.050686760651400126',
  '',
  '64, 1734, 10, 0.04866354196777517, 241, 0.13733071706366937',
  '# Done!']]

In [35]:
# Parse every run's output and stack the tables, labelling the rows of each table
# with the length parsed from its file name.
frames = [csv_to_pandas(run_lines) for run_lines in outputs]
data = (
    pd.concat(frames, keys = lengths)
    .reset_index()
    # level_0 holds the concat key (the length); level_1 is the row number within each table.
    .drop(columns = ["level_1"])
    .rename(columns = {"level_0": "length"})
)
data["band width"] = 32
data

Unnamed: 0,length,max size,total,other better,other % better,us better,us % better,band width
0,1000,32,1734,16,0.116808,1649,0.014457,32
1,1000,64,1734,2,0.003544,1674,0.018088,32
2,10000,32,1734,112,0.111585,1558,0.002312,32
3,10000,64,1734,10,0.020048,1681,0.01265,32
4,25000,32,1734,196,0.122523,906,0.003335,32
5,25000,64,1734,6,0.068461,1113,0.029476,32
6,default,32,1734,220,0.112541,50,0.050687,32
7,default,64,1734,10,0.048664,241,0.137331,32


In [36]:
# Run custom_scores.sh inside the adaptivebandbench checkout, merge stderr into stdout,
# and keep only the lines that mention a .tsv file.
output = !cd ../../adaptivebandbench && ./custom_scores.sh 2>&1 | grep '\.tsv'
output

['scores_l1000_b256.tsv',
 'scores_l10000_b256.tsv',
 'scores_l10000_b2048.tsv',
 'scores_b256.tsv',
 'scores_b2048.tsv']

In [37]:
# Parse file names of the form "scores[_l<length>]_b<band width>.tsv" into two
# parallel lists; a name without the "l<length>" part gets the length "default".
lengths = []
band_widths = []
for f in output:
    stem = f[len("scores_"):f.index(".")]
    length = "default"
    if stem[0] == "l":
        # "l1000_b256" -> length "1000", remaining stem "b256"
        length, _, stem = stem[1:].partition("_")
    if stem[:1] != "b":
        # Both lists are zipped together later, so a missing band width must not be
        # skipped silently: that would pair every later band width with the wrong file.
        raise ValueError("no band width in file name: {!r}".format(f))
    lengths.append(length)
    band_widths.append(stem[1:])
print(lengths)
print(band_widths)

['1000', '10000', '10000', 'default', 'default']
['256', '256', '2048', '256', '2048']


In [38]:
# Run the `compare` example once per score file and collect each run's output lines.
# The file path is given relative to the parent directory, where cargo is invoked.
# NOTE(review): the meaning of the trailing argument (100000) is defined by the example
# and is not visible here.
path_prefix = "../adaptivebandbench/"
outputs = []
for f in output:
    o = !cd .. && cargo run --example compare --release --features simd_avx2 --quiet -- {path_prefix + f} 100000
    outputs.append(o)
outputs

[['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 46, 0.16641503565322546, 0, NaN',
  '',
  '64, 1734, 4, 0.032013001111673656, 0, NaN',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 133, 0.12986009851978048, 1591, 0.6174163601756474',
  '',
  '64, 1734, 104, 0.02947198288519112, 1614, 0.6196860925554909',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 202, 0.160174374234026, 0, NaN',
  '',
  '64, 1734, 117, 0.042759699950194734, 0, NaN',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 135, 0.12995297021591998, 1595, 0.8408498183780575',
  '',
  '64, 1734, 110, 0.0356590723709368, 1622, 0.8438482870550866',
  '# Done!'],
 ['max size, total, other better, other % better, us better, us % better',
  '',
  '32, 1734, 269, 0.19676765418388104, 382, 0.1471431574003461'

In [39]:
# Parse every run's output and stack the tables, labelling the rows of each table
# with the (length, band width) pair parsed from its file name.
frames = [csv_to_pandas(run_lines) for run_lines in outputs]
data2 = (
    pd.concat(frames, keys = list(zip(lengths, band_widths)))
    .reset_index()
    # level_0 / level_1 hold the concat key; level_2 is the row number within each table.
    .drop(columns = ["level_2"])
    .rename(columns = {"level_0": "length", "level_1": "band width"})
)
data2

Unnamed: 0,length,band width,max size,total,other better,other % better,us better,us % better
0,1000,256,32,1734,46,0.166415,0,
1,1000,256,64,1734,4,0.032013,0,
2,10000,256,32,1734,133,0.12986,1591,0.617416
3,10000,256,64,1734,104,0.029472,1614,0.619686
4,10000,2048,32,1734,202,0.160174,0,
5,10000,2048,64,1734,117,0.04276,0,
6,default,256,32,1734,135,0.129953,1595,0.84085
7,default,256,64,1734,110,0.035659,1622,0.843848
8,default,2048,32,1734,269,0.196768,382,0.147143
9,default,2048,64,1734,136,0.057153,420,0.15565


In [40]:
# For both comparison tables, express the "other better" and "us better" counts as a
# ratio of `total`.
for frame in (data, data2):
    for side in ("other better", "us better"):
        frame[side + " %"] = frame[side] / frame["total"]

In [41]:
# Whatever is left after subtracting both "better" ratios is the ratio of ties.
for frame in (data, data2):
    frame["equal %"] = 1.0 - frame["other better %"] - frame["us better %"]

In [42]:
# Reshape both comparison tables to long form (one row per outcome ratio) and rename
# the outcomes to the labels shown in the chart legends.
id_columns = ["length", "band width", "max size"]
share_columns = ["us better %", "other better %", "equal %"]

cleaned = data.melt(id_vars = id_columns, value_vars = share_columns)
cleaned["variable"] = cleaned["variable"].map(
    {"us better %": "ours better %", "other better %": "adaptive banding better %", "equal %": "equal %"}
)

cleaned2 = data2.melt(id_vars = id_columns, value_vars = share_columns)
cleaned2["variable"] = cleaned2["variable"].map(
    {"us better %": "ours better %", "other better %": "static banding better %", "equal %": "equal %"}
)

In [43]:
# Numeric rank of each outcome label, used by the charts to order the stacked bar
# segments and the legend. Series.map looks the rank up for the whole column at once;
# the assertions keep an unknown label from silently becoming NaN.
order = {"ours better %": 0, "equal %": 1, "adaptive banding better %": 2}
cleaned["order"] = cleaned["variable"].map(order)
assert cleaned["order"].notna().all(), "unexpected label in cleaned['variable']"

order = {"ours better %": 0, "equal %": 1, "static banding better %": 2}
cleaned2["order"] = cleaned2["variable"].map(order)
assert cleaned2["order"].notna().all(), "unexpected label in cleaned2['variable']"

Comparison with Adaptive Banding

In [44]:
# Stacked bars of the outcome ratios per length, one facet row per max size; segments
# and legend entries follow the "order" column. The chart is also written to
# compare_adaptive_banding.pdf.
stack_order = alt.EncodingSortField(field = "order")

c = (
    alt.Chart(cleaned)
    .mark_bar()
    .encode(
        x = "length",
        y = alt.Y("sum(value)", axis = alt.Axis(title = "", format = "%")),
        color = alt.Color("variable", title = "", sort = stack_order),
        row = "max size:N",
        order = "order",
    )
    .properties(width = 100, height = 100)
)
save(c, "compare_adaptive_banding.pdf")
c

Comparison with Static Banding

In [45]:
# Stacked bars of the outcome ratios per length, faceted by max size (rows) and band
# width (columns); segments and legend entries follow the "order" column. The chart is
# also written to compare_diagonal.pdf.
stack_order = alt.EncodingSortField(field = "order")

c = (
    alt.Chart(cleaned2)
    .mark_bar()
    .encode(
        x = "length",
        y = alt.Y("sum(value)", axis = alt.Axis(title = "", format = "%")),
        color = alt.Color("variable", title = "", sort = stack_order),
        row = alt.Row("max size:N", title = "max block size"),
        column = alt.Column("band width:N", title = "static band width", sort = ["256", "2048"]),
        order = "order",
    )
    .properties(width = 100, height = 100)
)
save(c, "compare_diagonal.pdf")
c

## Sequence-to-Profile Alignment Accuracy

In [3]:
# Run the `pssm_accuracy` example from the parent directory and capture its output as a list of lines.
output = !cd .. && cargo run --example pssm_accuracy --release --features simd_avx2 --quiet
output

['size, correct',
 '32-32, 9634',
 '32-64, 10727',
 '32-128, 11083',
 '2048-2048, 11160',
 '# compared to 2048-2048',
 '# Done!']

In [4]:
# Parse the captured output lines into a DataFrame ("#" comment lines are dropped) and display it.
data = csv_to_pandas(output)
data

Unnamed: 0,size,correct
0,32-32,9634
1,32-64,10727
2,32-128,11083
3,2048-2048,11160


In [5]:
# Express each size's `correct` count relative to the 2048-2048 run. The baseline row
# is looked up by its size label, so the result does not depend on the order in which
# the example prints its rows.
baseline_correct = data.loc[data["size"] == "2048-2048", "correct"].iloc[0]
data["% correct"] = data["correct"] / baseline_correct
data

Unnamed: 0,size,correct,% correct
0,32-32,9634,0.863262
1,32-64,10727,0.961201
2,32-128,11083,0.9931
3,2048-2048,11160,1.0


SCOP Sequence-to-Profile Alignment Accuracy (AVX2)

In [6]:
# Bar chart of "% correct" per size, with bars ordered and colored by size; the chart
# is also written to pssm_accuracy.pdf.
size_order = ["32-32", "32-64", "32-128", "2048-2048"]

c = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x = alt.X("size", sort = size_order),
        y = alt.Y("% correct", axis = alt.Axis(format = "%")),
        color = alt.Color("size", sort = size_order, legend = None),
    )
    .properties(width = 60, height = 100)
)
save(c, "pssm_accuracy.pdf")
c

In [28]:
# Load ../data/pssm_accuracy.csv into a DataFrame and display it.
data = file_to_pandas("../data/pssm_accuracy.csv")
data

Unnamed: 0,size,seq len,profile len,pred score,true score
0,32-32,116,116,439,439
1,32-64,116,116,439,439
2,32-128,116,116,439,439
3,2048-2048,116,116,439,439
4,32-32,127,127,531,531
...,...,...,...,...,...
44635,2048-2048,80,80,476,476
44636,32-32,56,56,331,331
44637,32-64,56,56,331,331
44638,32-128,56,56,331,331


SCOP Sequence-to-Profile Alignment Our Score vs True Score (AVX2)

In [77]:
# Binned scatter of "pred score" against "true score", colored by the number of rows
# in each bin, one facet column per size. Rows with size "2048-2048" are filtered out.
# The chart is also written to pssm_scores.pdf.
score_bins = alt.Bin(maxbins = 50)
bottom_header = alt.Header(orient = "bottom")

c = (
    alt.Chart(data)
    .transform_filter(datum.size != "2048-2048")
    .mark_circle()
    .encode(
        x = alt.X("true score", bin = score_bins),
        y = alt.Y("pred score", bin = score_bins),
        column = alt.Column("size", header = bottom_header, sort = ["32-32", "32-64", "32-128"]),
        color = alt.Color("count():Q", title = "count", scale = alt.Scale(scheme = "greenblue")),
    )
    .properties(width = 200, height = 200)
)
save(c, "pssm_scores.pdf")
c