# Block Aligner Visualizations

In [1]:
import altair as alt
import pandas as pd
from io import StringIO

## Uniclust 30 Accuracy

In [2]:
# Run the `uc_accuracy` cargo example from the parent directory (release
# build, compiled with `-C target-cpu=native`) and capture what it prints as a
# list of lines via IPython's `!` shell capture.
# NOTE(review): this recompiles/reruns the example on every execution of the
# cell — presumably slow; consider caching the captured text if so.
output = !cd .. && RUSTFLAGS="-C target-cpu=native" cargo run --example uc_accuracy --release --quiet
output

['# seq identity is lower bound (inclusive)',
 'dataset, max size, seq identity, count, wrong, wrong % error',
 'uc30_0.95, 32, 0, 0, 0, NaN',
 'uc30_0.95, 32, 0.1, 0, 0, NaN',
 'uc30_0.95, 32, 0.2, 14, 0, NaN',
 'uc30_0.95, 32, 0.3, 873, 48, 0.23644643580689115',
 'uc30_0.95, 32, 0.4, 1166, 72, 0.2588941794696058',
 'uc30_0.95, 32, 0.5, 951, 38, 0.26989686669054164',
 'uc30_0.95, 32, 0.6, 923, 29, 0.26600576925717145',
 'uc30_0.95, 32, 0.7, 789, 21, 0.2507672376509289',
 'uc30_0.95, 32, 0.8, 747, 10, 0.21092575017184195',
 'uc30_0.95, 32, 0.9, 1537, 20, 0.13902065073668682',
 '',
 '# total: 7000, wrong: 238, wrong % error: 0.24418420416118747, length avg: 329.554, length min: 22, length max: 8881',
 '',
 'uc30_0.95, 256, 0, 0, 0, NaN',
 'uc30_0.95, 256, 0.1, 0, 0, NaN',
 'uc30_0.95, 256, 0.2, 14, 0, NaN',
 'uc30_0.95, 256, 0.3, 873, 6, 0.022628943599678163',
 'uc30_0.95, 256, 0.4, 1166, 8, 0.07534822871256201',
 'uc30_0.95, 256, 0.5, 951, 4, 0.0327230834375036',
 'uc30_0.95, 256, 0.6,

In [3]:
# Re-assemble the captured lines into one in-memory text buffer and parse it
# as CSV. The separator is a regex that absorbs whitespace on either side of
# each comma (hence the python engine); anything after a "#" is treated as a
# comment and ignored.
out = StringIO("\n".join(output))
data = pd.read_csv(
    out,
    sep=r"\s*,\s*",
    comment="#",
    engine="python",
)
data

Unnamed: 0,dataset,max size,seq identity,count,wrong,wrong % error
0,uc30_0.95,32,0.0,0,0,
1,uc30_0.95,32,0.1,0,0,
2,uc30_0.95,32,0.2,14,0,
3,uc30_0.95,32,0.3,873,48,0.236446
4,uc30_0.95,32,0.4,1166,72,0.258894
5,uc30_0.95,32,0.5,951,38,0.269897
6,uc30_0.95,32,0.6,923,29,0.266006
7,uc30_0.95,32,0.7,789,21,0.250767
8,uc30_0.95,32,0.8,747,10,0.210926
9,uc30_0.95,32,0.9,1537,20,0.139021


In [4]:
# Share of wrong results per row, stored as a fraction (wrong / count), not
# scaled to 0-100 despite the column name. Rows where both values are 0
# evaluate to NaN.
data["% wrong"] = data["wrong"].div(data["count"])

In [5]:
# Small-multiples bar chart: columns are faceted by max size, rows by dataset,
# and each panel plots "% wrong" against the sequence-identity bucket.
alt.Chart(
    data,
    title="Uniclust 30 Accuracy",
).mark_bar().encode(
    x=alt.X("seq identity:N"),
    y=alt.Y("% wrong"),
    column=alt.Column("max size:N"),
    row=alt.Row("dataset:N"),
).properties(width=150, height=150)

In [6]:
# Small-multiples bar chart of the "wrong % error" column reported by the
# benchmark, per sequence-identity bucket. Columns are faceted by max size and
# rows by dataset. Rows whose "wrong % error" is NaN simply have no bar.
alt.Chart(data, title = "Uniclust 30 Wrong % Error").mark_bar().encode(
    x = "seq identity:N",
    y = "wrong % error",
    column = "max size:N",
    # Explicit nominal type, matching the other faceted chart in this
    # notebook, so the facet does not rely on dtype inference.
    row = "dataset:N"
).properties(
    width = 150,
    height = 150
)

In [7]:
# Collapse the per-identity buckets: total `count` and `wrong` for every
# (dataset, max size) pair, then recompute the overall wrong fraction.
# groupby does not modify `data`, so no defensive copy is needed.
agg_data = (
    data
    .groupby(["dataset", "max size"])
    .agg({"count": "sum", "wrong": "sum"})
    .reset_index()
)
agg_data["% wrong"] = agg_data["wrong"].div(agg_data["count"])
agg_data

Unnamed: 0,dataset,max size,count,wrong,% wrong
0,uc30,32,7000,1294,0.184857
1,uc30,256,7000,182,0.026
2,uc30_0.95,32,7000,238,0.034
3,uc30_0.95,256,7000,34,0.004857


In [8]:
# Grouped bar chart of the aggregated wrong fraction: one facet per dataset
# (facet label placed at the bottom), one coloured bar per max size. The x
# axis is hidden because the colour legend already identifies max size.
alt.Chart(
    agg_data,
    title="Overall Uniclust 30 Accuracy",
).mark_bar().encode(
    x=alt.X("max size:N", axis=None),
    y=alt.Y("% wrong"),
    column=alt.Column("dataset", header=alt.Header(orient="bottom")),
    color="max size:N",
).properties(width=50, height=100)

## Random Accuracy