In [None]:
from evaluate_shapes import (
    read_json,
    add_indices,
    get_scores,
    plot_accuracy_vs_threshold,
    check_labels,
    plot_disagreements,
    plot_label_counts,
    fix_labels,
    set_consensus_labels,
    calculate_num_correct,
    get_best_thresholds,
    check_same_curves,
)

import numpy as np
import altair as alt

In [None]:
# load the unlabeled curves; the labeled files are checked against these below
UNLABELED_CURVES_PATH = "small-pdps.json"
curves_original = read_json(UNLABELED_CURVES_PATH)

In [None]:
# read user A's labeled PDPs from the JSON file
labeled_json_a = read_json("dan-small-labeled-pdps.json")
curves_a = add_indices(labeled_json_a)
print(f"{len(curves_a)} curves")

# the labeled curves must match the unlabeled originals
check_same_curves(curves_a, curves_original)

# collect the user's shape label for every curve
labels_a = [curve["shape"] for curve in curves_a]

# for each threshold, get the heuristic's labels and the accuracy of the
# user's labels with respect to them
df_a = get_scores(curves_a)
best_thresholds_a = get_best_thresholds(df_a)
display(best_thresholds_a)

# line chart of the accuracy of the user's labels vs. the threshold
plot_a = plot_accuracy_vs_threshold(df_a)
display(plot_a)

# check for mistakes in the user's labels using the heuristic's labels
# stored in the row(s) where the threshold is 0
zero_threshold_rows_a = df_a[df_a["threshold"] == 0]
heuristic_labels_a = zero_threshold_rows_a["labels"].to_numpy()[0]
bad_labels_a = check_labels(curves_a, heuristic_labels_a)
print("possible mistakes:")
print(bad_labels_a)

In [None]:
# read user B's labeled PDPs from the JSON file
labeled_json_b = read_json("enrico-small-labeled-pdps.json")
curves_b = add_indices(labeled_json_b)
print(f"{len(curves_b)} curves")

# the labeled curves must match the unlabeled originals
check_same_curves(curves_b, curves_original)

# collect the user's shape label for every curve
labels_b = [curve["shape"] for curve in curves_b]

# for each threshold, get the heuristic's labels and the accuracy of the
# user's labels with respect to them
df_b = get_scores(curves_b)
best_thresholds_b = get_best_thresholds(df_b)
display(best_thresholds_b)

# line chart of the accuracy of the user's labels vs. the threshold
plot_b = plot_accuracy_vs_threshold(df_b)
display(plot_b)

# check for mistakes in the user's labels using the heuristic's labels
# stored in the row(s) where the threshold is 0
zero_threshold_rows_b = df_b[df_b["threshold"] == 0]
heuristic_labels_b = zero_threshold_rows_b["labels"].to_numpy()[0]
bad_labels_b = check_labels(curves_b, heuristic_labels_b)
print("possible mistakes:")
bad_labels_b

In [None]:
# compare the two users' labels before user B's labels are corrected in a
# later cell (presumably the number of curves with matching labels)
num_correct_before_fix = calculate_num_correct(labels_a, labels_b)
num_correct_before_fix

In [None]:
# plot the possible mistakes that were flagged in user B's labels:
# the flagged curves, the user's label and the heuristic's label for each
flagged_curves_b = [curves_b[flagged["index"]] for flagged in bad_labels_b]
flagged_user_labels_b = [flagged["user_label"] for flagged in bad_labels_b]
flagged_heuristic_labels_b = [
    flagged["heuristic_label"] for flagged in bad_labels_b
]
plot_disagreements(
    flagged_curves_b,
    flagged_user_labels_b,
    flagged_heuristic_labels_b,
    "User",
    "Heuristic",
)

In [None]:
# the first flagged case does not look like a mistake, but the rest do,
# so only the remaining cases are passed to fix_labels
# NOTE(review): this cell slices bad_labels_b here and overwrites it at the
# end, so re-running the cell applies the slice to the updated list
cases_to_fix_b = bad_labels_b[1:]
fix_labels(curves_b, cases_to_fix_b)

# recompute everything derived from user B's labels after the corrections

labels_b = [curve["shape"] for curve in curves_b]

df_b = get_scores(curves_b)
best_thresholds_b = get_best_thresholds(df_b)
display(best_thresholds_b)

plot_b = plot_accuracy_vs_threshold(df_b)
display(plot_b)

# re-run the check; only the first case from above should still be identified
zero_threshold_rows_b = df_b[df_b["threshold"] == 0]
heuristic_labels_b = zero_threshold_rows_b["labels"].to_numpy()[0]
bad_labels_b = check_labels(curves_b, heuristic_labels_b)
print("possible mistakes:")
bad_labels_b

In [None]:
# check that user A and user B labeled the same PDPs, so that their label
# lists can be compared position by position in the cells below
check_same_curves(curves_a, curves_b)

In [None]:
# plot the label counts for user A's and user B's labels
label_counts_plot = plot_label_counts(labels_a, labels_b)
label_counts_plot

In [None]:
# plot the disagreements between user A's and user B's labels
user_disagreements_plot = plot_disagreements(
    curves_a,
    labels_a,
    labels_b,
    "User A",
    "User B",
)
user_disagreements_plot

In [None]:
# compare user A's labels with user B's corrected labels
# (presumably the number of curves with matching labels)
num_correct_a_b = calculate_num_correct(labels_a, labels_b)
num_correct_a_b

In [None]:
# the same comparison, expressed as a fraction of the number of curves
num_curves_a = len(curves_a)
fraction_correct_a_b = calculate_num_correct(labels_a, labels_b) / num_curves_a
fraction_correct_a_b

Consensus labels for the curves the two labelers disagreed on (index, agreed shape):

- 3 mixed
- 23 mixed
- 33 decreasing
- 36 decreasing
- 48 mixed
- 49 mixed
- 52 mixed
- 56 increasing
- 65 increasing
- 71 decreasing
- 79 mixed
- 87 mixed
- 92 increasing
- 102 mixed
- 111 increasing
- 129 increasing


In [None]:
# consensus shape for each disagreement, keyed by the number used in the
# list above; converted to the list of (number, shape) pairs that is passed
# to set_consensus_labels
corrections = list(
    {
        3: "mixed",
        23: "mixed",
        33: "decreasing",
        36: "decreasing",
        48: "mixed",
        49: "mixed",
        52: "mixed",
        56: "increasing",
        65: "increasing",
        71: "decreasing",
        79: "mixed",
        87: "mixed",
        92: "increasing",
        102: "mixed",
        111: "increasing",
        129: "increasing",
    }.items()
)

In [None]:
# re-read user A's file to get a separate copy of the curves to modify
consensus_json = read_json("dan-small-labeled-pdps.json")
curves_consensus = add_indices(consensus_json)
# set the consensus labels for the disagreements
set_consensus_labels(curves_consensus, labels_a, labels_b, corrections)

# collect the consensus shape label for every curve
labels_consensus = [curve["shape"] for curve in curves_consensus]

# for each threshold, get the heuristic's labels and the accuracy of the
# consensus labels with respect to them
df_consensus = get_scores(curves_consensus)
best_thresholds_consensus = get_best_thresholds(df_consensus)
display(best_thresholds_consensus)

# line chart of the accuracy of the consensus labels vs. the threshold
plot_consensus = plot_accuracy_vs_threshold(df_consensus)
display(plot_consensus)

# check for mistakes in the consensus labels using the heuristic's labels
# stored in the row(s) where the threshold is 0
zero_threshold_rows_consensus = df_consensus[df_consensus["threshold"] == 0]
heuristic_labels_at_zero = zero_threshold_rows_consensus["labels"].to_numpy()[0]
bad_labels_consensus = check_labels(curves_consensus, heuristic_labels_at_zero)
print("possible mistakes:")
print(bad_labels_consensus)

In [None]:
# line chart of the "accuracy" column of df_consensus against "threshold",
# with descriptive axis titles and the y axis formatted as a percentage
threshold_axis = alt.X("threshold").title(
    "PDP shape labeling function tolerance parameter (t)"
)
agreement_axis = (
    alt.Y("accuracy")
    .title("Agreement with authors' labels")
    .axis(format=".2~%")
)
agreement_chart = (
    alt.Chart(df_consensus)
    .mark_line()
    .encode(x=threshold_axis, y=agreement_axis)
    .properties(width=400, height=250)
)
agreement_chart

In [None]:
# check that the heuristic labels are the same for all of the best thresholds

best_threshold_labels = best_thresholds_consensus["labels"].to_numpy()
# fail with a clear message instead of an IndexError on the [0] below
assert len(best_threshold_labels) > 0, "no best thresholds to compare"

# the labels of the first best threshold are the reference; they are reused
# as the heuristic's labels in the following cell
heuristic_labels = best_threshold_labels[0]

# np.array_equal returns a single bool whether the label sequences are lists
# or numpy arrays; a bare `==` between arrays is element-wise and raises
# "truth value of an array is ambiguous" when used in an assert
for labels in best_threshold_labels:
    assert np.array_equal(
        heuristic_labels, labels
    ), "heuristic labels differ between the best thresholds"

In [None]:
# plot the disagreements between the consensus labels and the heuristic's.
# Both label lists were derived from curves_consensus (labels_consensus
# directly, heuristic_labels via get_scores(curves_consensus)), so pass those
# curves rather than user A's to keep the curves and labels aligned.
plot_disagreements(
    curves_consensus,
    labels_consensus,
    heuristic_labels,
    "Consensus",
    "Heuristic",
)