# Evaluation

In [None]:
from colrev.ops.dedupe_benchmark import DedupeBenchmarker
from bib_dedupe.bib_dedupe import BibDeduper
from bib_dedupe.util import BibDedupeUtil
from asreview.data import load_data, ASReviewData

In [None]:
# Run each deduplication package on every benchmark dataset and append the
# comparison result for each (dataset, package) pair via BibDedupeUtil.
bd_util = BibDedupeUtil()

for benchmark_path in [
    "respiratory",
    "digital_work",
    "stroke",
    "haematology",
    "cytology_screening",
]:
    dedupe_benchmark = DedupeBenchmarker(
        benchmark_path=f"../tests/data/{benchmark_path}"
    )
    records_df = dedupe_benchmark.get_records_for_dedupe()

    # --- bib-dedupe ---
    # NOTE(review): judging by the method names, this blocks candidate pairs
    # and then classifies them -- confirm against the bib_dedupe API.
    dedupe_instance = BibDeduper()
    actual_blocked_df = dedupe_instance.block_pairs_for_deduplication(
        records_df=records_df
    )
    matches = dedupe_instance.identify_true_matches(actual_blocked_df)

    # The first origin of every duplicate set is kept; all later origins in
    # the set are dropped from the merged records.
    to_drop = []
    for origin_set in matches["duplicate_origin_sets"]:
        to_drop.extend(origin_set[1:])
    is_dropped = records_df["colrev_origin"].isin(to_drop)
    merged_df = records_df[~is_dropped]

    result = dedupe_benchmark.compare_dedupe_id(
        records_df=records_df,
        merged_df=merged_df,
    )
    bd_util.append_to_output(result, package_name="bib-dedupe")

    # Finer-grained comparison, exported as cases for debugging.
    results = dedupe_benchmark.compare(
        blocked_df=actual_blocked_df,
        predicted=matches["duplicate_origin_sets"],
    )
    dedupe_benchmark.export_cases(
        prepared_records_df=records_df,
        results=results,
    )

    # --- ASReview ---
    asdata = ASReviewData(records_df)
    merged_df = asdata.drop_duplicates()
    result = dedupe_benchmark.compare_dedupe_id(
        records_df=records_df,
        merged_df=merged_df,
    )
    bd_util.append_to_output(result, package_name="asreview")

# Plots

In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
# For every dataset, save a two-panel bar chart (false positive rate and
# sensitivity per package) built from the most recent evaluation row of each
# (package, dataset) pair in ../output/evaluation.csv.
results_df = pd.read_csv("../output/evaluation.csv")

# Keep only the newest row per (package, dataset). GroupBy.tail returns rows of
# the original frame with every column intact, whereas groupby(...).apply(...)
# on the grouping columns is deprecated in recent pandas and would lose
# "package"/"dataset" after reset_index(drop=True) once they are excluded.
# The second sort restores the group-key ordering that apply() produced, so
# bar and dataset order are unchanged.
grouped_df = (
    results_df.sort_values("time", kind="stable")
    .groupby(["package", "dataset"])
    .tail(1)
    .sort_values(["package", "dataset"], kind="stable")
    .reset_index(drop=True)
)

datasets = grouped_df["dataset"].unique()

# Loop-invariant: the footer reports the newest timestamp in the whole file.
latest_time = results_df["time"].max()

# (column to plot, panel title) for the left and right subplot respectively.
metric_panels = (
    ("false_positive_rate", "False positive rate by package"),
    ("sensitivity", "Sensitivity by package"),
)

for dataset in datasets:
    dataset_df = grouped_df[grouped_df["dataset"] == dataset]

    fig, axes = plt.subplots(1, 2, figsize=(14, 3))
    fig.suptitle(f"Dataset: {dataset}", fontsize=14, fontweight="bold")

    for ax, (metric, title) in zip(axes, metric_panels):
        dataset_df.plot(ax=ax, x="package", y=metric, kind="barh", legend=False)
        ax.set_title(title)
        ax.set_ylabel("")
        # Label each bar with its value, anchored at the bar's right end.
        for bar in ax.patches:
            ax.annotate(
                f"{bar.get_width():.2f}",
                (bar.get_width(), bar.get_y() + bar.get_height() / 2),
                ha="left",
                va="center",
            )

    fig.text(
        0.5,
        0.001,
        f"Time of last evaluation run: {latest_time}",
        ha="center",
        fontsize=10,
    )

    fig.tight_layout()
    fig.savefig(Path("../output") / f"evaluation_{dataset}.png")
    # Close explicitly so figures do not accumulate across datasets.
    plt.close(fig)