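# Evaluation

Runs the duplicate-detection benchmark: for every benchmark dataset (except the excluded ones), the records are deduplicated once with bib-dedupe and once with ASReview, and each result is compared against the benchmark and appended to the shared output.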

In [1]:
from bib_dedupe.dedupe_benchmark import DedupeBenchmarker
from bib_dedupe.bib_dedupe import BibDeduper
from bib_dedupe.util import BibDedupeUtil
import bib_dedupe.cluster
from asreview.data import load_data, ASReviewData
from datetime import datetime
import pandas as pd

In [2]:
bd_util = BibDedupeUtil()
merge_updated_papers = True

# Dataset labels that are left out of the evaluation run.
# The original note also listed "digital_work", "srsr" and "depression"
# as labels that can be placed here.
skipped_datasets = ("problem_cases",)

# To walk the datasets in the opposite order, wrap the iterable in reversed(...).
for benchmark_path in bd_util.get_dataset_labels():
    if benchmark_path in skipped_datasets:
        continue
    print(f"Dataset: {benchmark_path}")

    dedupe_benchmark = DedupeBenchmarker(
        benchmark_path=f"../data/{benchmark_path}",
        merge_updated_papers=merge_updated_papers,
    )
    records_df = dedupe_benchmark.get_records_for_dedupe()

    # --- Bib-dedupe: block -> match -> cluster -> merge ---
    dedupe_instance = BibDeduper()
    # Taken right before blocking starts and handed to compare_dedupe_id below.
    timestamp = datetime.now()
    actual_blocked_df = dedupe_instance.block(records_df=records_df)
    matched_df = dedupe_instance.match(
        actual_blocked_df,
        merge_updated_papers=merge_updated_papers,
    )
    duplicate_id_sets = bib_dedupe.cluster.get_connected_components(matched_df)
    merged_df = dedupe_instance.merge(
        records_df,
        duplicate_id_sets=duplicate_id_sets,
    )
    result = dedupe_benchmark.compare_dedupe_id(
        records_df=records_df,
        merged_df=merged_df,
        timestamp=timestamp,
    )
    bd_util.append_to_output(result, package_name="bib-dedupe")

    # Export the blocked/matched pairs for a more detailed, case-level
    # comparison when debugging.
    dedupe_benchmark.export_cases(
        prepared_records_df=records_df,
        blocked_df=actual_blocked_df,
        matched_df=matched_df,
    )

    # --- ASReview: same records, ASReview's own duplicate removal ---
    asdata = ASReviewData(records_df)
    # Fresh start time for the ASReview run; `merged_df` and `result` are
    # deliberately overwritten with the ASReview outcome from here on.
    timestamp = datetime.now()
    merged_df = asdata.drop_duplicates()
    result = dedupe_benchmark.compare_dedupe_id(
        records_df=records_df,
        merged_df=merged_df,
        timestamp=timestamp,
    )
    bd_util.append_to_output(result, package_name="asreview")
    print()
    

Dataset: digital_work
7159 records
Prep started at 2023-12-13 10:08:20
Prep completed after: 8.51 seconds
Block started at 2023-12-13 10:08:28
Blocked      104 pairs with {'abstract', 'short_title'}
Blocked    10645 pairs with {'short_container_title', 'first_author'}
Blocked     2677 pairs with {'first_author', 'year'}
Blocked    28210 pairs with {'number', 'volume', 'year'}
Blocked      763 pairs with {'short_title', 'pages'}
Blocked      932 pairs with {'short_title', 'volume'}
Blocked     5620 pairs with {'short_title', 'short_container_title'}
Blocked      445 pairs with {'doi'}
Blocked     4290 pairs with {'short_title', 'first_author'}
Blocked    24194 pairs with {'number', 'short_container_title', 'volume'}
Blocked   106595 pairs with {'short_container_title', 'volume', 'year'}
Blocked     1059 pairs with {'short_title', 'year'}
Blocked      368 pairs with {'year', 'number', 'pages'}
Blocked      405 pairs with {'year', 'volume', 'pages'}
Blocked      397 pairs with {'short_con

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 10:09:09
Export completed after: 14.47 seconds
Runtime: 0:00:00


  df = self.df[~self.duplicated(pid)]



Dataset: diabetes
1845 records
Prep started at 2023-12-13 10:09:25
setting missing status
Prep completed after: 2.90 seconds
Block started at 2023-12-13 10:09:27
Blocked     2065 pairs with {'doi'}
Blocked     3234 pairs with {'first_author', 'year'}
Blocked     2260 pairs with {'short_title', 'pages'}
Blocked     2542 pairs with {'short_title', 'first_author'}
Blocked     2622 pairs with {'short_title', 'year'}
Blocked     3343 pairs with {'short_container_title', 'first_author'}Blocked     2434 pairs with {'short_title', 'short_container_title'}

Blocked     2309 pairs with {'short_title', 'volume'}
Blocked     1974 pairs with {'number', 'volume', 'year'}
Blocked     1730 pairs with {'abstract', 'short_title'}
Blocked     1339 pairs with {'year', 'number', 'pages'}
Blocked     2138 pairs with {'year', 'volume', 'pages'}
Blocked     1863 pairs with {'number', 'short_container_title', 'volume'}
Blocked     5422 pairs with {'short_container_title', 'volume', 'year'}
Blocked     2335 pa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 10:09:34
Export completed after: 0.97 seconds
Runtime: 0:00:00

Dataset: respiratory
1988 records
Prep started at 2023-12-13 10:09:35
setting missing status
Prep completed after: 2.88 seconds
Block started at 2023-12-13 10:09:38
Blocked      479 pairs with {'short_title', 'first_author'}
Blocked      440 pairs with {'short_title', 'pages'}
Blocked      624 pairs with {'first_author', 'year'}
Blocked      467 pairs with {'short_title', 'volume'}
Blocked      509 pairs with {'short_container_title', 'first_author'}
Blocked      395 pairs with {'short_title', 'short_container_title'}
Blocked      482 pairs with {'short_title', 'year'}
Blocked        0 pairs with {'number', 'volume', 'year'}
Blocked     2058 pairs with {'short_container_title', 'volume', 'year'}
Blocked      469 pairs with {'year', 'volume', 'pages'}
Blocked      381 pairs with {'short_container_title', 'volume', 'pages'}
Blocked      382 pairs with {'year', 'short_container_title', 'pages'}
Bl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 10:09:46
Export completed after: 1.17 seconds
Runtime: 0:00:00

Dataset: srsr
53001 records
Prep started at 2023-12-13 10:09:49
setting missing status
Prep completed after: 84.14 seconds
Block started at 2023-12-13 10:11:13
Blocked    89194 pairs with {'first_author', 'year'}
Blocked    21407 pairs with {'doi'}
Blocked    17239 pairs with {'short_title', 'pages'}
Blocked    21234 pairs with {'short_title', 'volume'}
Blocked    21661 pairs with {'short_title', 'first_author'}
Blocked    26641 pairs with {'short_container_title', 'first_author'}
Blocked     4682 pairs with {'abstract', 'short_title'}
Blocked    21391 pairs with {'short_title', 'short_container_title'}
Blocked    78909 pairs with {'number', 'volume', 'year'}
Blocked   176649 pairs with {'short_container_title', 'volume', 'year'}
Blocked    29642 pairs with {'number', 'short_container_title', 'volume'}
Blocked    15793 pairs with {'year', 'number', 'pages'}
Blocked    22341 pairs with {'short_t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_paper_pairs["duplicate_label"] = "updated_version"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 10:15:53
Export completed after: 1324.70 seconds


  df = self.df[~self.duplicated(pid)]


Runtime: 0:00:02

Dataset: depression
79880 records
Prep started at 2023-12-13 10:39:19
setting missing status
Prep completed after: 145.83 seconds
Block started at 2023-12-13 10:41:45
Blocked     3688 pairs with {'doi'}
Blocked    62937 pairs with {'first_author', 'year'}
Blocked    42002 pairs with {'short_container_title', 'first_author'}
Blocked     9755 pairs with {'abstract', 'short_title'}
Blocked     4485 pairs with {'short_title', 'volume'}
Blocked     8262 pairs with {'short_title', 'pages'}
Blocked    14613 pairs with {'short_title', 'first_author'}
Blocked    14144 pairs with {'short_title', 'short_container_title'}
Blocked    68593 pairs with {'number', 'volume', 'year'}
Blocked    14612 pairs with {'short_title', 'year'}
Blocked   266085 pairs with {'short_container_title', 'volume', 'year'}
Blocked    47457 pairs with {'number', 'short_container_title', 'volume'}
Blocked     8950 pairs with {'year', 'number', 'pages'}
Blocked     1627 pairs with {'year', 'volume', 'pages

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_paper_pairs["duplicate_label"] = "updated_version"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 10:50:05
Export completed after: 2038.30 seconds


  df = self.df[~self.duplicated(pid)]


Runtime: 0:00:04

Dataset: neuroimaging
3438 records
Prep started at 2023-12-13 11:26:31
setting missing status
Prep completed after: 5.19 seconds
Block started at 2023-12-13 11:26:36
Blocked     1635 pairs with {'doi'}
Blocked     2207 pairs with {'first_author', 'year'}
Blocked     1670 pairs with {'short_title', 'pages'}
Blocked     1692 pairs with {'short_title', 'volume'}
Blocked     1774 pairs with {'short_container_title', 'first_author'}Blocked     1699 pairs with {'short_title', 'first_author'}

Blocked     1629 pairs with {'short_title', 'short_container_title'}
Blocked      959 pairs with {'abstract', 'short_title'}
Blocked     1988 pairs with {'number', 'volume', 'year'}
Blocked     3205 pairs with {'short_container_title', 'volume', 'year'}
Blocked     1392 pairs with {'year', 'number', 'pages'}
Blocked     1731 pairs with {'number', 'short_container_title', 'volume'}
Blocked     1739 pairs with {'short_title', 'year'}
Blocked     1719 pairs with {'year', 'volume', 'pages'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 11:26:52
Export completed after: 3.28 seconds
Runtime: 0:00:00

Dataset: stroke
1292 records
Prep started at 2023-12-13 11:26:56
setting missing status
Prep completed after: 2.04 seconds
Block started at 2023-12-13 11:26:58
Blocked     1279 pairs with {'number', 'volume', 'year'}
Blocked      501 pairs with {'short_container_title', 'first_author'}
Blocked      465 pairs with {'short_title', 'volume'}
Blocked      474 pairs with {'short_title', 'first_author'}
Blocked      445 pairs with {'short_title', 'short_container_title'}Blocked      445 pairs with {'short_title', 'pages'}
Blocked      480 pairs with {'short_title', 'year'}

Blocked      595 pairs with {'first_author', 'year'}
Blocked      454 pairs with {'year', 'volume', 'pages'}
Blocked     3003 pairs with {'short_container_title', 'volume', 'year'}
Blocked      923 pairs with {'number', 'short_container_title', 'volume'}
Blocked      431 pairs with {'year', 'number', 'pages'}
Blocked      425 pair

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 11:27:07
Export completed after: 0.67 seconds
Runtime: 0:00:00

Dataset: haematology
1415 records
Prep started at 2023-12-13 11:27:08
setting missing status
Prep completed after: 3.02 seconds
Block started at 2023-12-13 11:27:11
Blocked     1662 pairs with {'number', 'volume', 'year'}
Blocked      514 pairs with {'first_author', 'year'}
Blocked      102 pairs with {'short_title', 'pages'}
Blocked      125 pairs with {'short_title', 'short_container_title'}
Blocked      432 pairs with {'short_container_title', 'first_author'}
Blocked      127 pairs with {'short_title', 'volume'}
Blocked      145 pairs with {'short_title', 'year'}Blocked      141 pairs with {'short_title', 'first_author'}

Blocked     3326 pairs with {'short_container_title', 'volume', 'year'}
Blocked      139 pairs with {'year', 'volume', 'pages'}
Blocked      116 pairs with {'year', 'number', 'pages'}
Blocked     1315 pairs with {'number', 'short_container_title', 'volume'}
Blocked      128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 11:27:21
Export completed after: 0.64 seconds
Runtime: 0:00:00


  df = self.df[~self.duplicated(pid)]



Dataset: cardiac
8948 records
Prep started at 2023-12-13 11:27:23
setting missing status
Prep completed after: 18.82 seconds
Block started at 2023-12-13 11:27:42
Blocked     8804 pairs with {'first_author', 'year'}
Blocked     3589 pairs with {'doi'}
Blocked     3465 pairs with {'short_title', 'pages'}
Blocked     3525 pairs with {'short_title', 'first_author'}
Blocked     6833 pairs with {'short_container_title', 'first_author'}
Blocked     3497 pairs with {'short_title', 'volume'}
Blocked     3184 pairs with {'short_title', 'short_container_title'}
Blocked    13830 pairs with {'number', 'volume', 'year'}
Blocked    34102 pairs with {'short_container_title', 'volume', 'year'}
Blocked     3559 pairs with {'short_title', 'year'}
Blocked    12294 pairs with {'number', 'short_container_title', 'volume'}
Blocked     3747 pairs with {'year', 'volume', 'pages'}
Blocked     1065 pairs with {'abstract', 'short_title'}
Blocked     3542 pairs with {'year', 'number', 'pages'}
Blocked     3390 pa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 11:28:27
Export completed after: 28.81 seconds


  df = self.df[~self.duplicated(pid)]


Runtime: 0:00:00

Dataset: cytology_screening
1856 records
Prep started at 2023-12-13 11:29:00
setting missing status
Prep completed after: 3.05 seconds
Block started at 2023-12-13 11:29:03
Blocked     1274 pairs with {'first_author', 'year'}
Blocked     1129 pairs with {'short_container_title', 'first_author'}
Blocked      836 pairs with {'year', 'volume', 'pages'}
Blocked      904 pairs with {'short_title', 'year'}Blocked      878 pairs with {'short_title', 'first_author'}Blocked      887 pairs with {'short_title', 'volume'}


Blocked      811 pairs with {'short_title', 'pages'}
Blocked      821 pairs with {'short_title', 'short_container_title'}
Blocked     3662 pairs with {'short_container_title', 'volume', 'year'}
Blocked      772 pairs with {'year', 'short_container_title', 'pages'}
Blocked      773 pairs with {'short_container_title', 'volume', 'pages'}
4369
Blocked 1106 pairs
Block completed after: 3.58 seconds
Sim started at 2023-12-13 11:29:06
Sim completed after: 1.16 second

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df["case"] = maybe_cases_df["ID_1"] + ";" + maybe_cases_df["ID_2"]


Export started at 2023-12-13 11:29:11
Export completed after: 1.19 seconds
Runtime: 0:00:00

