# Evaluation

In [1]:
from bib_dedupe.dedupe_benchmark import DedupeBenchmarker
from bib_dedupe.bib_dedupe import block, match, merge, prep
from bib_dedupe.util import BibDedupeUtil
import bib_dedupe.cluster
from asreview.data import load_data, ASReviewData
from datetime import datetime
import pandas as pd

In [2]:
# Benchmark driver: for each dataset, run the bib-dedupe pipeline and the
# ASReview duplicate removal on the same records and append both results
# to the shared output via bd_util.
bd_util = BibDedupeUtil()
merge_updated_papers = True

# Datasets left out of this run. Other labels that have been toggled here
# before: "digital_work", "srsr", "depression".
SKIPPED_DATASETS = ["problem_cases"]

# Wrap the labels in reversed(...) to walk the datasets in the opposite order.
DATASET_LABELS = [
    label
    for label in bd_util.get_dataset_labels()
    if label not in SKIPPED_DATASETS
]

for benchmark_path in DATASET_LABELS:
    print(f"Dataset: {benchmark_path}")

    dedupe_benchmark = DedupeBenchmarker(benchmark_path=f"../data/{benchmark_path}")
    records_df = dedupe_benchmark.get_records_for_dedupe()

    # --- bib-dedupe -------------------------------------------------------
    # The timestamp is taken before prep(), so everything from prep through
    # merge happens after it (presumably the basis for the reported runtime
    # -- confirm in compare_dedupe_id).
    timestamp = datetime.now()
    records_df = prep(records_df)
    actual_blocked_df = block(records_df=records_df)
    matched_df = match(actual_blocked_df)
    duplicate_id_sets = bib_dedupe.cluster.get_connected_components(matched_df)
    merged_df = merge(records_df, duplicate_id_sets=duplicate_id_sets)
    result = dedupe_benchmark.compare_dedupe_id(
        records_df=records_df,
        merged_df=merged_df,
        timestamp=timestamp,
    )
    bd_util.append_to_output(result, package_name="bib-dedupe")

    # Export the individual cases so the bib-dedupe result can be debugged
    # in more detail.
    dedupe_benchmark.export_cases(
        prepared_records_df=records_df,
        blocked_df=actual_blocked_df,
        matched_df=matched_df,
    )

    # --- ASReview ---------------------------------------------------------
    # Note: ASReview receives the records as returned by prep() above, and
    # its timestamp is taken only after the ASReviewData object is built.
    asdata = ASReviewData(records_df)
    timestamp = datetime.now()
    merged_df = asdata.drop_duplicates()
    result = dedupe_benchmark.compare_dedupe_id(
        records_df=records_df,
        merged_df=merged_df,
        timestamp=timestamp,
    )
    bd_util.append_to_output(result, package_name="asreview")
    print()
    

Dataset: digital_work
Loaded 7159 records
Prep started at 2023-12-21 20:37:31
Prep completed after: 10.86 seconds
Block started at 2023-12-21 20:37:41
Blocked    24194 pairs with {'volume', 'container_title_short', 'number'}
Blocked   106595 pairs with {'volume', 'year', 'container_title_short'}
Blocked    10645 pairs with {'author_first', 'container_title_short'}
Blocked     2677 pairs with {'author_first', 'year'}
Blocked    28210 pairs with {'volume', 'year', 'number'}
Blocked      932 pairs with {'volume', 'title_short'}
Blocked     4290 pairs with {'author_first', 'title_short'}
Blocked     5620 pairs with {'container_title_short', 'title_short'}
Blocked      763 pairs with {'pages', 'title_short'}
Blocked     1059 pairs with {'year', 'title_short'}
Blocked      111 pairs with {'abstract'}
Blocked      397 pairs with {'volume', 'pages', 'container_title_short'}
Blocked      505 pairs with {'pages', 'year', 'container_title_short'}
Blocked      405 pairs with {'volume', 'pages', 'y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 20:38:12
Export completed after: 14.91 seconds
Runtime: 0:00:00


  df = self.df[~self.duplicated(pid)]



Dataset: diabetes
Loaded 1845 records
Prep started at 2023-12-21 20:38:28
Prep completed after: 2.65 seconds
Block started at 2023-12-21 20:38:31
Blocked     3234 pairs with {'author_first', 'year'}
Blocked     2260 pairs with {'pages', 'title_short'}
Blocked     2542 pairs with {'author_first', 'title_short'}
Blocked     2622 pairs with {'year', 'title_short'}
Blocked     3343 pairs with {'author_first', 'container_title_short'}Blocked     1863 pairs with {'volume', 'container_title_short', 'number'}

Blocked     2434 pairs with {'container_title_short', 'title_short'}
Blocked     2309 pairs with {'volume', 'title_short'}
Blocked     5422 pairs with {'volume', 'year', 'container_title_short'}
Blocked     2335 pairs with {'pages', 'year', 'container_title_short'}
Blocked     2065 pairs with {'doi'}
Blocked     1974 pairs with {'volume', 'year', 'number'}
Blocked     2126 pairs with {'volume', 'pages', 'container_title_short'}
Blocked     1339 pairs with {'pages', 'year', 'number'}
Blo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 20:38:36
Export completed after: 1.08 seconds
Runtime: 0:00:00

Dataset: respiratory
Loaded 1988 records
Prep started at 2023-12-21 20:38:38
Prep completed after: 2.89 seconds
Block started at 2023-12-21 20:38:41
Blocked      624 pairs with {'author_first', 'year'}
Blocked      467 pairs with {'volume', 'title_short'}
Blocked      509 pairs with {'author_first', 'container_title_short'}
Blocked      479 pairs with {'author_first', 'title_short'}
Blocked      440 pairs with {'pages', 'title_short'}Blocked      482 pairs with {'year', 'title_short'}

Blocked      395 pairs with {'container_title_short', 'title_short'}
Blocked        0 pairs with {'volume', 'container_title_short', 'number'}
Blocked     2058 pairs with {'volume', 'year', 'container_title_short'}
Blocked      381 pairs with {'volume', 'pages', 'container_title_short'}
Blocked      469 pairs with {'volume', 'pages', 'year'}
Blocked      382 pairs with {'pages', 'year', 'container_title_short'}
B

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 20:38:51
Export completed after: 1.04 seconds
Runtime: 0:00:00

Dataset: srsr
Loaded 53001 records
Prep started at 2023-12-21 20:38:53
Prep completed after: 82.10 seconds
Block started at 2023-12-21 20:40:16
Blocked    29655 pairs with {'volume', 'container_title_short', 'number'}
Blocked    89194 pairs with {'author_first', 'year'}
Blocked    17239 pairs with {'pages', 'title_short'}
Blocked    26714 pairs with {'author_first', 'container_title_short'}
Blocked    21234 pairs with {'volume', 'title_short'}
Blocked    21417 pairs with {'container_title_short', 'title_short'}
Blocked    21661 pairs with {'author_first', 'title_short'}
Blocked    22341 pairs with {'year', 'title_short'}
Blocked   176650 pairs with {'volume', 'year', 'container_title_short'}
Blocked    78909 pairs with {'volume', 'year', 'number'}
Blocked    16360 pairs with {'volume', 'pages', 'container_title_short'}
Blocked    15793 pairs with {'pages', 'year', 'number'}
Blocked    21407 pai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_paper_pairs["duplicate_label"] = "updated_version"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 20:44:34
Export completed after: 841.69 seconds


  df = self.df[~self.duplicated(pid)]


Runtime: 0:00:02

Dataset: depression
Loaded 79880 records
Prep started at 2023-12-21 20:59:47
Prep completed after: 137.89 seconds
Block started at 2023-12-21 21:02:05
Blocked    47457 pairs with {'volume', 'container_title_short', 'number'}
Blocked    62937 pairs with {'author_first', 'year'}
Blocked    42010 pairs with {'author_first', 'container_title_short'}
Blocked   266085 pairs with {'volume', 'year', 'container_title_short'}
Blocked     4485 pairs with {'volume', 'title_short'}
Blocked     8264 pairs with {'pages', 'title_short'}
Blocked    14615 pairs with {'author_first', 'title_short'}
Blocked    14145 pairs with {'container_title_short', 'title_short'}
Blocked    14614 pairs with {'year', 'title_short'}
Blocked    68593 pairs with {'volume', 'year', 'number'}
Blocked     3688 pairs with {'doi'}
Blocked     1564 pairs with {'volume', 'pages', 'container_title_short'}
Blocked     9398 pairs with {'pages', 'year', 'container_title_short'}
Blocked     8950 pairs with {'pages',

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_paper_pairs["duplicate_label"] = "updated_version"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 21:09:48
Export completed after: 1818.70 seconds


  df = self.df[~self.duplicated(pid)]


Runtime: 0:00:03

Dataset: neuroimaging
Loaded 3438 records
Prep started at 2023-12-21 21:42:34
Prep completed after: 4.84 seconds
Block started at 2023-12-21 21:42:39
Blocked     1731 pairs with {'volume', 'container_title_short', 'number'}
Blocked     2207 pairs with {'author_first', 'year'}
Blocked     1692 pairs with {'volume', 'title_short'}
Blocked     1699 pairs with {'author_first', 'title_short'}
Blocked     1739 pairs with {'year', 'title_short'}
Blocked     1639 pairs with {'container_title_short', 'title_short'}Blocked     1670 pairs with {'pages', 'title_short'}

Blocked     1823 pairs with {'author_first', 'container_title_short'}
Blocked     3208 pairs with {'volume', 'year', 'container_title_short'}
Blocked     1988 pairs with {'volume', 'year', 'number'}
Blocked     1392 pairs with {'pages', 'year', 'number'}
Blocked     1642 pairs with {'volume', 'pages', 'container_title_short'}
Blocked     1635 pairs with {'doi'}
Blocked     1652 pairs with {'pages', 'year', 'contai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 21:42:53
Export completed after: 3.16 seconds
Runtime: 0:00:00

Dataset: stroke
Loaded 1292 records
Prep started at 2023-12-21 21:42:57
Prep completed after: 1.98 seconds
Block started at 2023-12-21 21:42:59
Blocked      923 pairs with {'volume', 'container_title_short', 'number'}
Blocked      595 pairs with {'author_first', 'year'}
Blocked      445 pairs with {'pages', 'title_short'}
Blocked      474 pairs with {'author_first', 'title_short'}Blocked      480 pairs with {'year', 'title_short'}Blocked      445 pairs with {'container_title_short', 'title_short'}


Blocked      504 pairs with {'author_first', 'container_title_short'}
Blocked      465 pairs with {'volume', 'title_short'}
Blocked     3003 pairs with {'volume', 'year', 'container_title_short'}
Blocked     1279 pairs with {'volume', 'year', 'number'}
Blocked      431 pairs with {'pages', 'year', 'number'}
Blocked      425 pairs with {'volume', 'pages', 'container_title_short'}
Blocked      454 pai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 21:43:07
Export completed after: 0.48 seconds
Runtime: 0:00:00

Dataset: haematology
Loaded 1415 records
Prep started at 2023-12-21 21:43:08
Prep completed after: 2.34 seconds
Block started at 2023-12-21 21:43:10
Blocked     1315 pairs with {'volume', 'container_title_short', 'number'}
Blocked      459 pairs with {'author_first', 'container_title_short'}
Blocked      514 pairs with {'author_first', 'year'}
Blocked      141 pairs with {'author_first', 'title_short'}
Blocked      127 pairs with {'volume', 'title_short'}Blocked      102 pairs with {'pages', 'title_short'}

Blocked      145 pairs with {'year', 'title_short'}
Blocked      126 pairs with {'container_title_short', 'title_short'}
Blocked     3326 pairs with {'volume', 'year', 'container_title_short'}
Blocked     1662 pairs with {'volume', 'year', 'number'}
Blocked      127 pairs with {'pages', 'year', 'container_title_short'}
Blocked      128 pairs with {'volume', 'pages', 'container_title_short'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 21:43:19
Export completed after: 0.62 seconds
Runtime: 0:00:00


  df = self.df[~self.duplicated(pid)]



Dataset: cardiac
Loaded 8948 records
Prep started at 2023-12-21 21:43:20
Prep completed after: 17.24 seconds
Block started at 2023-12-21 21:43:37
Blocked     8804 pairs with {'author_first', 'year'}
Blocked    12294 pairs with {'volume', 'container_title_short', 'number'}
Blocked     6834 pairs with {'author_first', 'container_title_short'}
Blocked     3465 pairs with {'pages', 'title_short'}
Blocked     3561 pairs with {'year', 'title_short'}
Blocked     3497 pairs with {'volume', 'title_short'}
Blocked     3527 pairs with {'author_first', 'title_short'}
Blocked    34102 pairs with {'volume', 'year', 'container_title_short'}
Blocked     3184 pairs with {'container_title_short', 'title_short'}
Blocked    13830 pairs with {'volume', 'year', 'number'}
Blocked     3589 pairs with {'doi'}
Blocked     3381 pairs with {'volume', 'pages', 'container_title_short'}
Blocked     3747 pairs with {'volume', 'pages', 'year'}
Blocked     3390 pairs with {'pages', 'year', 'container_title_short'}
Blo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 21:44:08
Export completed after: 25.75 seconds


  df = self.df[~self.duplicated(pid)]


Runtime: 0:00:00

Dataset: cytology_screening
Loaded 1856 records
Prep started at 2023-12-21 21:44:38
Prep completed after: 2.78 seconds
Block started at 2023-12-21 21:44:41
Blocked     3662 pairs with {'volume', 'year', 'container_title_short'}
Blocked     1274 pairs with {'author_first', 'year'}
Blocked      878 pairs with {'author_first', 'title_short'}
Blocked     1139 pairs with {'author_first', 'container_title_short'}
Blocked      811 pairs with {'pages', 'title_short'}
Blocked      821 pairs with {'container_title_short', 'title_short'}
Blocked      904 pairs with {'year', 'title_short'}
Blocked      887 pairs with {'volume', 'title_short'}
Blocked      773 pairs with {'volume', 'pages', 'container_title_short'}
Blocked      836 pairs with {'volume', 'pages', 'year'}
Blocked      772 pairs with {'pages', 'year', 'container_title_short'}
Blocked 4375 pairs
Blocked pairs reduced to 1106 pairs
Block completed after: 2.88 seconds
Sim started at 2023-12-21 21:44:44
Sim completed aft

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_cases_df.loc[:, "case"] = (


Export started at 2023-12-21 21:44:47
Export completed after: 1.06 seconds
Runtime: 0:00:00

