In [1]:
import pickle
from collections import defaultdict

import datasets
import pandas as pd

### Background


This is a modified version of <https://github.com/serega/gaoya/blob/master/py-gaoya/examples/deduplication_scholarly_articles_gaoya.ipynb>, which benchmarks deduplication on the `pinecone/core-2020-05-10-deduplication` dataset.

In [2]:
# Download (or reuse the cached copy of) the benchmark dataset, add a lowercase
# "text" column, and save the result so the command-line runs can read it from disk.

def _with_text(example):
    """Return the lowercased title and abstract, joined by one space, as ``text``."""
    combined = example["processed_title"] + " " + example["processed_abstract"]
    return {"text": combined.lower()}


ds = datasets.load_dataset("pinecone/core-2020-05-10-deduplication", split="train").map(_with_text)
ds.save_to_disk("temp_inp")

Found cached dataset json (/Users/chenghao/.cache/huggingface/datasets/pinecone___json/pinecone--core-2020-05-10-deduplication-dbaaf752a12c0b16/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
Loading cached processed dataset at /Users/chenghao/.cache/huggingface/datasets/pinecone___json/pinecone--core-2020-05-10-deduplication-dbaaf752a12c0b16/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-3f09ce0efdc0f6fc.arrow


Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [3]:
# Reload the dataset from the "temp_inp" directory written by the previous cell.
ds = datasets.load_from_disk("temp_inp")

### Truth

In [4]:
# One row per record: its core_id, its positional index in `ds`, and its labelled duplicates.
truth = ds.map(
    lambda example, idx: {
        "core_id": example["core_id"],
        "id": idx,
        "duplicates": example["labelled_duplicates"],
    },
    remove_columns=ds.column_names,
    with_indices=True,
)

# id2core_id: positional index -> integer core_id.
# labels: integer core_id -> set of labelled duplicate core_ids (empty set when none are given).
id2core_id = {}
labels = {}
for entry in truth:
    core = int(entry["core_id"])
    id2core_id[entry["id"]] = core
    labels[core] = {int(dup) for dup in entry["duplicates"]} if entry["duplicates"] else set()

Loading cached processed dataset at /Users/chenghao/Downloads/text-dedup/benchmarks/temp_inp/cache-5249773913bfa903.arrow


### Predictions

#### MinHash

The script has no character-shingle tokenizer, so you can either modify the code or use an n-gram tokenizer. For simplicity, we use bigrams (`--ngram 2`) in this example. The other parameters are the same as in the original script.

In [5]:
%%capture --no-display
# Run the text_dedup MinHash command-line module on the dataset saved in ./temp_inp,
# deduplicating on the "text" column with bigrams and writing its output to ./temp.
# NOTE(review): the next cell reads temp/uf.pkl; presumably --debug is what makes the
# script write that file -- confirm against text_dedup.
!python -m text_dedup.minhash --path ./temp_inp --local --column text --num_perm 200 --ngram 2 --threshold 0.5 --output temp --split train --debug --b 50 --r 4

In [6]:
# Load the union-find structure saved by the MinHash run.
# NOTE: pickle.load can execute arbitrary code -- only load a uf.pkl produced by your own run.
with open("temp/uf.pkl", "rb") as f:
    uf = pickle.load(f)

# Group every node under uf.find(node), the same key the lookup below uses.
# Keying by the raw values of uf.parent is only correct when every stored parent is
# already the cluster root; otherwise nodes hanging off an intermediate parent are
# silently left out of their cluster.
# NOTE(review): whether the pickled structure is fully path-compressed depends on text_dedup.
# The keys are snapshotted first in case find() rewrites uf.parent while resolving.
id2cluster = defaultdict(set)
for node in list(uf.parent):
    id2cluster[uf.find(node)].add(node)

# core_id -> set of core_ids of the other members of the same predicted cluster.
predictions = {}
for x in truth:
    members = id2cluster[uf.find(x["id"])]
    predictions[id2core_id[x["id"]]] = {id2core_id[member] for member in members if member != x["id"]}

# Side-by-side frame of labelled duplicates and predictions, joined on core_id ("index").
df = (
    pd.Series(labels).to_frame("duplicates").reset_index()
    .merge(pd.Series(predictions).to_frame("predictions").reset_index(), on="index")
)

# A row is correct only when the predicted set equals the labelled set exactly.
df['Correct'] = df.apply(lambda row: set(row['duplicates']) == set(row['predictions']), axis=1).astype(int)
prediction_summary = { 'Correct' : df['Correct'].sum(), 'Incorrect' : df.shape[0] - df['Correct'].sum() }
prediction_summary['Accuracy'] = round(prediction_summary['Correct'] / df.shape[0], 4)

def _recall(row):
    labelled_dups = set(row['duplicates'])
    if len(labelled_dups) == 0:
        return 1
    dups = set(row['predictions'])
    return len(dups & labelled_dups) / len(labelled_dups)
# Mean of the per-row recall, rounded to four decimals.
recalls = df.apply(_recall, axis=1)
prediction_summary['Recall'] = round(recalls.mean(), 4)

def _precision(row):
    labelled_dups = set(row['duplicates'])
    dups = set(row['predictions'])    
    if len(dups) == 0:
        return 0

    return len(dups & labelled_dups) / len(dups)
# Mean of the per-row precision, rounded to four decimals.
precisions = df.apply(_precision, axis=1)
prediction_summary['Precision'] = round(precisions.mean(), 4)

# Display the collected metrics as the cell output.
prediction_summary

{'Correct': 92389,
 'Incorrect': 7611,
 'Accuracy': 0.9239,
 'Recall': 0.9651,
 'Precision': 0.4432}

#### SimHash

In [7]:
%%capture --no-display
# Run the text_dedup SimHash command-line module on the dataset saved in ./temp_inp,
# deduplicating on the "text" column with 3-grams and writing its output to ./temp_simhash.
# NOTE(review): the next cell reads temp_simhash/uf.pkl; presumably --debug is what makes
# the script write that file -- confirm against text_dedup.
!python -m text_dedup.simhash --path ./temp_inp --local --column text --output temp_simhash --split train --debug \
    --bit_diff 6 \
    --num_bucket 7 \
    --ngram 3

In [8]:
# Load the union-find structure saved by the SimHash run.
# NOTE: pickle.load can execute arbitrary code -- only load a uf.pkl produced by your own run.
with open("temp_simhash/uf.pkl", "rb") as f:
    uf = pickle.load(f)

# Group every node under uf.find(node), the same key the lookup below uses.
# Keying by the raw values of uf.parent is only correct when every stored parent is
# already the cluster root; otherwise nodes hanging off an intermediate parent are
# silently left out of their cluster.
# NOTE(review): whether the pickled structure is fully path-compressed depends on text_dedup.
# The keys are snapshotted first in case find() rewrites uf.parent while resolving.
id2cluster = defaultdict(set)
for node in list(uf.parent):
    id2cluster[uf.find(node)].add(node)

# core_id -> set of core_ids of the other members of the same predicted cluster.
predictions = {}
for x in truth:
    members = id2cluster[uf.find(x["id"])]
    predictions[id2core_id[x["id"]]] = {id2core_id[member] for member in members if member != x["id"]}

# Side-by-side frame of labelled duplicates and predictions, joined on core_id ("index").
df = (
    pd.Series(labels).to_frame("duplicates").reset_index()
    .merge(pd.Series(predictions).to_frame("predictions").reset_index(), on="index")
)

# A row is correct only when the predicted set equals the labelled set exactly.
df['Correct'] = df.apply(lambda row: set(row['duplicates']) == set(row['predictions']), axis=1).astype(int)
prediction_summary = { 'Correct' : df['Correct'].sum(), 'Incorrect' : df.shape[0] - df['Correct'].sum() }
prediction_summary['Accuracy'] = round(prediction_summary['Correct'] / df.shape[0], 4)

def _recall(row):
    labelled_dups = set(row['duplicates'])
    if len(labelled_dups) == 0:
        return 1
    dups = set(row['predictions'])
    return len(dups & labelled_dups) / len(labelled_dups)
# Mean of the per-row recall, rounded to four decimals.
recalls = df.apply(_recall, axis=1)
prediction_summary['Recall'] = round(recalls.mean(), 4)

def _precision(row):
    labelled_dups = set(row['duplicates'])
    dups = set(row['predictions'])    
    if len(dups) == 0:
        return 0

    return len(dups & labelled_dups) / len(dups)
# Mean of the per-row precision, rounded to four decimals.
precisions = df.apply(_precision, axis=1)
prediction_summary['Precision'] = round(precisions.mean(), 4)

# Display the collected metrics as the cell output.
prediction_summary

{'Correct': 82075,
 'Incorrect': 17925,
 'Accuracy': 0.8208,
 'Recall': 0.8413,
 'Precision': 0.3544}

[Deduplication of Scholarly Documents using Locality Sensitive Hashing and Word Embeddings](https://aclanthology.org/2020.lrec-1.113) (Gyawali et al., LREC 2020)

In [9]:
def classify_in_paper(record):
    """Label one record as 'TP', 'FP', 'TN' or 'FN'.

    ``record`` must expose ``'duplicates'`` (labelled) and ``'predictions'`` iterables.

    - 'FN': labelled duplicates exist but nothing was predicted.
    - 'TP': labelled duplicates exist and all of them appear in the predictions.
    - 'TN': neither labelled duplicates nor predictions exist.
    - 'FP': predictions exist, and either there are no labelled duplicates or at
      least one labelled duplicate is missing from the predictions.

    Raises:
        ValueError: if none of the cases above applies (defensive guard).
    """
    duplicates = set(record['duplicates'])
    predictions = set(record['predictions'])
    has_duplicates = bool(duplicates)
    has_predictions = bool(predictions)

    if has_duplicates and not has_predictions:
        return 'FN'

    if has_duplicates and has_predictions and duplicates <= predictions:
        return 'TP'

    if not (has_duplicates or has_predictions):
        return 'TN'

    # Any remaining record with predictions either has no labels or misses some of them.
    if has_predictions:
        return 'FP'

    raise ValueError(f'This should not happen {duplicates} {predictions} {len(duplicates)=} {len(predictions)=}')

def inverse(label):
    """Swap the positive and negative class of a label: TP <-> TN and FP <-> FN.

    Any other value yields None.
    """
    swapped = {'TP': 'TN', 'FN': 'FP', 'FP': 'FN', 'TN': 'TP'}
    return swapped.get(label)

# `df` is whatever the most recently executed prediction cell assigned.
# 'Class' treats duplicates as the positive class; 'Class_' swaps positive and negative.
df['Class'] = df.apply(classify_in_paper, axis=1)
df['Class_'] = df['Class'].map(inverse)

# Precision, recall and F1 for each labelling, then their macro-averaged F1.
f1s = []
for col in ['Class', 'Class_']:
    label_counts = df[col].value_counts()
    tp, fp, fn = (label_counts[key] for key in ('TP', 'FP', 'FN'))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')
    f1s.append(f1)
print(f'Macro Average F1: {sum(f1s) / len(f1s):.4f}, Accuracy: {df["Correct"].mean():.4f}')

Precision: 0.9011, Recall: 0.6959, F1: 0.7853
Precision: 0.7776, Recall: 0.9329, F1: 0.8482
Macro Average F1: 0.8168, Accuracy: 0.8207


These numbers seem too good to be true compared with what is reported in the paper. Let's double-check the paper's results by evaluating a baseline that predicts duplicates from exact title matches.

In [10]:
# Baseline: a record's predicted duplicates are all other records that share
# exactly the same processed title.
title2core_ids = defaultdict(set)
for record in ds:
    title2core_ids[record['processed_title']].add(int(record['core_id']))

title_matches = ds.map(
    lambda row: {
        'matches': set(
            other for other in title2core_ids[row["processed_title"]] if other != int(row["core_id"])
        )
    }
)
# core_id -> the core_ids sharing that record's title.
matches = {int(entry["core_id"]): entry["matches"] for entry in title_matches}

# Swap the title-based predictions into a copy of the evaluation frame, then
# recompute every derived column from them.
ddf = (
    pd.Series(matches).to_frame("predictions").reset_index()
    .merge(df.drop(columns="predictions"), on="index")
)
ddf["Correct"] = ddf.apply(lambda row: set(row['duplicates']) == set(row['predictions']), axis=1).astype(int)
ddf['Class'] = ddf.apply(classify_in_paper, axis=1)
ddf['Class_'] = ddf['Class'].map(inverse)

# Precision, recall and F1 of the title-match baseline for each labelling,
# then their macro-averaged F1.
f1s = []
for col in ['Class', 'Class_']:
    label_counts = ddf[col].value_counts()
    tp, fp, fn = (label_counts[key] for key in ('TP', 'FP', 'FN'))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')
    f1s.append(f1)
print(f'Macro Average F1: {sum(f1s) / len(f1s):.4f}, Accuracy: {ddf["Correct"].mean():.4f}')

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Precision: 0.8302, Recall: 0.5521, F1: 0.6632
Precision: 0.7098, Recall: 0.9065, F1: 0.7962
Macro Average F1: 0.7297, Accuracy: 0.7456


This is strange: the precision and accuracy are the same as those in the paper, but the recall is not.

In [11]:
%%capture --no-display
# Clean up: recursively remove the saved input dataset, every path matching temp*
# in the working directory, and ../temp.
# NOTE(review): nothing visible in this notebook creates ../temp -- confirm it is safe to delete.
!rm -r temp_inp temp* ../temp