In [1]:
import os
import pickle
from collections import defaultdict

import datasets
import pandas as pd
from datasets import Features, Value, Sequence

# Number of worker processes handed to the `datasets` calls below through `num_proc`.
NUM_PROC = os.cpu_count()

  from .autonotebook import tqdm as notebook_tqdm


### Background


This notebook is adapted from https://github.com/serega/gaoya/blob/master/py-gaoya/examples/deduplication_scholarly_articles_gaoya.ipynb, which benchmarks deduplication on the `pinecone/core-2020-05-10-deduplication` dataset. Here the same dataset is used to evaluate the `text_dedup` MinHash and SimHash scripts.

In [2]:
# Build the benchmark input and persist it to "temp_inp".
# The pipeline stays a single expression on purpose: the intermediate datasets
# only need names when they are being debugged.
(
    datasets.load_dataset(
        "pinecone/core-2020-05-10-deduplication",
        split="train",
        cache_dir="./cache",
        num_proc=NUM_PROC,
    )
    # "text" = lower-cased "<processed_title> <processed_abstract>"
    .map(
        lambda record: {
            "text": " ".join([record["processed_title"], record["processed_abstract"]]).lower()
        },
        num_proc=NUM_PROC,
    )
    .save_to_disk("temp_inp")
)

Downloading data: 100%|██████████| 204M/204M [01:11<00:00, 2.84MB/s] 
Setting num_proc from 10 back to 1 for the train split to disable multiprocessing as it only contains one shard.
Generating train split: 100000 examples [00:00, 490824.22 examples/s]
Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 128388.64 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100000/100000 [00:00<00:00, 823395.78 examples/s]


In [3]:
# Reload the preprocessed dataset that was saved to "temp_inp" by the previous cell.
ds = datasets.load_from_disk("temp_inp")

### Truth

In [4]:
# Reduce every row to the three fields the evaluation needs. "id" is the row index
# supplied by `with_indices=True`; the evaluation cells below use it both to query
# the union-find structure and to translate back to a core_id via `id2core_id`.
# NOTE(review): an earlier comment here said this map could not be parallelised
# because of a suspected race condition, yet the recorded run used num_proc=10.
# If ids and rows ever look misaligned, set num_proc=1 and re-run — TODO confirm.
truth = ds.map(
    lambda row, idx: {"core_id": row["core_id"], "id": idx, "duplicates": row["labelled_duplicates"]},
    remove_columns=ds.column_names,
    with_indices=True,
    num_proc=NUM_PROC,
    features=Features({
        "core_id": Value("string"),
        "id": Value("int64"),
        "duplicates": Sequence(Value("string")),
    })
)

# A single pass over `truth` fills both lookup tables.
id2core_id = {}  # row index -> core_id (int)
labels = {}      # core_id (int) -> set of labelled duplicate core_ids (int)
for row in truth:
    core_id = int(row["core_id"])
    id2core_id[row["id"]] = core_id
    # A missing/empty duplicate list becomes an empty set.
    labels[core_id] = {int(dup) for dup in (row["duplicates"] or ())}

Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 189508.39 examples/s]


### Predictions

#### MinHash

The script has no character-shingle tokenizer, so you can either modify the code or use an n-gram tokenizer. For simplicity, this example uses bigrams (`--ngram 2`). The other parameters are the same as in the original notebook.

In [5]:
%%capture --no-display
# Run the text_dedup MinHash script on ./temp_inp (column "text") and write its output
# to ./temp; the evaluation cell below reads temp/uf.pkl from that directory.
# NOTE(review): --b 50 times --r 4 equals --num_perm 200; presumably bands x rows — confirm.
!python -m text_dedup.minhash --path ./temp_inp --local --column text --num_perm 200 --ngram 2 --threshold 0.5 --output temp --split train --debug --b 50 --r 4

In [6]:
def _recall(row):
    labelled_dups = set(row['duplicates'])
    LEN_LABELLED_DUPLICATES = len(labelled_dups)    
    if LEN_LABELLED_DUPLICATES == 0:
        return 1
    dups = set(row['predictions'])
    return len(dups & labelled_dups) / LEN_LABELLED_DUPLICATES

def _precision(row):
    labelled_dups = set(row['duplicates'])
    dups = set(row['predictions'])
    LEN_DUPLICATES = len(dups)
    if LEN_DUPLICATES == 0:
        return 0
    return len(dups & labelled_dups) / LEN_DUPLICATES

# SECURITY: pickle.load can execute arbitrary code from the file. Only load a
# uf.pkl that the text_dedup run in the previous cell wrote.
with open("temp/uf.pkl", "rb") as f:
    uf = pickle.load(f)

# Group every node under its cluster root. The lookup below resolves a row with
# `uf.find`, so the grouping must use the same key: if `uf.parent` maps a node to a
# non-root ancestor (possible whenever paths are not fully compressed — not
# verifiable from this notebook), keying on the raw parent would split one cluster
# across several buckets and silently drop neighbours.
id2cluster = defaultdict(set)
for node in list(uf.parent):  # snapshot the keys: `find` may rewrite `parent` entries
    id2cluster[uf.find(node)].add(node)

# core_id -> core_ids of every other row in the same predicted cluster.
predictions = {
    id2core_id[x["id"]]: {
        id2core_id[neighbor] for neighbor in id2cluster[uf.find(x["id"])] if neighbor != x["id"]
    }
    for x in truth
}

# One row per core_id with its labelled ("duplicates") and predicted ("predictions") sets.
df = (
    pd.Series(labels).to_frame("duplicates").reset_index()
    .merge(pd.Series(predictions).to_frame("predictions").reset_index(), on="index")
)

# A row is "Correct" only when the predicted set equals the labelled set exactly.
df['Correct'] = df.apply(lambda row: set(row['duplicates']) == set(row['predictions']), axis=1).astype(int)
prediction_summary = { 'Correct' : df['Correct'].sum(), 'Incorrect' : df.shape[0] - df['Correct'].sum() }
prediction_summary['Accuracy'] = round(prediction_summary['Correct'] / df.shape[0], 4)

# Recall and precision are computed per row and then averaged over all rows.
recalls = df.apply(_recall, axis=1)
prediction_summary['Recall'] = round(recalls.mean(), 4)

precisions = df.apply(_precision, axis=1)
prediction_summary['Precision'] = round(precisions.mean(), 4)

prediction_summary

{'Correct': 92396,
 'Incorrect': 7604,
 'Accuracy': 0.924,
 'Recall': 0.9676,
 'Precision': 0.4433}

#### SimHash

In [7]:
%%capture --no-display
# Run the text_dedup SimHash script on ./temp_inp (column "text") and write its output
# to ./temp_simhash; the evaluation cell below reads temp_simhash/uf.pkl from there.
!python -m text_dedup.simhash --path ./temp_inp --local --column text --output temp_simhash --split train --debug \
    --bit_diff 6 \
    --num_bucket 7 \
    --ngram 3

In [8]:
def _recall(row):
    labelled_dups = set(row['duplicates'])
    LEN_LABELLED_DUPLICATES = len(labelled_dups)    
    if LEN_LABELLED_DUPLICATES == 0:
        return 1
    dups = set(row['predictions'])
    return len(dups & labelled_dups) / LEN_LABELLED_DUPLICATES

def _precision(row):
    labelled_dups = set(row['duplicates'])
    dups = set(row['predictions'])
    LEN_DUPLICATES = len(dups)
    if LEN_DUPLICATES == 0:
        return 0
    return len(dups & labelled_dups) / LEN_DUPLICATES

# SECURITY: pickle.load can execute arbitrary code from the file. Only load a
# uf.pkl that the text_dedup run in the previous cell wrote.
with open("temp_simhash/uf.pkl", "rb") as f:
    uf = pickle.load(f)

# Group every node under its cluster root. The lookup below resolves a row with
# `uf.find`, so the grouping must use the same key: if `uf.parent` maps a node to a
# non-root ancestor (possible whenever paths are not fully compressed — not
# verifiable from this notebook), keying on the raw parent would split one cluster
# across several buckets and silently drop neighbours.
id2cluster = defaultdict(set)
for node in list(uf.parent):  # snapshot the keys: `find` may rewrite `parent` entries
    id2cluster[uf.find(node)].add(node)

# core_id -> core_ids of every other row in the same predicted cluster.
predictions = {
    id2core_id[x["id"]]: {
        id2core_id[neighbor] for neighbor in id2cluster[uf.find(x["id"])] if neighbor != x["id"]
    }
    for x in truth
}

# One row per core_id with its labelled ("duplicates") and predicted ("predictions") sets.
df = (
    pd.Series(labels).to_frame("duplicates").reset_index()
    .merge(pd.Series(predictions).to_frame("predictions").reset_index(), on="index")
)

# A row is "Correct" only when the predicted set equals the labelled set exactly.
df['Correct'] = df.apply(lambda row: set(row['duplicates']) == set(row['predictions']), axis=1).astype(int)
prediction_summary = { 'Correct' : df['Correct'].sum(), 'Incorrect' : df.shape[0] - df['Correct'].sum() }
prediction_summary['Accuracy'] = round(prediction_summary['Correct'] / df.shape[0], 4)

# Recall and precision are computed per row and then averaged over all rows.
recalls = df.apply(_recall, axis=1)
prediction_summary['Recall'] = round(recalls.mean(), 4)

precisions = df.apply(_precision, axis=1)
prediction_summary['Precision'] = round(precisions.mean(), 4)

prediction_summary

{'Correct': 81371,
 'Incorrect': 18629,
 'Accuracy': 0.8137,
 'Recall': 0.8332,
 'Precision': 0.347}

[Deduplication of Scholarly Documents using Locality Sensitive Hashing and Word Embeddings](https://aclanthology.org/2020.lrec-1.113) (Gyawali et al., LREC 2020)

In [9]:
def classify_in_paper(record):
    """Label one row as 'TP', 'FP', 'TN' or 'FN'.

    `record` must support `record['duplicates']` (labelled duplicates) and
    `record['predictions']` (predicted duplicates), each an iterable of
    hashable ids.

    Rules:
      * no predictions and no labelled duplicates   -> 'TN'
      * no predictions but some labelled duplicates -> 'FN'
      * predictions present, at least one labelled duplicate, and every
        labelled duplicate is among the predictions -> 'TP'
      * any other row with predictions              -> 'FP'

    The four cases are exhaustive, so no fallback error is needed.
    """
    duplicates = set(record['duplicates'])
    predictions = set(record['predictions'])

    # Nothing predicted: the row is a negative, true only if nothing was labelled.
    if not predictions:
        return 'FN' if duplicates else 'TN'

    # Something predicted: a true positive must cover every labelled duplicate.
    if duplicates and duplicates.issubset(predictions):
        return 'TP'
    return 'FP'

def inverse(label: str) -> str:
    """Swap the positive and negative class in a confusion label.

    'TP' <-> 'TN' and 'FP' <-> 'FN'. Any other label raises KeyError.
    """
    originals = ('TP', 'FN', 'FP', 'TN')
    flipped = ('TN', 'FP', 'FN', 'TP')
    return dict(zip(originals, flipped))[label]

# Label every row, then derive the mirrored labelling in which the positive and
# negative classes trade places.
df['Class'] = df.apply(classify_in_paper, axis=1)
df['Class_'] = df['Class'].map(inverse)

# Precision / recall / F1 for each labelling; the macro F1 is the mean of the two.
f1s = []
for col in ('Class', 'Class_'):
    label_counts = df[col].value_counts()
    tp, fp, fn = (label_counts[key] for key in ('TP', 'FP', 'FN'))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')
    f1s.append(f1)
print(f'Macro Average F1: {sum(f1s) / len(f1s):.4f}, Accuracy: {df["Correct"].mean():.4f}')

Precision: 0.9007, Recall: 0.6786, F1: 0.7740
Precision: 0.7681, Recall: 0.9343, F1: 0.8431
Macro Average F1: 0.8086, Accuracy: 0.8137


These numbers seem too good to be true compared with those reported in the paper. Let's double-check the paper's results by treating records with exactly the same title as duplicates.

In [10]:
# Title-match baseline: records whose `processed_title` strings are identical are
# treated as duplicates of one another.
title2core_ids = defaultdict(set)
for record in ds:
    title2core_ids[record['processed_title']].add(int(record['core_id']))

# For each record, collect the other core_ids that share its title.
matches = ds.map(
    lambda row: {
        'matches': set(x for x in title2core_ids[row["processed_title"]] if x != int(row["core_id"]))
    }
)
matches = {int(x["core_id"]): x["matches"] for x in matches}

# Reuse the labelled duplicates from `df`, swapping in the title-match predictions;
# the Correct / Class / Class_ columns carried over from `df` are recomputed below.
ddf = (
    pd.Series(matches)
    .to_frame("predictions")
    .reset_index()
    .merge(df.drop("predictions", axis=1), on="index")
)
ddf["Correct"] = ddf.apply(lambda row: set(row['duplicates']) == set(row['predictions']), axis=1).astype(int)
ddf['Class'] = ddf.apply(classify_in_paper, axis=1)
ddf['Class_'] = ddf['Class'].map(inverse)

# Precision / recall / F1 for each labelling; the macro F1 is the mean of the two.
f1s = []
for col in ('Class', 'Class_'):
    label_counts = ddf[col].value_counts()
    tp, fp, fn = (label_counts[key] for key in ('TP', 'FP', 'FN'))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')
    f1s.append(f1)
print(f'Macro Average F1: {sum(f1s) / len(f1s):.4f}, Accuracy: {ddf["Correct"].mean():.4f}')

Map: 100%|██████████| 100000/100000 [00:05<00:00, 19096.76 examples/s]


Precision: 0.8302, Recall: 0.5521, F1: 0.6632
Precision: 0.7098, Recall: 0.9065, F1: 0.7962
Macro Average F1: 0.7297, Accuracy: 0.7456


This is strange: the precision and accuracy match those reported in the paper, but the recall does not.

In [11]:
# Don't forget to clean up the cache files associated with `ds`.
ds.cleanup_cache_files()

11

In [12]:
%%capture --no-display
# Delete only what this notebook created: the preprocessed input (temp_inp) and the
# two text_dedup output directories (temp, temp_simhash). Paths are listed explicitly
# because a `temp*` glob would also remove unrelated files that happen to share the
# prefix and names temp_inp twice. Nothing in this notebook writes to ../temp, so it
# is left alone. -f keeps the cell re-runnable when a path is already gone.
!rm -rf temp_inp temp temp_simhash