In [1]:
# One of the reviewers asked about duplicates. This script tries to find all duplicates.
import csv, math
from polyleven import levenshtein
from tqdm import trange

In [2]:
# Path of the CSV file that is scanned for duplicated texts.
DATASET = '../../Dataset/Text-Data/RP-Mod-Crowd.csv'
# File name of the dataset, i.e. the last '/'-separated component of the path.
NAME = DATASET.rsplit('/', 1)[-1]
# Relative edit-distance limit for near duplicates, as a fraction of the
# longer text's length (0.15 == 15%).
TRESHOLD_NEAR = 0.15

In [3]:
# Load dataset row-wise: every CSV record becomes one dict keyed by the header row.
# newline='' hands line-ending handling to the csv module, which is required for
# quoted fields that contain embedded line breaks to be read back unchanged.
# NOTE(review): the file is decoded with the platform default encoding - confirm
# that this matches the encoding of the dataset.
with open(DATASET, newline='') as f:
    rows_list = list(csv.DictReader(f))

In [4]:
real_duplicates = []
near_duplibates = []
# Ids already collected per result list. Keeping them as sets makes the
# membership test O(1) instead of rebuilding a set from the list on every match,
# and guarantees that an id is never collected twice.
real_ids = set()
near_ids = set()

# We want to find real and near duplicates. A pair is a real duplicate when the
# edit distance is 0 and a near duplicate when the distance is at least 1 but at
# most TRESHOLD_NEAR (15%) of the longer text's length.
# Every row is compared against all rows that precede it. Rows are visited from
# last to first; rows_list itself is not modified, so this cell can be re-run
# without reloading the dataset.
texts = [row['Text'] for row in rows_list]
n_rows = len(rows_list)
for step in trange(n_rows):
    base_idx = n_rows - 1 - step
    base_row = rows_list[base_idx]
    base_text = texts[base_idx]
    base_len = len(base_text)
    for idx in range(base_idx):
        text = texts[idx]
        # levenshtein does not accept a relative but only an absolute distance,
        # thus the threshold is converted using the longer of the two texts.
        length = max(base_len, len(text))
        max_ths = round(length * TRESHOLD_NEAR)
        # The edit distance can never be smaller than the difference in length,
        # so such a pair is neither a real nor a near duplicate.
        if abs(base_len - len(text)) > max_ths:
            continue
        # The max_ths reduces runtime significantly
        dist = levenshtein(base_text, text, max_ths)
        if dist == 0:  # Found real duplicate
            bucket, seen = real_duplicates, real_ids
        elif dist <= max_ths:  # Found near duplicate
            bucket, seen = near_duplibates, near_ids
        else:
            continue
        # Collect both rows of the pair, each id at most once per result list.
        for match in (base_row, rows_list[idx]):
            if match['id'] not in seen:
                seen.add(match['id'])
                bucket.append(match)

100%|██████████| 85000/85000 [3:47:34<00:00,  6.22it/s]  


In [5]:
# Report how many rows ended up in each result list, one count per line.
print(
    f'Real duplicates: {len(real_duplicates)}',
    f'Near duplicates: {len(near_duplibates)}',
    sep='\n',
)

Real duplicates: 815
Near duplicates: 677


In [6]:
# Write the real duplicates to a CSV file, using the keys of the first row as header.
# newline='' is required by the csv module for files it writes to; without it
# extra blank lines appear on platforms that use \r\n line endings.
with open('real_duplicates.csv', 'w', newline='') as f:
    # With no duplicates there is no row to take the header from; the file is
    # then left empty instead of failing with an IndexError.
    if real_duplicates:
        dict_writer = csv.DictWriter(f, real_duplicates[0].keys())
        dict_writer.writeheader()
        dict_writer.writerows(real_duplicates)

In [7]:
# Write the near duplicates to a CSV file, using the keys of the first row as header.
# newline='' is required by the csv module for files it writes to; without it
# extra blank lines appear on platforms that use \r\n line endings.
with open('near_duplibates.csv', 'w', newline='') as f:
    # With no near duplicates there is no row to take the header from; the file
    # is then left empty instead of failing with an IndexError.
    if near_duplibates:
        dict_writer = csv.DictWriter(f, near_duplibates[0].keys())
        dict_writer.writeheader()
        dict_writer.writerows(near_duplibates)