In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from datasketch import MinHash, MinHashLSH
from hashlib import md5

# Steps
- Shingling: Convert each document into a set of k-shingles (substrings of length k).
- MinHashing: Generate MinHash signatures for each document.
- Near-Duplicate Detection: Compare MinHash signatures to identify near-duplicates.

We'll use the datasketch library for MinHashing.

Explanation
- Shingling: The get_shingles function converts each document into a set of k-shingles, which are hashed to create unique identifiers for each shingle.
- MinHashing: The minhash_signature function creates a MinHash signature for a given set of shingles.
- LSH (Locality Sensitive Hashing): The deduplicate_documents function uses LSH to efficiently identify near-duplicate documents based on their MinHash signatures.

Parameters
- k=10: Length of each shingle in characters (the default used by get_shingles; adjust based on the size and nature of the documents).
- num_perm=128: Number of permutations for MinHash (higher values increase accuracy but also computational cost).
- threshold=0.8: Jaccard similarity threshold above which LSH treats documents as near-duplicates; matching is approximate (adjust based on desired sensitivity).


In [7]:
def get_shingles(doc, k=10):
    """Return the set of MD5-hashed character k-shingles of ``doc``.

    Every contiguous substring of length ``k`` is UTF-8 encoded, hashed with
    MD5, and stored as a hex digest, so repeated shingles collapse to a single
    entry.

    Args:
        doc: The document text (a ``str``).
        k: Shingle length in characters. Must be at least 1.

    Returns:
        A ``set`` of 32-character hex digest strings. The set is empty only
        when ``doc`` itself is empty.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if not doc:
        return set()
    if len(doc) < k:
        # No full-length window fits. Use the whole text as one shingle so
        # that short documents do not all share the same empty shingle set
        # (which would make them indistinguishable downstream).
        return {md5(doc.encode('utf-8')).hexdigest()}
    return {
        md5(doc[start:start + k].encode('utf-8')).hexdigest()
        for start in range(len(doc) - k + 1)
    }

In [8]:
def minhash_signature(shingles, num_perm=128):
    """Build a MinHash object from an iterable of string shingles.

    Args:
        shingles: Iterable of ``str`` tokens; each one is UTF-8 encoded
            before being fed to the MinHash.
        num_perm: Number of permutations passed to ``MinHash``.

    Returns:
        The ``MinHash`` instance after it has been updated with every shingle.
    """
    signature = MinHash(num_perm=num_perm)
    encoded_tokens = (token.encode('utf-8') for token in shingles)
    for payload in encoded_tokens:
        signature.update(payload)
    return signature

In [14]:
def deduplicate_documents(documents, threshold=0.8, k=10, num_perm=128):
    """Drop near-duplicate documents, keeping the first of each group.

    Each document is shingled, turned into a MinHash signature, and inserted
    into a ``MinHashLSH`` index under the key ``doc<i>``. Documents are then
    visited in their original order; one is kept only if none of the keys
    returned by its LSH query has already been claimed by an earlier kept
    document.

    Args:
        documents: Iterable of document strings.
        threshold: Similarity threshold passed to ``MinHashLSH``.
        k: Shingle length passed to ``get_shingles``.
        num_perm: Number of permutations, used for BOTH the signatures and
            the LSH index so the two always agree.

    Returns:
        A list of the retained documents, in input order.

    Note:
        Matches come straight from the LSH query and are not re-checked
        against an exact similarity score, so results are approximate.
    """
    # Materialise once so generators work and the second pass stays aligned.
    docs = list(documents)

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    signatures = []
    for index, doc in enumerate(docs):
        signature = minhash_signature(get_shingles(doc, k=k), num_perm=num_perm)
        lsh.insert(f'doc{index}', signature)
        signatures.append(signature)

    unique_docs = []
    seen = set()  # keys already covered by a document we decided to keep
    for doc, signature in zip(docs, signatures):
        matches = lsh.query(signature)
        if seen.isdisjoint(matches):
            unique_docs.append(doc)
            seen.update(matches)

    return unique_docs

# Testing

In [15]:
# Demo corpus: contains one exact duplicate plus several reworded variants.
documents = [
    "This is a sample document.",
    "This document is a sample.",
    "This document is a sample.",
    "Sample document is this.",
    "Completely different document.",
    "Another totally different document.",
]

unique_docs = deduplicate_documents(documents)

# Header first, then one retained document per line.
for line in ["Unique Documents:", *unique_docs]:
    print(line)

Unique Documents:
This is a sample document.
This document is a sample.
Sample document is this.
Completely different document.
Another totally different document.
