In [47]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

class LSH:
    """Random-hyperplane locality-sensitive hash producing bit-string signatures.

    ``planes`` holds ``num_hashes`` random Gaussian vectors of length ``dim``.
    Each signature bit records whether the hashed vector has a non-negative
    dot product with the corresponding plane.
    """

    def __init__(self, num_hashes, dim, seed=None):
        """Draw the random hyperplanes.

        Args:
            num_hashes: Number of hyperplanes, i.e. the signature length.
            dim: Length of the vectors that will be hashed.
            seed: Optional seed for reproducible planes. When given, the
                planes come from a private generator, so the same seed
                always yields the same planes. When ``None`` (default) the
                global NumPy random state is used, as in earlier versions.
        """
        self.num_hashes = num_hashes
        if seed is None:
            # Legacy path: honours any np.random.seed(...) set by the caller.
            self.planes = np.random.randn(num_hashes, dim)  # Random hyperplanes
        else:
            rng = np.random.default_rng(seed)
            self.planes = rng.standard_normal((num_hashes, dim))

    def hash(self, vector):
        """Return the signature of ``vector`` as a string of '0'/'1' characters.

        Bit ``k`` is '1' when the dot product of ``vector`` with plane ``k``
        is >= 0 and '0' otherwise, so the all-zero vector hashes to all '1's.

        Args:
            vector: Array-like holding ``dim`` numbers.

        Returns:
            A string with one character per plane.

        Raises:
            ValueError: If the number of elements in ``vector`` does not
                match the plane dimensionality (raised by NumPy).
        """
        # Flatten so that a (1, dim) row is accepted like a plain 1-D vector.
        flat = np.asarray(vector).ravel()
        # One matrix-vector product instead of a Python-level dot per plane.
        projections = self.planes @ flat
        return ''.join('1' if value >= 0 else '0' for value in projections)

# Toy corpus; the first two sentences differ by a single word.
docs = [
    "the cat sat on the mat",
    "the cat sat on a mat",
    "dogs are in the yard",
    "the quick brown fox jumps over the lazy dog",
]

# Step 1: build a dense TF-IDF matrix with one row per document
vectorizer = TfidfVectorizer()
doc_term = vectorizer.fit_transform(docs)
tfidf_matrix = doc_term.toarray()

# Step 2: group documents by their 10-bit LSH signature
n_features = tfidf_matrix.shape[1]
lsh = LSH(num_hashes=10, dim=n_features)
buckets = {}

for doc_id, (row, text) in enumerate(zip(tfidf_matrix, docs)):
    # Documents whose vectors produce the same signature share a bucket.
    buckets.setdefault(lsh.hash(row), []).append((doc_id, text))

# Step 3: report only the buckets that hold more than one document
print("Buckets with near-duplicate documents:")
for signature, members in buckets.items():
    if len(members) <= 1:
        continue
    print(f"\nBucket Hash: {signature}")
    for doc_id, text in members:
        print(f"Doc {doc_id}: {text}")


Buckets with near-duplicate documents:

Bucket Hash: 0010010101
Doc 0: the cat sat on the mat
Doc 1: the cat sat on a mat
