In [None]:
import re
import random
import numpy as np


In [None]:
# Step 1: Shingling
def get_shingles(text, k=3):
    """Return the set of character k-shingles of ``text``.

    The text is lower-cased and every run of non-word characters
    (whitespace, punctuation, ...) is removed before shingling, so a
    shingle may span what used to be a word boundary.

    Parameters
    ----------
    text : str
        Document to shingle.
    k : int, default 3
        Shingle length in characters; must be at least 1.

    Returns
    -------
    set of str
        Every distinct length-``k`` substring of the cleaned text. The set
        is empty when the cleaned text is shorter than ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 (such values would otherwise silently
        produce empty or truncated "shingles").
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    cleaned = re.sub(r'\W+', '', text.lower())  # strip everything but word characters
    return {cleaned[start:start + k] for start in range(len(cleaned) - k + 1)}

# Two short example documents with overlapping vocabulary.
doc1, doc2 = "Machine learning is fun to learn", "Learning machines can be fun"

# Character 3-shingle set for each document.
shingles1, shingles2 = (get_shingles(document, k=3) for document in (doc1, doc2))

# Preview at most five shingles per document (sets are unordered, so which
# five are shown is arbitrary).
print("Shingles1:", [*shingles1][:5])
print("Shingles2:", [*shingles2][:5])

# Step 2: Jaccard Similarity
def jaccard(set1, set2):
    """Return the Jaccard similarity ``|set1 & set2| / |set1 | set2|``.

    Parameters
    ----------
    set1, set2 : set
        The two sets to compare.

    Returns
    -------
    float
        A value in ``[0, 1]``. Two empty sets are treated as identical and
        give ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    union_size = len(set1 | set2)
    if union_size == 0:
        # Both sets are empty: the ratio is 0/0, so define it as "identical".
        return 1.0
    return len(set1 & set2) / union_size

# Exact Jaccard similarity of the two documents' shingle sets.
true_jaccard = jaccard(set1=shingles1, set2=shingles2)
print("Jaccard Similarity:", true_jaccard)

# Step 3: MinHashing
def minhash(shingle_sets, num_hashes=100, seed=None):
    """Compute MinHash signatures for a collection of shingle sets.

    Each distinct shingle is assigned an integer index ``x``. Every one of
    the ``num_hashes`` hash functions has the form
    ``h(x) = (a * x + b) mod p`` with the prime ``p = 2**31 - 1``, ``a`` drawn
    uniformly from ``[1, p)`` and ``b`` from ``[0, p)``. A set's signature
    entry for a hash function is the smallest hash value over its members,
    so the fraction of positions on which two signatures agree estimates
    the Jaccard similarity of the two sets.

    Parameters
    ----------
    shingle_sets : sequence of set
        The sets to summarise.
    num_hashes : int, default 100
        Number of hash functions, i.e. the signature length.
    seed : int or None, default None
        When given, the coefficients come from a private
        ``np.random.RandomState(seed)`` so the result is repeatable. When
        ``None``, NumPy's global random state is used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(shingle_sets), num_hashes)``. The row of
        an empty set is left at ``inf``.
    """
    universe_set = set().union(*shingle_sets)
    try:
        # A sorted universe makes the shingle -> index mapping independent of
        # set iteration order (which varies between runs for strings).
        universe = sorted(universe_set)
    except TypeError:
        # Elements that cannot be ordered: fall back to iteration order.
        universe = list(universe_set)
    index_of = {shingle: j for j, shingle in enumerate(universe)}

    signatures = np.full((len(shingle_sets), num_hashes), np.inf)

    # The coefficients must span the whole range of the modulus. If they are
    # small, a*x + b never wraps around p, every hash is increasing in x, and
    # all hash functions select the same minimum element. With p < 2**31 the
    # products stay far inside int64 and every hash value is exactly
    # representable in the float64 signature array.
    p = 2**31 - 1
    rng = np.random if seed is None else np.random.RandomState(seed)
    a = rng.randint(1, p, size=num_hashes, dtype=np.int64)
    b = rng.randint(0, p, size=num_hashes, dtype=np.int64)

    for i, shingles in enumerate(shingle_sets):
        if not shingles:
            continue  # nothing to hash; the row stays at inf
        member_idx = np.fromiter(
            (index_of[shingle] for shingle in shingles),
            dtype=np.int64,
            count=len(shingles),
        )
        # Shape (num_hashes, members): every hash function applied to every
        # member, then reduced to the per-function minimum.
        hashes = (a[:, None] * member_idx[None, :] + b[:, None]) % p
        signatures[i] = hashes.min(axis=1)
    return signatures

# Seed NumPy's global RNG so the randomly drawn hash coefficients, and
# therefore the printed estimate, do not change from one run to the next.
np.random.seed(42)

signatures = minhash([shingles1, shingles2], num_hashes=100)
# Fraction of signature positions on which the two documents agree.
approx_jaccard = np.mean(signatures[0] == signatures[1])

print("Approx Jaccard (MinHash):", approx_jaccard)

Shingles1: ['ine', 'fun', 'ele', 'mac', 'chi']
Shingles2: ['anb', 'gma', 'ine', 'fun', 'esc']
Jaccard Similarity: 0.375
Approx Jaccard (MinHash): 0.0
