In [1]:
#  Environment setup and dataset download

import os

os.environ['KAGGLE_USERNAME'] = "alessandro777"
os.environ['KAGGLE_KEY'] = "01d9dd7620e53c0dbe1ea4ba0a4eb7a7"

!kaggle datasets download -d mohamedbakhet/amazon-books-reviews

!unzip -q amazon-books-reviews.zip


Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 99% 1.05G/1.06G [00:03<00:00, 281MB/s]
100% 1.06G/1.06G [00:03<00:00, 296MB/s]


In [2]:
# Dataset loading and sampling

import pandas as pd

df = pd.read_csv("Books_rating.csv")

df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [3]:
df_small = df.head(1000).copy()
df_small.shape

(1000, 10)

In [4]:
# Text preprocessing

import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    if pd.isnull(text):
        return []

    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    return tokens

In [5]:
# Tokenization example output

df_small["tokens"] = df_small["review/text"].apply(preprocess_text)

df_small[["review/text", "tokens"]].head()

Unnamed: 0,review/text,tokens
0,This is only for Julie Strain fans. It's a col...,"[julie, strain, fans, collection, photos, 80, ..."
1,I don't care much for Dr. Seuss but after read...,"[dont, care, dr, seuss, reading, philip, nels,..."
2,"If people become the books they read and if ""t...","[people, books, read, child, father, man, dr, ..."
3,"Theodore Seuss Geisel (1904-1991), aka &quot;D...","[theodore, seuss, geisel, 19041991, aka, quotd..."
4,Philip Nel - Dr. Seuss: American IconThis is b...,"[philip, nel, dr, seuss, american, iconthis, b..."


In [6]:
# Jaccard similarity function

from itertools import combinations

def jaccard_similarity(set1, set2):
    if not set1 or not set2:
        return 0.0
    return len(set1 & set2) / len(set1 | set2)

# Brute-force similarity detection

sample = df_small.head(1000)

similar_pairs = []

for i, j in combinations(range(len(sample)), 2):
    tokens_i = set(sample.iloc[i]["tokens"])
    tokens_j = set(sample.iloc[j]["tokens"])
    sim = jaccard_similarity(tokens_i, tokens_j)

    if sim >= 0.3:
        similar_pairs.append((i, j, sim))

print(f"Similar pairs found: {len(similar_pairs)}")

Similar pairs found: 23


In [7]:
sample = df_small.head(1000)

similar_pairs = []

for i, j in combinations(range(len(sample)), 2):
    tokens_i = set(sample.iloc[i]["tokens"])
    tokens_j = set(sample.iloc[j]["tokens"])
    sim = jaccard_similarity(tokens_i, tokens_j)

    if sim >= 0.05:
        similar_pairs.append((i, j, sim))

print(f"Similar pairs found: {len(similar_pairs)}")

Similar pairs found: 56665


In [8]:
# Displaying similar pairs (Jaccard)

for i, j, sim in similar_pairs[:5]:
    print(f"\n🔗 Jaccard similarity = {sim:.3f}")
    print(f"📝 Review {i}: {sample.iloc[i]['review/text'][:300]}")
    print(f"📝 Review {j}: {sample.iloc[j]['review/text'][:300]}")


🔗 Jaccard similarity = 0.061
📝 Review 0: This is only for Julie Strain fans. It's a collection of her photos -- about 80 pages worth with a nice section of paintings by Olivia.If you're looking for heavy literary content, this isn't the place to find it -- there's only about 2 pages with text and everything else is photos.Bottom line: if y
📝 Review 7: When I recieved this book as a gift for Christmas my first impression went along these line: "Oh great some professor who probably wrote alot of mumbo-jumbo that I don't want to know about Dr. Suess and I know it won't be an enjoyable read."Thanks goodness I read it nonetheless.To my pleasure, Nel h

🔗 Jaccard similarity = 0.053
📝 Review 0: This is only for Julie Strain fans. It's a collection of her photos -- about 80 pages worth with a nice section of paintings by Olivia.If you're looking for heavy literary content, this isn't the place to find it -- there's only about 2 pages with text and everything else is photos.Bottom line: if y
📝

In [9]:
# MinHash + LSH implementation

!pip install datasketch

from datasketch import MinHash, MinHashLSH

sample = df_small.head(1000).copy()

Collecting datasketch
  Downloading datasketch-1.6.5-py3-none-any.whl.metadata (5.8 kB)
Downloading datasketch-1.6.5-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasketch
Successfully installed datasketch-1.6.5


In [10]:
minhashes = []

for tokens in sample["tokens"]:
    m = MinHash(num_perm=128)
    for token in set(tokens):
        m.update(token.encode("utf8"))
    minhashes.append(m)

In [11]:
lsh = MinHashLSH(threshold=0.3, num_perm=128)

for i, m in enumerate(minhashes):
    lsh.insert(f"r{i}", m)

In [12]:
# Finding similar pairs using LSH

similar_pairs_lsh = []

for i in range(len(minhashes)):
    result = lsh.query(minhashes[i])
    for r in result:
        j = int(r[1:])  # da "r7" → 7
        if i < j:
            sim = jaccard_similarity(set(sample.iloc[i]["tokens"]),
                                     set(sample.iloc[j]["tokens"]))
            similar_pairs_lsh.append((i, j, sim))

print(f"Similar pairs found with LSH: {len(similar_pairs_lsh)}")

Similar pairs found with LSH: 582


In [13]:
for i, j, sim in similar_pairs_lsh:
    print(f"\n🔗 LSH similarity (Jaccard) = {sim:.3f}")
    print(f"📝 Review {i}: {sample.iloc[i]['review/text'][:300]}")
    print(f"📝 Review {j}: {sample.iloc[j]['review/text'][:300]}")


🔗 LSH similarity (Jaccard) = 0.027
📝 Review 1: I don't care much for Dr. Seuss but after reading Philip Nel's book I changed my mind--that's a good testimonial to the power of Rel's writing and thinking. Rel plays Dr. Seuss the ultimate compliment of treating him as a serious poet as well as one of the 20th century's most interesting visual arti
📝 Review 863: Ronnie Joe Waddell, who's been on death row for nearly a decade for a vicious murder, is executed, and Kay Scarpetta is called on to perform an autopsy on his body. At least, she thinks it's his body. On the same day of Waddell's execution, another murder occurs--this one eerily similar to the one c

🔗 LSH similarity (Jaccard) = 0.054
📝 Review 3: Theodore Seuss Geisel (1904-1991), aka &quot;Dr. Seuss,&quot; was one of the most influential writers and artists of the 20th century.In 1959, Rudolf Flesch wrote, &quot;A hundred years from now, children and their parents will still eagerly read the books of a fellow called Ted Geisel, 

In [14]:
# Overlap between brute-force and LSH

brute_set = set((i, j) for i, j, _ in similar_pairs)
lsh_set = set((i, j) for i, j, _ in similar_pairs_lsh)

common = brute_set & lsh_set

print(f"Common pairs bitween brute-force and LSH: {len(common)}")
print(f"Overlap percentage: {100 * len(common) / len(lsh_set):.2f}%")

Common pairs bitween brute-force and LSH: 520
Overlap percentage: 89.35%


In [15]:
# Sensitivity analysis on threshold values

for t in [0.3, 0.2, 0.1]:
    lsh = MinHashLSH(threshold=t, num_perm=128)
    for i, m in enumerate(minhashes):
        lsh.insert(f"r{i}", m)

    found = set()
    for i in range(len(minhashes)):
        results = lsh.query(minhashes[i])
        for r in results:
            j = int(r[1:])
            if i < j:
                found.add((i, j))

    print(f"Threshold {t} → pairs found: {len(found)}")

Threshold 0.3 → pairs found: 582
Threshold 0.2 → pairs found: 7321
Threshold 0.1 → pairs found: 16275


In [17]:
# Performance benchmarking (timing)

import time

start = time.time()

# brute-force loop
similar_pairs = []
for i, j in combinations(range(len(sample)), 2):
    tokens_i = set(sample.iloc[i]["tokens"])
    tokens_j = set(sample.iloc[j]["tokens"])
    sim = jaccard_similarity(tokens_i, tokens_j)
    if sim >= 0.05:
        similar_pairs.append((i, j, sim))

end = time.time()
print(f"Brute-force time: {end - start:.2f} seconds")

Brute-force time: 65.03 seconds


In [20]:
start = time.time()

minhashes = []
for tokens in sample["tokens"]:
    m = MinHash(num_perm=128)
    for token in set(tokens):
        m.update(token.encode("utf8"))
    minhashes.append(m)

lsh = MinHashLSH(threshold=0.3, num_perm=128)
for i, m in enumerate(minhashes):
    lsh.insert(f"r{i}", m)

similar_pairs_lsh = []
for i in range(len(minhashes)):
    result = lsh.query(minhashes[i])
    for r in result:
        j = int(r[1:])
        if i < j:
            sim = jaccard_similarity(set(sample.iloc[i]["tokens"]),
                                     set(sample.iloc[j]["tokens"]))
            similar_pairs_lsh.append((i, j, sim))

end = time.time()
print(f"MinHash + LSH time: {end - start:.2f} seconds")

MinHash + LSH time: 3.08 seconds
