

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CATrian1090/Tanase-AMD-2025/blob/main/TanaseAmd.ipynb)


In [None]:
pip install datasketch

Collecting datasketch
  Downloading datasketch-1.6.5-py3-none-any.whl.metadata (5.8 kB)
Downloading datasketch-1.6.5-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasketch
Successfully installed datasketch-1.6.5


In [None]:
# Configuration
# USE_SAMPLE: when True, the cleaning step works on a random subset of the reviews.
# SAMPLE_SIZE: upper bound on how many reviews that subset contains.
USE_SAMPLE = True
SAMPLE_SIZE = 10_000

import os
import pandas as pd
from datasketch import MinHash, MinHashLSH
from tqdm import tqdm


In [None]:
os.environ['KAGGLE_USERNAME'] = "XXXXXX"
os.environ['KAGGLE_KEY'] = "XXXXXX"

!kaggle datasets download -d mohamedbakhet/amazon-books-reviews
!unzip -q "*.zip" -d /content/

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 97% 1.03G/1.06G [00:07<00:00, 94.8MB/s]
100% 1.06G/1.06G [00:07<00:00, 159MB/s] 


In [None]:
# Load data
# Read the ratings CSV (relative to the working directory) and report the
# number of rows before any cleaning is applied.
df = pd.read_csv('Books_rating.csv')
total_reviews = len(df)
print(f"Total reviews: {total_reviews}")

Total reviews: 3000000


In [None]:
# Remove missing reviews and duplicates
# Step 1: keep only rows that actually have review text.
df = df[df['review/text'].notna()]
print(f"Reviews after removing missing: {len(df)}")

# Step 2: of every group of byte-identical review texts, keep the first row.
df = df[~df.duplicated(subset=['review/text'], keep='first')]
print(f"Reviews after removing exact duplicates: {len(df)}")

# Sample data if specified
# The fixed random_state makes the subset identical on every run; the sample
# size is capped at the number of rows that survived cleaning.
if USE_SAMPLE:
    sample_count = min(SAMPLE_SIZE, len(df))
    df = df.sample(n=sample_count, random_state=42)
    print(f"Using sample of {len(df)} reviews")

Reviews after removing missing: 2999992
Reviews after removing exact duplicates: 2062648
Using sample of 10000 reviews


In [None]:
# Text Preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download NLTK resources
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize; without it word_tokenize raises LookupError and the notebook
# would quietly fall back to whitespace splitting. 'punkt' is kept for older
# NLTK releases that still load the original resource.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, quiet=True)

def safe_tokenize(text):
    """Tokenize ``text`` with NLTK, degrading to whitespace splitting if needed.

    Args:
        text: The string to split into tokens.

    Returns:
        The result of ``word_tokenize(text)``. If NLTK raises ``LookupError``
        (its tokenizer data is not installed), the result of ``text.split()``.

    Any other exception is propagated instead of being swallowed, so genuine
    bugs and keyboard interrupts are not hidden behind the fallback.
    """
    try:
        return word_tokenize(text)
    except LookupError:
        # Missing tokenizer data is the one recoverable failure. Surface it so
        # the reader knows results come from the cruder fallback tokenizer.
        import warnings
        warnings.warn(
            "NLTK tokenizer data not found; falling back to whitespace tokenization",
            RuntimeWarning,
            stacklevel=2,
        )
        return text.split()

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Replace each punctuation character with a space rather than deleting it.
# Deleting glues neighbouring words together whenever the author omitted the
# space ("adventure!Great" -> "adventuregreat"), which creates bogus tokens and
# lowers the measured overlap between otherwise similar reviews.
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

def preprocess_text(text):
    """Turn a raw review into a canonical tuple of distinct lemmatized words.

    The text is lower-cased, punctuation is turned into whitespace, and the
    result is tokenized with ``safe_tokenize``. Tokens that are stop words or
    have two or fewer characters are dropped; the rest are lemmatized,
    de-duplicated and sorted.

    Args:
        text: The review text. Anything that is not a ``str`` is treated as
            having no content.

    Returns:
        A sorted tuple of unique lemmas; an empty tuple for non-string input.
    """
    if not isinstance(text, str):
        return tuple()
    words = safe_tokenize(text.lower().translate(translator))
    # The stop-word and length filters run on the raw token, before lemmatization.
    lemmas = {
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words and len(word) > 2
    }
    return tuple(sorted(lemmas))

# Attach the canonical token tuple to every review.
df['tokens'] = df['review/text'].apply(preprocess_text)

# Reviews whose token tuples are identical count as duplicates: keep the first.
df = df[~df.duplicated(subset=['tokens'], keep='first')]

# Discard reviews with fewer than 5 distinct tokens, then renumber the rows
# from 0 so that positional and label indexing agree.
long_enough = df['tokens'].map(len) >= 5
df = df[long_enough].reset_index(drop=True)
print(f"Final reviews: {len(df)}")

Final reviews: 9983


In [None]:
# MinHashing and LSH
def create_minhash(tokens, num_perm=128):
    """Build a MinHash signature from an iterable of string tokens.

    Args:
        tokens: The tokens to hash; each one is UTF-8 encoded before being
            fed to the signature.
        num_perm: Number of permutations passed to ``MinHash``.

    Returns:
        The ``MinHash`` object after it has been updated with every token.
    """
    signature = MinHash(num_perm=num_perm)
    encoded_tokens = (token.encode('utf8') for token in tokens)
    for encoded in encoded_tokens:
        signature.update(encoded)
    return signature

# One signature per review, in the same order as the rows of df.
minhashes = [
    create_minhash(tokens, 128)
    for tokens in tqdm(df['tokens'], desc="Computing MinHashes")
]

# Index every signature under its row position (stored as a string key) so
# query results can be mapped back to rows of df.
lsh = MinHashLSH(threshold=0.5, num_perm=128)
for position, signature in enumerate(tqdm(minhashes, desc="Building LSH")):
    lsh.insert(str(position), signature)

Computing MinHashes: 100%|██████████| 9983/9983 [00:20<00:00, 498.63it/s]
Building LSH: 100%|██████████| 9983/9983 [00:01<00:00, 9449.08it/s] 


In [None]:


# Finding Similar Pairs and Results
def jaccard_similarity(tuple1, tuple2):
    """Return the exact Jaccard similarity of two token collections.

    Both arguments are converted to sets, so repeated elements count once.

    Args:
        tuple1: First collection of hashable items.
        tuple2: Second collection of hashable items.

    Returns:
        ``|A & B| / |A | B|`` as a float, or ``0.0`` when both are empty.
    """
    left, right = set(tuple1), set(tuple2)
    combined = left | right
    if not combined:
        return 0.0
    return len(left & right) / len(combined)
print("Finding similar pairs...")

# Read the token column into a plain list once. Looking a row up with
# df.iloc[i]['tokens'] inside the loop builds a whole row Series per access;
# df was re-indexed from 0, so list position k is the same row as df.iloc[k].
token_tuples = df['tokens'].tolist()
similar_pairs = []

for i, minhash in enumerate(tqdm(minhashes, desc="Finding candidates")):
    # LSH returns the string keys of likely-similar signatures (including i itself).
    for candidate_str in lsh.query(minhash):
        j = int(candidate_str)
        if j <= i:
            # Skip self-matches and pairs already seen from the other side.
            continue
        # Calculate exact Jaccard similarity
        similarity = jaccard_similarity(token_tuples[i], token_tuples[j])
        if similarity > 0.0:
            similar_pairs.append((similarity, i, j))

# Sort by similarity and get top pairs
# Tuples compare element-wise, so ties on similarity are ordered by (i, j), descending.
similar_pairs.sort(reverse=True)
top_similar_pairs = similar_pairs[:20]

print(f"\nTop 20 Most Similar Review Pairs:")
print("=" * 80)

# Positional access into the text column; i and j are row positions in df.
review_texts = df['review/text']
for rank, (similarity, i, j) in enumerate(top_similar_pairs, start=1):
    review1 = review_texts.iloc[i]
    review2 = review_texts.iloc[j]

    # Show at most 150 characters of each review, marking any cut with "...".
    if len(review1) > 150:
        review1_display = review1[:150] + "..."
    else:
        review1_display = review1
    if len(review2) > 150:
        review2_display = review2[:150] + "..."
    else:
        review2_display = review2

    print(f"\nRank {rank}: Jaccard Similarity = {similarity:.4f}")
    print(f"Review {i}: {review1_display}")
    print(f"Review {j}: {review2_display}")
    print("-" * 80)


Finding similar pairs...


Finding candidates: 100%|██████████| 9983/9983 [00:00<00:00, 20033.07it/s]


Top 20 Most Similar Review Pairs:

Rank 1: Jaccard Similarity = 0.5000
Review 4719: Absolutely wonderful series of books. I can't wait to read the next one and then I start all over again!
Review 4942: I really got into this book and can't wait for the next one. It was a wonderful story from start to finish.
--------------------------------------------------------------------------------

Rank 2: Jaccard Similarity = 0.5000
Review 20: I really enjoyed this book; it's a must read
Review 3357: My son is 7 and I read this book aloud to him. He really enjoyed the adventure!Great read!
--------------------------------------------------------------------------------

Rank 3: Jaccard Similarity = 0.4545
Review 5546: Good book. great its free. It took me no time at all to get hooked. This is highly recommended. Great!
Review 7188: It took some time for me to get into this book, but when I did...it was GREAT!!
--------------------------------------------------------------------------------

Ra




In [None]:
# Save results to CSV
def _review_text_at(position):
    """Return the review text stored at the given row position of df."""
    return df.iloc[position]['review/text']

# One row per top pair: the similarity score and both row positions...
results_df = pd.DataFrame(
    top_similar_pairs,
    columns=['similarity', 'index1', 'index2'],
)
# ...plus the full text of each review so the file is readable on its own.
results_df['review1'] = results_df['index1'].apply(_review_text_at)
results_df['review2'] = results_df['index2'].apply(_review_text_at)

results_df.to_csv('similar_review_pairs.csv', index=False)
print(f"\nResults saved to 'similar_review_pairs.csv'")


Results saved to 'similar_review_pairs.csv'
