<a href="https://colab.research.google.com/github/GHMelany/AMD_project/blob/main/AMD%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PROJECT: Finding similar items. Implement a detector of pairs of similar book reviews.


LIBRARIES

In [1]:
!pip install langid




In [2]:
# Disinstalla pyspark attuale
!pip uninstall -y pyspark

# Installa PySpark stabile
!pip install pyspark==3.4.1

# Imposta JAVA_HOME (Java 11 va bene per 3.4.1)
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Avvia Spark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("MyApp").getOrCreate()
print("Spark versione:", spark.version)


[0mCollecting pyspark==3.4.1
  Using cached pyspark-3.4.1-py2.py3-none-any.whl
Installing collected packages: pyspark
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dataproc-spark-connect 0.8.3 requires pyspark[connect]~=3.5.1, but you have pyspark 3.4.1 which is incompatible.[0m[31m
[0mSuccessfully installed pyspark-3.4.1
Spark versione: 3.4.1


In [3]:
import re, html, langid
import os
import zipfile
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, NGram, HashingTF, MinHashLSH
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import explode
import math
from pyspark.ml.feature import HashingTF, MinHashLSH
from pyspark.sql.functions import explode, col

# A threshold of -1 disables Spark's automatic broadcast joins.
# NOTE(review): presumably set so that the large self-join performed later in
# the notebook is never planned as a broadcast join — confirm the motivation.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)



In [4]:
os.environ['KAGGLE_USERNAME'] = "melanygomez"
os.environ['KAGGLE_KEY'] = "38db1cce93622035560027022e9cafc"

!pip install -q kaggle

from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

!kaggle datasets download -d mohamedbakhet/amazon-books-reviews

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
amazon-books-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# Unpack the downloaded archive into its own directory.
zip_path = "amazon-books-reviews.zip"
extract_dir = "amazon_books_reviews"
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as archive:
    archive.extractall(extract_dir)

# List every extracted file so the reader can see what the archive contained.
extracted_paths = (
    os.path.join(parent, name)
    for parent, _subdirs, names in os.walk(extract_dir)
    for name in names
)
for extracted in extracted_paths:
    print(extracted)

# Location of the ratings file used by the rest of the notebook.
folder = "amazon_books_reviews"
csv_path = os.path.join(folder, "Books_rating.csv")



amazon_books_reviews/books_data.csv
amazon_books_reviews/Books_rating.csv


In [6]:
# Read the ratings CSV with a header row and inferred column types; the
# double-quote character is used both as the quote and as the escape character.
csv_options = dict(header=True, inferSchema=True, quote='"', escape='"')
ratings_raw = spark.read.csv(csv_path, **csv_options)

# Keep only the identifier, the score and the review text (renamed to short
# column names) and discard rows that have no text.
df = (
    ratings_raw
    .select(
        "Id",
        F.col("review/score").alias("score"),
        F.col("review/text").alias("text"),
    )
    .dropna(subset=["text"])
)

In [7]:
# Work on a seeded random subset of the reviews.
SAMPLE_FRACTION = 0.05
SAMPLE_SEED = 42

df = df.sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
print("Initial sample:", df.count())

Campione iniziale: 150324


In [8]:
def _decode_and_normalise(txt):
    """Unescape HTML entities, lowercase, and collapse whitespace runs to one space.

    Returns an empty string for a missing or empty input.
    """
    if not txt:
        return ""
    return re.sub(r"\s+", " ", html.unescape(txt).lower())


def _keep_alphanumerics(t):
    """Replace each run of characters other than a-z, 0-9 and space with one space.

    Returns an empty string for a missing or empty input.
    """
    if not t:
        return ""
    return re.sub(r"[^a-z0-9 ]+", " ", t)


# Expose both helpers as string-returning Spark UDFs.
decode_clean = udf(_decode_and_normalise, StringType())
regex_clean = udf(_keep_alphanumerics, StringType())

# First pass: HTML decoding + lowercase conversion.
df = df.withColumn("review_clean", decode_clean(col("text")))
# Second pass: remove special characters from the already-lowercased text.
df = df.withColumn("review_clean", regex_clean(col("review_clean")))


In [9]:
def _label_sentiment(s):
    """Map a review score to a sentiment label.

    Scores above 3 are "positive", all other scores are "negative". A missing
    score yields None (no label) instead of being compared: `None > 3` raises
    TypeError in Python 3, which would fail the whole Spark job.
    """
    if s is None:
        return None
    return "positive" if s > 3 else "negative"


sentiment_udf = udf(_label_sentiment, StringType())
df = df.withColumn("sentiment", sentiment_udf(col("score")))

In [10]:
# Make the cell re-runnable: remove the output columns of a previous run.
# DataFrame.drop ignores names that are not present in the schema.
df = df.drop("tokens", "filtered_tokens")

# Split the cleaned text on non-word characters, then remove stop words.
tokenizer = RegexTokenizer(inputCol="review_clean", outputCol="tokens", pattern="\\W+")
stop_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
pipeline = Pipeline(stages=[tokenizer, stop_remover])

token_model = pipeline.fit(df)
df = token_model.transform(df)


In [11]:
# Keep only reviews whose number of filtered tokens lies within these bounds
# (both ends inclusive).
MIN_TOKENS = 5
MAX_TOKENS = 200

n_tokens = col("token_count")
df = df.withColumn("token_count", F.size(col("filtered_tokens")))
df = df.filter((n_tokens >= MIN_TOKENS) & (n_tokens <= MAX_TOKENS))
print("reviews after length filter:", df.count())

Recensioni dopo filtro lunghezza: 139612


In [12]:
def make_shingles(words, k=3):
    """Return the distinct k-word shingles of a token list.

    Each shingle is `k` consecutive tokens joined by a single space. Duplicate
    shingles are collapsed, so the order of the returned list is not
    meaningful. An empty or missing list, or one shorter than `k`, gives [].
    """
    if not words:
        return []
    last_start = len(words) - k
    if last_start < 0:
        return []
    unique = set()
    for start in range(last_start + 1):
        unique.add(" ".join(words[start:start + k]))
    return list(unique)

# Number of consecutive tokens per shingle.
SHINGLE_SIZE = 3

# UDF that turns a token array into its array of distinct shingles.
shingles_udf = udf(
    lambda tokens: make_shingles(tokens, SHINGLE_SIZE),
    ArrayType(StringType()),
)

# Attach the shingles and discard reviews that produced none.
with_shingles = df.withColumn("shingles", shingles_udf(col("filtered_tokens")))
df = with_shingles.filter(F.size(col("shingles")) > 0)

In [13]:
def find_similar(input_df, threshold=0.5, num_hash=20):
    """Find pairs of similar reviews with MinHash LSH over their shingle sets.

    Parameters
    ----------
    input_df : DataFrame
        Must provide the columns ``Id``, ``review_clean`` and ``shingles``
        (an array of strings).
    threshold : float
        Distance threshold handed to ``approxSimilarityJoin``. It is a Jaccard
        *distance*, so the pairs returned have similarity above
        ``1 - threshold``.
    num_hash : int
        Number of hash tables used by MinHashLSH.

    Returns
    -------
    DataFrame
        Columns ``similarity``, ``docA``, ``docB``, ``reviewA``, ``reviewB``.
        Empty (same columns) when ``input_df`` contains no shingles at all.
    """
    # Size the hashing space to the next power of two >= the number of distinct shingles.
    unique_terms = input_df.select(explode("shingles")).distinct().count()

    if unique_terms == 0:
        # math.log2(0) raises "math domain error"; with nothing to compare,
        # return an empty result that has the usual columns instead.
        empty = input_df.limit(0)
        return (empty.alias("datasetA")
                .crossJoin(empty.alias("datasetB"))
                .select(F.lit(0.0).alias("similarity"),
                        col("datasetA.Id").alias("docA"),
                        col("datasetB.Id").alias("docB"),
                        col("datasetA.review_clean").alias("reviewA"),
                        col("datasetB.review_clean").alias("reviewB")))

    features_dim = 2 ** math.ceil(math.log2(unique_terms))

    # binary=True: each shingle is recorded as present/absent (a set, not counts).
    tf = HashingTF(inputCol="shingles", outputCol="features", numFeatures=features_dim, binary=True)
    vectorized = tf.transform(input_df)

    lsh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=num_hash)
    model = lsh.fit(vectorized)

    # Self-join: every row is compared against the same dataset.
    candidates = model.approxSimilarityJoin(vectorized, vectorized, threshold=threshold, distCol="jaccard_dist")

    # Keeping only A.Id > B.Id removes self-pairs and the mirrored copy of each pair.
    # NOTE(review): this also discards pairs of different rows that share the
    # same Id — confirm that Id is unique per review.
    pairs = (candidates
             .filter(col("datasetA.Id") > col("datasetB.Id"))
             .withColumn("similarity", 1 - col("jaccard_dist"))
             .select("similarity",
                     col("datasetA.Id").alias("docA"),
                     col("datasetB.Id").alias("docB"),
                     col("datasetA.review_clean").alias("reviewA"),
                     col("datasetB.review_clean").alias("reviewB")))
    return pairs

In [14]:
# Split the reviews by sentiment; similar pairs are searched within each group.
pos_reviews = df.filter(col("sentiment") == "positive")
neg_reviews = df.filter(col("sentiment") == "negative")

# Similar pairs via MinHash LSH. The results are cached because each one feeds
# several actions below (count + show, directly and through the filtered
# views); without caching every action would re-run the whole similarity join.
pairs_pos = find_similar(pos_reviews, threshold=0.5).cache()
pairs_neg = find_similar(neg_reviews, threshold=0.5).cache()

# Pairs whose similarity lies in the 0.8 - 0.95 band (both ends inclusive).
mod_pos = pairs_pos.filter((col("similarity") >= 0.8) & (col("similarity") <= 0.95))
mod_neg = pairs_neg.filter((col("similarity") >= 0.8) & (col("similarity") <= 0.95))

# Summary counts
print(f" Total similar positive pairs: {pairs_pos.count()}")
print(f" Total similar negative pairs: {pairs_neg.count()}")
print(f" similar positive pairs (0.8 - 0.95): {mod_pos.count()}")
print(f" similar negative pairs (0.8 - 0.95): {mod_neg.count()}")

# Examples: the five most similar pairs of each group
print("\n 5 most similar positive pairs:")
pairs_pos.orderBy(col("similarity").desc()).show(5, truncate=False)

print("\n 5 most similar negative pairs:")
pairs_neg.orderBy(col("similarity").desc()).show(5, truncate=False)

print("\n 5 moderately similar positive pairs:")
mod_pos.orderBy(col("similarity").desc()).show(5, truncate=False)

print("\n 5 moderately similar negative pairs:")
mod_neg.orderBy(col("similarity").desc()).show(5, truncate=False)

Coppie simili positive: 6291
Coppie simili negative: 1366

📋 TOP 5 coppie positive simili:
+----------+----------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------