<a href="https://colab.research.google.com/github/GHMelany/AMD_project/blob/main/AMD%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PROJECT: Finding similar items — implementing a detector of pairs of similar book reviews.


LIBRARIES

In [1]:
!pip install pyspark



In [2]:
!pip install langid


Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langid
  Building wheel for langid (setup.py) ... [?25l[?25hdone
  Created wheel for langid: filename=langid-1.1.6-py3-none-any.whl size=1941171 sha256=93386bb0cea2fef1219f03423db7999459cdba82d24943b210de0e60b1243178
  Stored in directory: /root/.cache/pip/wheels/3c/bc/9d/266e27289b9019680d65d9b608c37bff1eff565b001c977ec5
Successfully built langid
Installing collected packages: langid
Successfully installed langid-1.1.6


In [3]:
import re, html, langid
import os
import zipfile
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, NGram, HashingTF, MinHashLSH
from pyspark.ml import Pipeline


In [4]:
os.environ['KAGGLE_USERNAME'] = "melanygomez"
os.environ['KAGGLE_KEY'] = "38db1cce93622035560027022e9cafc"

!pip install -q kaggle

from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

!kaggle datasets download -d mohamedbakhet/amazon-books-reviews

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 99% 1.05G/1.06G [00:07<00:00, 208MB/s]
100% 1.06G/1.06G [00:07<00:00, 147MB/s]


In [5]:
# Unpack the downloaded Kaggle archive and locate the ratings CSV.
zip_path = "amazon-books-reviews.zip"
extract_dir = "amazon_books_reviews"
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_dir)

# List every extracted file so the archive contents can be checked by eye.
for root, dirs, files in os.walk(extract_dir):
    for file in files:
        print(os.path.join(root, file))

# Reuse extract_dir instead of repeating the literal, so the two cannot drift.
folder = extract_dir
csv_path = os.path.join(folder, "Books_rating.csv")

# Fail here with a clear message rather than later inside the Spark reader.
if not os.path.isfile(csv_path):
    raise FileNotFoundError(f"Expected ratings file not found: {csv_path}")



amazon_books_reviews/Books_rating.csv
amazon_books_reviews/books_data.csv


In [6]:
# Create the Spark session (or reuse an already running one) for this notebook.
spark = (
    SparkSession.builder
    .appName("ReviewSimilarityPipeline")
    .getOrCreate()
)

In [7]:
# Read the ratings CSV (double quote is both the quote and the escape
# character), keep the id, score and review text, and drop rows without text.
data = (
    spark.read
    .csv(csv_path, header=True, inferSchema=True, quote='"', escape='"')
    .select(
        "Id",
        F.col("review/score").alias("score"),
        F.col("review/text").alias("text"),
    )
    .dropna(subset=["text"])
)

In [8]:
def preprocess(raw):
    """Normalise raw review text for comparison.

    HTML entities are decoded, the text is lower-cased, every run of
    characters other than ``a-z``, ``0-9`` and space is replaced by a single
    space, and whitespace is collapsed and trimmed.

    Args:
        raw: The original review text; may be ``None`` or empty.

    Returns:
        The normalised string, or ``""`` when ``raw`` is falsy.
    """
    if not raw:
        return ""

    lowered = html.unescape(raw).lower()
    # Anything outside lowercase ASCII letters, digits and space becomes a gap.
    alnum_only = re.sub(r"[^a-z0-9 ]+", " ", lowered)
    collapsed = re.sub(r"\s+", " ", alnum_only)
    return collapsed.strip()

def detect_lang(text):
    """Guess the language of a text from its first 50 whitespace-separated words.

    Args:
        text: The text to classify; ``None`` is tolerated so that a null
            column value does not crash the Spark UDF.

    Returns:
        The language code reported by ``langid.classify``, or ``None`` when
        ``text`` is ``None``.
    """
    if text is None:
        return None

    # Only a prefix is classified, which keeps the call cheap on long reviews.
    snippet = " ".join(text.split()[:50])
    lang, _ = langid.classify(snippet)
    return lang

def classify(score):
    """Map a numeric review score to a coarse sentiment label.

    Args:
        score: The review score; ``None`` is tolerated so that a null score
            does not raise inside the Spark UDF.

    Returns:
        ``"positive"`` when ``score`` is greater than 3, ``"negative"``
        otherwise (a score of exactly 3 is labelled negative), or ``None``
        when ``score`` is ``None``.
    """
    if score is None:
        # Comparing None with an int raises TypeError; propagate the null instead.
        return None
    return "positive" if score > 3 else "negative"


In [9]:
# Wrap the plain-Python helpers as Spark UDFs; each one yields a string column.
clean_udf = F.udf(f=preprocess, returnType=StringType())
lang_udf = F.udf(f=detect_lang, returnType=StringType())
sentiment_udf = F.udf(f=classify, returnType=StringType())

In [10]:
# Normalise the text, drop reviews whose normalised text is identical,
# keep only reviews detected as English, then attach the sentiment label.
data = (
    data
    .withColumn("clean_text", clean_udf("text"))
    .dropDuplicates(["clean_text"])
    .withColumn("lang", lang_udf("clean_text"))
    .filter(F.col("lang") == "en")
    .drop("lang")  # helper column, only needed for the filter above
    .withColumn("sentiment", sentiment_udf("score"))
)


In [11]:
# Two-stage text preparation: tokenise clean_text into tokens, then remove
# stop words to obtain content_words.
tokenize = RegexTokenizer(
    inputCol="clean_text",
    outputCol="tokens",
    pattern="\\W+",
)
remove_sw = StopWordsRemover(
    inputCol="tokens",
    outputCol="content_words",
)
prep_pipeline = Pipeline(stages=[tokenize, remove_sw])

# The pipeline is fitted on, and applied to, the same DataFrame.
data = prep_pipeline.fit(data).transform(data)

In [12]:
# Keep reviews with between 20 and 200 content words (both bounds inclusive).
data = (
    data
    .withColumn("length", F.size(F.col("content_words")))
    .filter(F.col("length").between(20, 200))
)

In [13]:
# Narrow the frame to the columns used by the similarity search.
reviews = data.select(
    ["Id", "score", "sentiment", "content_words"]
)

In [None]:
# Count the distinct content words across all reviews.
vocab_size = (
    reviews
    .select(F.explode(F.col("content_words")).alias("token"))
    .distinct()
    .count()
)

# Smallest power of two strictly greater than vocab_size.
# NOTE(review): this is sized from single words, while HashingTF below hashes
# n-gram shingles -- confirm the space is large enough to keep collisions low.
hash_space = 1 << vocab_size.bit_length()

In [None]:
def find_similar_pairs(input_df, shingle_len=3, threshold=0.8, hash_tables=40,
                       num_features=None, seed=42):
    """Find pairs of reviews with similar word-shingle sets using MinHash LSH.

    Each review's ``content_words`` are turned into word n-grams (shingles),
    hashed into a binary feature vector, and compared through an approximate
    Jaccard similarity self-join.

    Args:
        input_df: DataFrame with at least the columns ``Id`` and
            ``content_words`` (array of strings).
        shingle_len: Number of consecutive words per shingle.
        threshold: Minimum Jaccard similarity; the join keeps candidate pairs
            whose Jaccard distance is below ``1 - threshold``.
        hash_tables: Number of MinHash hash tables.
        num_features: Size of the hashing space for ``HashingTF``. When
            ``None``, the notebook-level ``hash_space`` is used, as before.
        seed: Seed for the MinHash hash functions, fixed so that repeated
            runs produce the same candidate pairs.

    Returns:
        DataFrame with columns ``doc1``, ``doc2`` (the ``Id`` of each review)
        and ``similarity`` (``1 - jaccard_dist``).
    """
    if num_features is None:
        # Backward-compatible fallback to the global computed earlier in the notebook.
        num_features = hash_space

    ngram_gen = NGram(n=shingle_len, inputCol="content_words", outputCol="shingles")
    # A review with fewer than shingle_len words yields no shingles and hence an
    # all-zero vector, which MinHashLSH cannot hash; drop such rows up front.
    df_shingled = ngram_gen.transform(input_df).filter(F.size("shingles") > 0)

    # binary=True records shingle presence only, i.e. a set representation.
    tf = HashingTF(inputCol="shingles", outputCol="vector", numFeatures=num_features, binary=True)
    vectorized = tf.transform(df_shingled)

    minhash = MinHashLSH(inputCol="vector", outputCol="signature",
                         numHashTables=hash_tables, seed=seed)
    model = minhash.fit(vectorized)
    transformed = model.transform(vectorized)

    # Self-join: the distance threshold is the complement of the similarity threshold.
    candidates = model.approxSimilarityJoin(transformed, transformed, distCol="jaccard_dist", threshold=1 - threshold)

    # Keeping only A.Id < B.Id removes self-pairs and mirrored duplicates.
    # NOTE(review): this assumes Id uniquely identifies a review; if several
    # reviews share an Id, pairs among them are dropped here -- verify.
    similar = (candidates
               .filter(F.col("datasetA.Id") < F.col("datasetB.Id"))
               .withColumn("similarity", 1 - F.col("jaccard_dist"))
               .select(
                   F.col("datasetA.Id").alias("doc1"),
                   F.col("datasetB.Id").alias("doc2"),
                   "similarity"
               ))

    return similar


In [None]:
# Split the reviews by sentiment label.
is_positive = F.col("sentiment") == "positive"
is_negative = F.col("sentiment") == "negative"
pos_reviews = reviews.filter(is_positive)
neg_reviews = reviews.filter(is_negative)

# Run the same similarity search, with identical settings, on each subset.
pairs_pos, pairs_neg = (
    find_similar_pairs(subset, shingle_len=3, threshold=0.8)
    for subset in (pos_reviews, neg_reviews)
)

In [None]:
# Report how many similar pairs were found in each sentiment group.
# Each count() is a Spark action that runs the similarity join for that group.
positive_summary = f"[✓] Coppie simili tra recensioni positive: {pairs_pos.count()}"
print(positive_summary)
negative_summary = f"[✓] Coppie simili tra recensioni negative: {pairs_neg.count()}"
print(negative_summary)

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim; running it
# triggers both similarity joins a second time. Consider removing it.
pos_line = f"[✓] Coppie simili tra recensioni positive: {pairs_pos.count()}"
print(pos_line)
neg_line = f"[✓] Coppie simili tra recensioni negative: {pairs_neg.count()}"
print(neg_line)