## Loading the data

In [1]:
import numpy as np
import pandas as pd
import ast
import os
import zipfile

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace
from pyspark.ml.feature import MinHashLSH, HashingTF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

In [None]:

os.environ['KAGGLE_USERNAME'] = "aidanaakkaziyeva"
os.environ['KAGGLE_KEY'] = "609e0e320a0900d9d1865319a498c843"
!kaggle datasets download -d mohamedbakhet/amazon-books-reviews

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0


In [5]:
# Define the zip file name
zip_file = "amazon-books-reviews.zip"

# Extract all contents
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall("amazon-books-reviews")  # Extracts into a folder

In [6]:
os.listdir("amazon-books-reviews")

['books_data.csv', 'Books_rating.csv']

In [3]:
from pyspark.sql import SparkSession

In [4]:
os.environ["PYSPARK_PYTHON"] = r"C:\Users\ACER\anaconda3\envs\pyspark310\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\ACER\anaconda3\envs\pyspark310\python.exe"

In [5]:
spark = SparkSession.builder \
    .appName("BooksReviewSimilarity") \
    .master("local[*]") \
    .getOrCreate()

In [6]:
#books_data = spark.read.csv("amazon-books-reviews/books_data.csv", header=True, inferSchema=True)
books_rating = spark.read.csv("amazon-books-reviews/Books_rating.csv", header=True, inferSchema=True)

## Preprocessing the Books Data

In [7]:
#books_data.show(5)

In [8]:
# how many rows and columns in the dataset
#books_data.count(), len(books_data.columns)

In [9]:
# keep only 1,3,7 columns
#books_data = books_data.select("Title", "authors")

In [10]:
books_rating.show(5)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|"Jim of Oz ""jim-...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|
|0826414346|Dr. Seuss: Americ...| NULL|A30TK6U7DNS82R|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|
|0826414346|Dr. Seuss: Americ...| NULL|A3UH4UZ4RSVO82|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|"If people become...|
|0826414346|Dr. Seuss: Ameri

In [11]:
books_rating.count(), len(books_rating.columns)

(3000000, 10)

In [12]:
books_rating = books_rating.filter(books_rating["review/text"].isNotNull())

In [13]:
from pyspark.sql.functions import length, avg

# Compute average length of 'review/text' and 'review/summary'
books_rating.select(
    avg(length("review/text")).alias("avg_text_length"),
    avg(length("review/summary")).alias("avg_summary_length")
).show()

+----------------+------------------+
| avg_text_length|avg_summary_length|
+----------------+------------------+
|613.245675188011|27.950346992702137|
+----------------+------------------+



In [14]:
# keep only 2,4,10 columns
books_rating = books_rating.select("Title", "User_id", "review/text")

In [15]:
# subsample the dataset (1%)
books_rating = books_rating.sample(0.01, seed=42)
books_rating.count(), len(books_rating.columns)

(30332, 3)

In [16]:
books_rating.show(5)

+--------------------+--------------+--------------------+
|               Title|       User_id|         review/text|
+--------------------+--------------+--------------------+
|King James: Belie...|          NULL|"LeBron James is ...|
|Open marriage;: A...|A3KBF2S2MGN48O|This book is a cl...|
|Night World: Daug...|          NULL|The plot and char...|
|The soul of man u...| AMVC9WTXYKNJ1|Although the titl...|
|Close to Home Rev...|A3QXDJQIHNGKMH|This book is the ...|
+--------------------+--------------+--------------------+
only showing top 5 rows



## clean

In [17]:
df_clean = books_rating.withColumn("clean_review", lower(regexp_replace(col("review/text"), "[^a-zA-Z0-9\\s]", "")))

In [38]:
df_clean.show(5)

+--------------------+--------------+--------------------+--------------------+
|               Title|       User_id|         review/text|        clean_review|
+--------------------+--------------+--------------------+--------------------+
|King James: Belie...|          NULL|"LeBron James is ...|lebron james is a...|
|Open marriage;: A...|A3KBF2S2MGN48O|This book is a cl...|this book is a cl...|
|Night World: Daug...|          NULL|The plot and char...|the plot and char...|
|The soul of man u...| AMVC9WTXYKNJ1|Although the titl...|although the titl...|
|Close to Home Rev...|A3QXDJQIHNGKMH|This book is the ...|this book is the ...|
+--------------------+--------------+--------------------+--------------------+
only showing top 5 rows



## shingling

In [18]:
# UDF to generate character-level 3-grams (shingles)
def generate_shingles(text, k=3):
    if text is None:
        return []
    text = text.replace(" ", "")  # optional: remove spaces for tighter shingles
    return [text[i:i+k] for i in range(len(text) - k + 1)]

shingle_udf = udf(generate_shingles, ArrayType(StringType()))

df_shingled = df_clean.withColumn("shingles", shingle_udf(col("clean_review")))

In [39]:
df_shingled.show(5)

+--------------------+--------------+--------------------+--------------------+--------------------+
|               Title|       User_id|         review/text|        clean_review|            shingles|
+--------------------+--------------+--------------------+--------------------+--------------------+
|King James: Belie...|          NULL|"LeBron James is ...|lebron james is a...|[leb, ebr, bro, r...|
|Open marriage;: A...|A3KBF2S2MGN48O|This book is a cl...|this book is a cl...|[thi, his, isb, s...|
|Night World: Daug...|          NULL|The plot and char...|the plot and char...|[the, hep, epl, p...|
|The soul of man u...| AMVC9WTXYKNJ1|Although the titl...|although the titl...|[alt, lth, tho, h...|
|Close to Home Rev...|A3QXDJQIHNGKMH|This book is the ...|this book is the ...|[thi, his, isb, s...|
+--------------------+--------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [19]:
# Hash the shingles into features using HashingTF
hasher = HashingTF(inputCol="shingles", outputCol="features", numFeatures=1000)
df_hashed = hasher.transform(df_shingled)

In [42]:
df_hashed.show(5)

+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|               Title|       User_id|         review/text|        clean_review|            shingles|            features|
+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|King James: Belie...|          NULL|"LeBron James is ...|lebron james is a...|[leb, ebr, bro, r...|(1000,[2,4,6,8,9,...|
|Open marriage;: A...|A3KBF2S2MGN48O|This book is a cl...|this book is a cl...|[thi, his, isb, s...|(1000,[0,1,2,3,4,...|
|Night World: Daug...|          NULL|The plot and char...|the plot and char...|[the, hep, epl, p...|(1000,[2,4,8,17,1...|
|The soul of man u...| AMVC9WTXYKNJ1|Although the titl...|although the titl...|[alt, lth, tho, h...|(1000,[0,3,8,11,1...|
|Close to Home Rev...|A3QXDJQIHNGKMH|This book is the ...|this book is the ...|[thi, his, isb, s...|(1000,[0,7,12,17,...|
+--------------------+--

In [None]:
from pyspark.sql.functions import length, col, trim

df_filtered = df_hashed.filter(length(trim(col("review/text"))) > 0)


In [57]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import DoubleType
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

# Define UDF that returns the number of non-zero values in the vector
@udf(returnType=IntegerType())
def count_nonzero(v):
    return int(v.numNonzeros())  # works for both SparseVector and DenseVector

# Apply it
books_rating_filtered = df_hashed.withColumn("nonZero", count_nonzero("features")) \
                                 .filter("nonZero > 0") \
                                 .drop("nonZero")

In [58]:
# Initialize MinHashLSH
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=3)
model = mh.fit(books_rating_filtered)

In [59]:
# Approximate similarity join within the same dataset
similar_reviews = model.approxSimilarityJoin(
    books_rating_filtered,
    books_rating_filtered,
    threshold=0.6,  # Jaccard distance threshold
    distCol="JaccardDistance"
)

In [60]:
# Optional: filter out self-matches (a review matched with itself)
similar_reviews = similar_reviews.filter(col("datasetA.review/text") != col("datasetB.review/text"))

In [61]:
# Show results
similar_reviews.select("datasetA.review/text", "datasetB.review/text", "JaccardDistance").show(truncate=False)

Py4JJavaError: An error occurred while calling o598.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 26.0 failed 1 times, most recent failure: Lost task 1.0 in stage 26.0 (TID 133) (100.65.113.22 executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`LSHModel$$Lambda$3894/0x00000001015f7840`: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => array<struct<type:tinyint,size:int,indices:array<int>,values:array<double>>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1160)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1176)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1214)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)
Caused by: java.lang.IllegalArgumentException: requirement failed: Must have at least 1 non zero entry.
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.feature.MinHashLSHModel.hashFunction(MinHashLSH.scala:61)
	at org.apache.spark.ml.feature.LSHModel.$anonfun$transform$1(LSH.scala:99)
	... 20 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`LSHModel$$Lambda$3894/0x00000001015f7840`: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => array<struct<type:tinyint,size:int,indices:array<int>,values:array<double>>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1160)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1176)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1214)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)
Caused by: java.lang.IllegalArgumentException: requirement failed: Must have at least 1 non zero entry.
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.feature.MinHashLSHModel.hashFunction(MinHashLSH.scala:61)
	at org.apache.spark.ml.feature.LSHModel.$anonfun$transform$1(LSH.scala:99)
	... 20 more
