## Loading the data

In [1]:
import numpy as np
import pandas as pd
import ast
import os
import zipfile

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace
from pyspark.ml.feature import MinHashLSH, HashingTF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

In [None]:

os.environ['KAGGLE_USERNAME'] = "aidanaakkaziyeva"
os.environ['KAGGLE_KEY'] = "609e0e320a0900d9d1865319a498c843"
!kaggle datasets download -d mohamedbakhet/amazon-books-reviews

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0


In [5]:
# Define the zip file name
zip_file = "amazon-books-reviews.zip"

# Extract all contents
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall("amazon-books-reviews")  # Extracts into a folder

In [6]:
os.listdir("amazon-books-reviews")

['books_data.csv', 'Books_rating.csv']

In [3]:
from pyspark.sql import SparkSession

In [4]:
os.environ["JAVA_HOME"] = "C:\Program Files\Java\jdk-11"  # Replace with your Java installation path

In [5]:
os.environ["PYSPARK_PYTHON"] = r"C:\Users\ACER\anaconda3\envs\pyspark310\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\ACER\anaconda3\envs\pyspark310\python.exe"

In [6]:
spark = SparkSession.builder \
    .appName("BooksReviewSimilarity") \
    .master("local[*]") \
    .getOrCreate()

In [7]:
#books_data = spark.read.csv("amazon-books-reviews/books_data.csv", header=True, inferSchema=True)
books_rating = spark.read.csv("amazon-books-reviews/Books_rating.csv", header=True, inferSchema=True)

## Preprocessing the Books Data

In [8]:
#books_data.show(5)

In [9]:
# how many rows and columns in the dataset
#books_data.count(), len(books_data.columns)

In [10]:
# keep only 1,3,7 columns
#books_data = books_data.select("Title", "authors")

In [11]:
books_rating.show(5)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|"Jim of Oz ""jim-...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|
|0826414346|Dr. Seuss: Americ...| NULL|A30TK6U7DNS82R|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|
|0826414346|Dr. Seuss: Americ...| NULL|A3UH4UZ4RSVO82|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|"If people become...|
|0826414346|Dr. Seuss: Ameri

In [12]:
books_rating.count(), len(books_rating.columns)

(3000000, 10)

In [13]:
books_rating = books_rating.filter(books_rating["review/text"].isNotNull())

In [14]:
from pyspark.sql.functions import length, avg

# Compute average length of 'review/text' and 'review/summary'
books_rating.select(
    avg(length("review/text")).alias("avg_text_length"),
    avg(length("review/summary")).alias("avg_summary_length")
).show()

+----------------+------------------+
| avg_text_length|avg_summary_length|
+----------------+------------------+
|613.245675188011|27.950346992702137|
+----------------+------------------+



In [15]:
# keep only 2,4,10 columns
books_rating = books_rating.select("Title", "User_id", "review/text")

In [16]:
# subsample the dataset (1%)
books_rating = books_rating.sample(0.005, seed=42)
books_rating.count(), len(books_rating.columns)

(15189, 3)

In [17]:
books_rating.show(5)

+--------------------+--------------+--------------------+
|               Title|       User_id|         review/text|
+--------------------+--------------+--------------------+
|King James: Belie...|          NULL|"LeBron James is ...|
|Open marriage;: A...|A3KBF2S2MGN48O|This book is a cl...|
|Lost Cities of Af...|A1LG9LE8NN47CZ|There is no other...|
|Lincoln reconside...|A1E8EIKF5T05BO|"According to the...|
|Economics in one ...|A3W1J0KZJJPG5J|Teacher no longer...|
+--------------------+--------------+--------------------+
only showing top 5 rows



## clean

In [18]:
import re

In [19]:
# 1. Clean and preprocess the text data
def clean_text(text):
    if text is None:
        return ""
    # Basic cleaning - remove special chars, lowercase, etc.
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text.strip()

clean_udf = udf(clean_text, StringType())
df_clean = books_rating.withColumn("clean_text", clean_udf(col("review/text")))

In [20]:
df_clean = df_clean.filter(col("User_id").isNotNull())

In [21]:
# 2. Create shingles (n-grams)
def create_shingles(text, n=5):
    if len(text) < n:
        return [text]
    return [text[i:i+n] for i in range(len(text) - n + 1)]

shingle_udf = udf(create_shingles, ArrayType(StringType()))
df_shingled = df_clean.withColumn("shingles", shingle_udf(col("clean_text")))

In [22]:
df_shingled.show(5)

+--------------------+--------------+--------------------+--------------------+--------------------+
|               Title|       User_id|         review/text|          clean_text|            shingles|
+--------------------+--------------+--------------------+--------------------+--------------------+
|Open marriage;: A...|A3KBF2S2MGN48O|This book is a cl...|this book is a cl...|[this , his b, is...|
|Lost Cities of Af...|A1LG9LE8NN47CZ|There is no other...|there is no other...|[there, here , er...|
|Lincoln reconside...|A1E8EIKF5T05BO|"According to the...|according to the ...|[accor, ccord, co...|
|Economics in one ...|A3W1J0KZJJPG5J|Teacher no longer...|teacher no longer...|[teach, eache, ac...|
|      Edge of Danger|A3ARWQ6170EOKS|I was a big fan o...|i was a big fan o...|[i was,  was , wa...|
+--------------------+--------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [23]:
# 3. Create feature vectors (using hashing trick)
from pyspark.ml.feature import HashingTF

hashing_tf = HashingTF(inputCol="shingles", outputCol="rawFeatures", numFeatures=1024)
featurized_data = hashing_tf.transform(df_shingled)

In [24]:
featurized_data.show(5)

+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|               Title|       User_id|         review/text|          clean_text|            shingles|         rawFeatures|
+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|Open marriage;: A...|A3KBF2S2MGN48O|This book is a cl...|this book is a cl...|[this , his b, is...|(1024,[0,2,5,7,10...|
|Lost Cities of Af...|A1LG9LE8NN47CZ|There is no other...|there is no other...|[there, here , er...|(1024,[15,17,19,2...|
|Lincoln reconside...|A1E8EIKF5T05BO|"According to the...|according to the ...|[accor, ccord, co...|(1024,[5,7,12,25,...|
|Economics in one ...|A3W1J0KZJJPG5J|Teacher no longer...|teacher no longer...|[teach, eache, ac...|(1024,[9,20,22,31...|
|      Edge of Danger|A3ARWQ6170EOKS|I was a big fan o...|i was a big fan o...|[i was,  was , wa...|(1024,[2,3,6,9,10...|
+--------------------+--

In [25]:
# 4. Apply MinHash
mh = MinHashLSH(inputCol="rawFeatures", outputCol="hashes", numHashTables=5)
model = mh.fit(featurized_data)

In [26]:
# 5. Find similar reviews
similar_reviews = model.approxSimilarityJoin(
    featurized_data, featurized_data, 0.6, distCol="JaccardDistance"
)

In [27]:
# Filter out self-matches and show results
similar_pairs = similar_reviews.filter(
    col("datasetA.Title") != col("datasetB.Title")
).select(
    col("datasetA.Title").alias("title1"),
    col("datasetB.Title").alias("title2"),
    col("JaccardDistance")
)

similar_pairs.show()

Py4JJavaError: An error occurred while calling o153.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 14 in stage 15.0 failed 1 times, most recent failure: Lost task 14.0 in stage 15.0 (TID 110) (100.65.113.22 executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectIterator(SparkPlan.scala:455)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:140)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$2(SQLExecution.scala:224)
	at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:219)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)
