In [1]:
from pyspark.sql import SparkSession
import numpy as np
import os

spark = SparkSession.builder \
        .appName("Hackathon") \
        .master("local[*]") \
        .getOrCreate()

pathing_review = "datasets/review_data/"
arr = np.array(os.listdir(pathing_review))
reviewData_files = pathing_review + arr

pathing_metadata = "datasets/review_metadata/"
arr = np.array(os.listdir(pathing_metadata))
reviewMetadata_files = pathing_metadata + arr

df_review = spark.read.json(list(reviewData_files))
df_metadata = spark.read.json(list(reviewMetadata_files))

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/26 22:13:14 WARN Utils: Your hostname, Asyrafs-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.18.78 instead (on interface en0)
25/08/26 22:13:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/26 22:13:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/26 22:13:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

## DataFrame for Review Data

In [2]:
df_review = df_review.dropna().drop_duplicates()

In [3]:
df_review.count()

                                                                                

548055

In [4]:
df_review.printSchema()

root
 |-- gmap_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- pics: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- rating: long (nullable = true)
 |-- resp: struct (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- time: long (nullable = true)
 |-- text: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)



In [5]:
from pyspark.sql import functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

# =====================================================
# STEP 1: Clean the 'resp.text' field (Business Responses)
# =====================================================
df_review = df_review.withColumn("resp_text", F.col("resp.text"))
# remove (Translated by Google)
df_review = df_review.withColumn("resp_text", F.regexp_replace("resp_text", r'\(Translated by Google\)', ''))
# remove (Original) and everything after
df_review = df_review.withColumn("resp_text", F.regexp_replace("resp_text", r'\(Original\).*', ''))
# replace newlines with spaces
df_review = df_review.withColumn("resp_text", F.regexp_replace("resp_text", r'[\r\n]+', ' '))
# trim extra spaces
df_review = df_review.withColumn("resp_text", F.trim("resp_text"))

# =====================================================
# STEP 2: Clean the 'text' column (Customer Reviews)
# =====================================================
df_review = df_review.withColumn("review_text", F.col("text"))
# remove (Translated by Google) if present
df_review = df_review.withColumn("review_text", F.regexp_replace("review_text", r'\(Translated by Google\)', ''))
# remove (Original) and everything after
df_review = df_review.withColumn("review_text", F.regexp_replace("review_text", r'\(Original\).*', ''))
# replace newlines with spaces
df_review = df_review.withColumn("review_text", F.regexp_replace("review_text", r'[\r\n]+', ' '))
# trim extra spaces
df_review = df_review.withColumn("review_text", F.trim("review_text"))

# =====================================================
# STEP 3: Tokenization for Customer Reviews
# =====================================================
review_tokenizer = RegexTokenizer(inputCol="review_text", outputCol="review_tokens", pattern="\\W")
df_tokens = review_tokenizer.transform(df_review)

# =====================================================
# STEP 4: Tokenization for Business Responses
# =====================================================
resp_tokenizer = RegexTokenizer(inputCol="resp_text", outputCol="resp_tokens", pattern="\\W")
df_tokens = resp_tokenizer.transform(df_tokens)

# =====================================================
# STEP 5: Stopword Removal for Customer Reviews
# =====================================================
review_remover = StopWordsRemover(inputCol="review_tokens", outputCol="review_filtered_tokens")
df_clean = review_remover.transform(df_tokens)

# =====================================================
# STEP 6: Stopword Removal for Business Responses
# =====================================================
resp_remover = StopWordsRemover(inputCol="resp_tokens", outputCol="resp_filtered_tokens")
df_clean = resp_remover.transform(df_clean)

# =====================================================
# STEP 7: TF-IDF for Customer Reviews
# =====================================================
from pyspark.ml.feature import HashingTF, IDF

# HashingTF for customer reviews
review_hashingTF = HashingTF(inputCol="review_filtered_tokens", outputCol="review_rawFeatures")
review_featurized = review_hashingTF.transform(df_clean)

# IDF for customer reviews
review_idf = IDF(inputCol="review_rawFeatures", outputCol="review_features")
review_idfModel = review_idf.fit(review_featurized)
df_with_review_features = review_idfModel.transform(review_featurized)

# =====================================================
# STEP 8: TF-IDF for Business Responses
# =====================================================
# HashingTF for business responses
resp_hashingTF = HashingTF(inputCol="resp_filtered_tokens", outputCol="resp_rawFeatures")
resp_featurized = resp_hashingTF.transform(df_with_review_features)

# IDF for business responses
resp_idf = IDF(inputCol="resp_rawFeatures", outputCol="resp_features")
resp_idfModel = resp_idf.fit(resp_featurized)
df_final = resp_idfModel.transform(resp_featurized)


                                                                                

In [6]:
# =====================================================
# Final Result - Both columns with TF-IDF features
# =====================================================
df_final.select("review_text", "review_features", "resp_text", "resp_features").show()

25/08/26 22:20:02 WARN DAGScheduler: Broadcasting large task binary with size 8.1 MiB
[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|         review_text|     review_features|           resp_text|       resp_features|
+--------------------+--------------------+--------------------+--------------------+
|The boat is exqui...|(262144,[18176,22...|Thank you Laurie!...|(262144,[8538,155...|
|My fiance's logbo...|(262144,[7994,124...|thanks Megan ,I d...|(262144,[57938,64...|
|Robert Valintine ...|(262144,[19546,19...|We appreciate the...|(262144,[69926,12...|
|Wonderful,  very ...|(262144,[6213,141...|        Thank you!!!|(262144,[81783],[...|
|WILL NOT SHOP HER...|(262144,[161,1164...|Hi Chris. Sorry t...|(262144,[3910,315...|
|We did my daughte...|(262144,[3928,497...|Hello Angela. Tha...|(262144,[21823,43...|
|Wonderful custome...|(262144,[15585,43...|Thank you very mu...|(262144,[61899,76...|
|Thanks Eddie for ...|(262144,[43237,55...|         Thanks jake|(262144,[64358,74...|
|Great place to ge...|(262144,[11018,32...|Thanks for 

                                                                                

## DataFrame Review Metadata

In [14]:
df_metadata.show(truncate=12)

+------------+------------+----------+------------+-----------+------------+------------+------------+------------+------------+--------------+-----+----------------+------------+------------+
|        MISC|     address|avg_rating|    category|description|     gmap_id|       hours|    latitude|   longitude|        name|num_of_reviews|price|relative_results|       state|         url|
+------------+------------+----------+------------+-----------+------------+------------+------------+------------+------------+--------------+-----+----------------+------------+------------+
|{[Wheelch...|NTK OUTDO...|       5.0|[Corporat...|       NULL|0x88d9beb...|[[Thursda...|  25.7952041| -80.3660381| NTK OUTDOOR|            35| NULL|    [0x88d9bf...|Open ⋅ Cl...|https://w...|
|        NULL|Cruises I...|       5.0|[Cruise a...|       NULL|0x88c2e49...|[[Thursda...|27.832186...|  -82.704805|Cruises I...|             2| NULL|    [0x88c2fd...|Open ⋅ Cl...|https://w...|
|        NULL|Seminole ...|       3

In [8]:
selected_metadata_columns_df = df_metadata.withColumnRenamed("name", "business_name").select(["gmap_id", "business_name", "address", "avg_rating", "category", "MISC"])

## Final DataFrame

In [9]:
final_df = df_final.join(selected_metadata_columns_df, on="gmap_id", how="left")

In [10]:
final_df.count()

                                                                                

548277

In [11]:
# spark.stop()