In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, BucketedRandomProjectionLSH
from pyspark.ml import Pipeline
import utils.config as config


In [2]:
spark = SparkSession.builder.appName("KNNBusinessSimilarityModel").getOrCreate()


In [3]:
%%time

review_spark_df = spark.read.json(config.PHILADELPHIA)
review_spark_df.select("name", "text").show(5, truncate=False)


+------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
%%time

business_reviews = review_spark_df.groupBy("name") \
    .agg(concat_ws(" ", collect_list("text")).alias("all_reviews"))
# business_reviews.show(5, truncate=False)
# Show just 5 results, and truncate long strings
business_reviews.show(5, truncate=False)

+--------------------+--------------------+
|                name|         all_reviews|
+--------------------+--------------------+
|        1 Stop Pizza|Unexpectedly good...|
|1225Raw Sushi and...|Great sushi place...|
|                1601|Best turkey reube...|
|   2nd Story Brewing|I thought that th...|
|3 Brothers Pizza ...|The best pizza in...|
+--------------------+--------------------+
only showing top 5 rows

CPU times: total: 15.6 ms
Wall time: 8.16 s


In [6]:
%%time

tokenizer = Tokenizer(inputCol="all_reviews", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
model = pipeline.fit(business_reviews)
tfidf_result = model.transform(business_reviews)


CPU times: total: 109 ms
Wall time: 1min 13s


In [10]:
tfidf_result.count()

5074

In [7]:
%%time

feature_df = tfidf_result.select("name", "features")
lsh = BucketedRandomProjectionLSH(
    inputCol="features",
    outputCol="hashes",
    numHashTables=5,
    bucketLength=2.0  # 🔧 Set a value that works with your vector scale
)

lsh_model = lsh.fit(feature_df)
transformed = lsh_model.transform(feature_df)


CPU times: total: 0 ns
Wall time: 212 ms


In [12]:
feature_df.show(5)

+--------------------+--------------------+
|                name|            features|
+--------------------+--------------------+
|        1 Stop Pizza|(1000,[1,5,7,9,11...|
|1225Raw Sushi and...|(1000,[0,1,2,3,4,...|
|                1601|(1000,[0,2,4,7,8,...|
|   2nd Story Brewing|(1000,[0,1,2,3,4,...|
|3 Brothers Pizza ...|(1000,[0,2,3,5,7,...|
+--------------------+--------------------+
only showing top 5 rows



In [11]:
%%time

query_name = "Real Food Eatery"
query_vec = feature_df.filter(col("name") == query_name).collect()

if query_vec:
    neighbors = lsh_model.approxNearestNeighbors(transformed, query_vec[0]['features'], numNearestNeighbors=10)
    neighbors.select("name", "distCol").show(truncate=False)
else:
    print(f"Business '{query_name}' not found.")


+--------------------------+------------------+
|name                      |distCol           |
+--------------------------+------------------+
|Real Food Eatery          |0.0               |
|Farmer's Keep             |136.09511236513654|
|Agno Grill                |139.99290898560818|
|Pure Fare                 |143.50845480511708|
|Fuel                      |146.870395107372  |
|Noble: An American Cookery|149.89536940184456|
|Chloe                     |151.95053535697753|
|Meritage                  |152.30786143905053|
|Rice & Mix                |153.02981552271055|
|Fitler Dining Room        |154.74703677275184|
+--------------------------+------------------+

CPU times: total: 62.5 ms
Wall time: 1min 41s
