# KNN Business Similarity Model using PySpark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, collect_list
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, BucketedRandomProjectionLSH
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
import utils.config as config
import sys
import os

In [2]:
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-22'
os.environ['SPARK_HOME'] = config.APP
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python3'
os.environ['PYSPARK_PYTHON'] = sys.executable 

# Initialize Spark session
spark = SparkSession.builder.appName("KNNBusinessSimilarityModel").getOrCreate()

In [3]:
%%time

# Load review data
review_spark_df = spark.read.json(config.PHILADELPHIA)  # Replace with actual file path
review_spark_df.select("name", "categories").show(5, truncate=False)

# Group by business name and collect category text
business_reviews = review_spark_df.groupBy("name") \
    .agg(concat_ws(" ", collect_list("categories")).alias("categories"))
business_reviews.show(5, truncate=False)

+------------------------------+--------------------------------------------------------------------------+
|name                          |categories                                                                |
+------------------------------+--------------------------------------------------------------------------+
|Waterfront Gourmet Cafe & Deli|Caterers, Sandwiches, Delis, Restaurants, Cafes, Event Planning & Services|
|Waterfront Gourmet Cafe & Deli|Caterers, Sandwiches, Delis, Restaurants, Cafes, Event Planning & Services|
|Waterfront Gourmet Cafe & Deli|Caterers, Sandwiches, Delis, Restaurants, Cafes, Event Planning & Services|
|Waterfront Gourmet Cafe & Deli|Caterers, Sandwiches, Delis, Restaurants, Cafes, Event Planning & Services|
|Waterfront Gourmet Cafe & Deli|Caterers, Sandwiches, Delis, Restaurants, Cafes, Event Planning & Services|
+------------------------------+--------------------------------------------------------------------------+
only showing top 5 rows

+--

In [4]:
# Define TF-IDF feature extraction pipeline
tokenizer = Tokenizer(inputCol="categories", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
model = pipeline.fit(business_reviews)
tfidf_result = model.transform(business_reviews)
feature_df = tfidf_result.select("name", "features")

In [5]:
# Train LSH model on TF-IDF features
lsh = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", numHashTables=5, bucketLength=2.0)
lsh_model = lsh.fit(feature_df)

In [6]:
def recommend_similar_businesses_by_name(business_name, business_reviews, lsh_model, pipeline_model, feature_df, top_n=5):
    # Step 1: Get category string for the given business name
    category_row = business_reviews.filter(business_reviews.name == business_name).select("categories").collect()
    
    if not category_row:
        print(f"No business found with name: {business_name}")
        return None

    category_input = category_row[0]["categories"]

    # Step 2: Create DataFrame with categories
    input_df = spark.createDataFrame([(category_input,)], ["categories"])

    # Step 3: Transform with pipeline to get TF-IDF features
    input_transformed = pipeline_model.transform(input_df)
    input_feature = input_transformed.select("features").first()["features"]

    # Step 4: Perform LSH similarity search
    similar = lsh_model.approxNearestNeighbors(dataset=feature_df, key=input_feature, numNearestNeighbors=top_n + 1)  # +1 to exclude self

    # Step 5: Filter out the original business itself
    similar_filtered = similar.filter(similar.name != business_name)

    return similar_filtered.select("name", "distCol")


In [13]:
%%time
recommend_similar_businesses_by_name("Subway", business_reviews, lsh_model, model, feature_df).show(truncate=False)

+------------------+------------------+
|name              |distCol           |
+------------------+------------------+
|Jersey Mike's Subs|452.42292812468804|
|Beefsteak         |557.5199384417533 |
|Quiznos           |576.7804962109332 |
|Eat At Joe's      |599.9883056837965 |
|Auntie Anne's     |611.5830272706266 |
+------------------+------------------+

CPU times: total: 0 ns
Wall time: 13.8 s
