In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, concat_ws, lower
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.linalg import DenseVector
from pyspark.ml import Pipeline
import numpy as np
import utils.config as config


# Step 1: Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("BusinessSimilarityModel")
    .getOrCreate()
)

In [2]:
# Step 2: Read the review records (JSON) from the path configured as
# config.PHILADELPHIA. Later cells rely on the 'name' and 'text' columns.
review_spark_df = spark.read.json(path=config.PHILADELPHIA)
# Trailing expression so the notebook displays how many rows were read.
review_spark_df.count()

687307

In [3]:
# Step 3: Build one document per business name by joining all of its review
# texts with single spaces. Rows that share a 'name' end up in the same group.
business_reviews = (
    review_spark_df
    .groupBy("name")
    .agg(concat_ws(" ", collect_list("text")).alias("all_reviews"))
)

In [4]:
# Sanity check: number of rows left after grouping the reviews by name.
business_reviews.count()

5074

In [5]:
# Step 4: Turn each business's combined review text into a TF-IDF vector.
# Stages: split into tokens -> drop stop words -> hash tokens into term
# counts -> rescale the counts by inverse document frequency.
tokenizer = Tokenizer(
    inputCol="all_reviews",
    outputCol="words",
)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)
# NOTE(review): 1000 hash buckets looks small for free-text reviews, so
# unrelated words may share a bucket — confirm this size is intended.
hashingTF = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=1000,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

# The pipeline is fitted and applied on the same DataFrame, so the IDF
# weights are learned from the very corpus that is being vectorised.
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
model = pipeline.fit(business_reviews)
tfidf_result = model.transform(business_reviews)

In [6]:
# Step 5: Bring the (name, features) rows back to the driver and index the
# feature vectors by business name for the in-memory similarity lookups below
vector_data = tfidf_result.select("name", "features").collect()
name_vec_map = dict(
    (record['name'], record['features']) for record in vector_data
)

In [7]:
# Step 6: Define cosine similarity function
def cosine_sim(v1, v2):
    """Return the cosine similarity of two vectors as a float.

    Both arguments must provide ``dot(other)`` and ``norm(p)`` methods.
    When either vector has a Euclidean norm of zero the similarity is
    defined here as ``0.0`` rather than dividing by zero.
    """
    inner = float(v1.dot(v2))
    length_1, length_2 = float(v1.norm(2)), float(v2.norm(2))
    if length_1 == 0 or length_2 == 0:
        return 0.0
    return inner / (length_1 * length_2)

In [8]:
# Step 7: Find similar businesses
def find_similar_businesses(input_name, top_n=3):
    """Rank the other businesses by cosine similarity to ``input_name``.

    Looks up the feature vector for ``input_name`` in the notebook-level
    ``name_vec_map`` and scores every other entry with ``cosine_sim``.

    Args:
        input_name: Key to look up in ``name_vec_map``.
        top_n: Maximum number of matches to return. ``None`` returns every
            match; zero or a negative value returns an empty list.

    Returns:
        A list of ``(name, similarity)`` tuples ordered from most to least
        similar, truncated to ``top_n`` entries. If ``input_name`` is not a
        key of ``name_vec_map``, the string
        ``"Business '<input_name>' not found."`` is returned instead.
    """
    input_vec = name_vec_map.get(input_name)
    if input_vec is None:
        # Kept as a string (not an exception) so existing callers keep working.
        return f"Business '{input_name}' not found."

    similarities = [
        (name, cosine_sim(input_vec, vec))
        for name, vec in name_vec_map.items()
        if name != input_name
    ]
    # sorted() is stable, so equally similar businesses keep their map order.
    ranked = sorted(similarities, key=lambda pair: pair[1], reverse=True)

    # A negative slice bound would drop entries from the END of the ranking
    # and hand back almost every business, so clamp it to zero instead.
    if top_n is not None and top_n < 0:
        top_n = 0
    return ranked[:top_n]

In [9]:
# Example usage: print the matches for one business, using the default top_n
print(find_similar_businesses("Real Food Eatery"))

[("Farmer's Keep", 0.8203797184807501), ('Herban Quality Eats', 0.810678538053316), ('Fuel', 0.7689759620913105)]
