# 🍽️ Hybrid Restaurant Recommendation System using PySpark


This notebook implements a **hybrid recommendation system** using the Yelp dataset filtered for Philadelphia.  
We combine collaborative filtering (ALS model) with content-based filtering (TF-IDF on business categories), enhanced by cosine similarity for hybrid re-ranking.



| Step              | Purpose                                                     |
|-------------------|-------------------------------------------------------------|
| **ALS Model**     | Learns collaborative signals (user–business preferences)    |
| **TF-IDF**        | Captures business similarity based on categories            |
| **Cosine Similarity** | Matches recommended businesses with user's taste       |
| **Hybrid Scoring**| Combines both to boost recommendation relevance             |


In [1]:
import sys
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode, collect_list, concat_ws, avg
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, Normalizer, StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import utils.config as config

os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-22'
os.environ['SPARK_HOME'] = r'D:\Aditya\Graduate Materials\Spring 2026\Big Data\Project\spark'
# os.environ['SPARK_HOME'] = config.APP
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python3'
os.environ['PYSPARK_PYTHON'] = sys.executable 

# Spark config 
spark = SparkSession.builder \
    .appName("HybridRecommender") \
    .config("spark.python.worker.reuse", "false") \
    .config("spark.network.timeout", "800s") \
    .getOrCreate()


# Load data

review_df = spark.read.json(config.PHILADELPHIA)
user_df = spark.read.json(config.USER)


In [2]:
# Extract business metadata
business_df = review_df.select("business_id", "categories", "business_stars").dropna(subset=["categories"])
business_df = business_df.withColumn("category", explode(split(col("categories"), ",\\s*")))

business_grouped = business_df.groupBy("business_id", "business_stars") \
    .agg(concat_ws(" ", collect_list("category")).alias("category_text"))

# TF-IDF pipeline
tokenizer = Tokenizer(inputCol="category_text", outputCol="words")
vectorizer = CountVectorizer(inputCol="words", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="tfidf_features")
normalizer = Normalizer(inputCol="tfidf_features", outputCol="norm_features")

pipeline = Pipeline(stages=[tokenizer, vectorizer, idf, normalizer])
tfidf_model = pipeline.fit(business_grouped)
tfidf_business = tfidf_model.transform(business_grouped).select("business_id", "business_stars", "norm_features")

In [3]:
business_df.head()

Row(business_id='-0eUa8TsXFFy0FCxHYmrjg', categories='Caterers, Sandwiches, Delis, Restaurants, Cafes, Event Planning & Services', business_stars=4.0, category='Caterers')

In [4]:

# Index users and businesses
indexer_user = StringIndexer(inputCol="user_id", outputCol="user_index")
indexer_business = StringIndexer(inputCol="business_id", outputCol="business_index")
indexed_model = Pipeline(stages=[indexer_user, indexer_business]).fit(review_df)

indexed_data = indexed_model.transform(review_df).select("user_index", "business_index", "business_stars")

# Train ALS model
als = ALS(userCol="user_index", itemCol="business_index", ratingCol="business_stars",
          coldStartStrategy="drop", nonnegative=True, implicitPrefs=False,
          rank=10, maxIter=10, regParam=0.1)
als_model = als.fit(indexed_data)


In [5]:

# Top-N ALS
user_recs = als_model.recommendForAllUsers(10)
recs_exploded = user_recs.withColumn("rec", explode("recommendations")) \
    .select("user_index", col("rec.business_index").alias("business_index"), col("rec.rating").alias("als_score"))

# User history (positive feedback)
user_history = review_df.select("user_id", "business_id", "business_stars").filter("business_stars >= 4")
user_history = indexed_model.transform(user_history).select("user_index", "business_index", "business_id")

# Join TF-IDF
als_with_tfidf = recs_exploded.join(tfidf_business, recs_exploded.business_index == tfidf_business.business_id)
user_history_with_tfidf = user_history.join(tfidf_business, "business_id")


In [6]:

from pyspark.ml.linalg import DenseVector
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.sql.functions import udf

# Convert VectorUDT to array
vector_to_array_udf = udf(lambda v: v.toArray().tolist() if v is not None else [], ArrayType(DoubleType()))
als_with_tfidf = als_with_tfidf.withColumn("als_array", vector_to_array_udf("norm_features"))
user_history_with_tfidf = user_history_with_tfidf.withColumn("uh_array", vector_to_array_udf("norm_features"))

# Cross join user history with recommendations
user_cross = user_history_with_tfidf.alias("uh").join(
    als_with_tfidf.alias("als"),
    col("uh.user_index") == col("als.user_index")
).select(
    col("als.user_index").alias("user_index"),
    col("als.business_id").alias("rec_business"),
    col("als.als_score"),
    col("uh.business_id").alias("hist_business"),
    col("uh.uh_array"),
    col("als.als_array")
)

# Compute cosine similarity (dot product since vectors are normalized)
user_cross = user_cross.withColumn(
    "similarity",
    F.expr("aggregate(zip_with(uh_array, als_array, (x, y) -> x * y), 0D, (acc, x) -> acc + x)")
)


In [7]:

# Average similarity score
avg_sim = user_cross.groupBy("user_index", "rec_business", "als_score") \
    .agg(avg("similarity").alias("avg_similarity"))

# Combine ALS score and similarity
final_recs = avg_sim.withColumn(
    "final_score", 0.7 * col("als_score") + 0.3 * col("avg_similarity")
).orderBy("user_index", col("final_score").desc())

final_recs.show()


+----------+------------+---------+--------------+-----------+
|user_index|rec_business|als_score|avg_similarity|final_score|
+----------+------------+---------+--------------+-----------+
+----------+------------+---------+--------------+-----------+

