In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
import utils.config as config
from pyspark.ml.feature import StringIndexer

import sys
import os
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-17'
os.environ['SPARK_HOME'] = config.APP
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python3'
os.environ['PYSPARK_PYTHON'] = sys.executable 



spark = SparkSession.builder \
    .appName("SparkALSRecommendation") \
    .getOrCreate()

ratings = spark.read.json(config.PHILADELPHIA)  # or config.USER if defined
ratings = ratings.select("user_id", "name", "business_stars").dropna()
ratings = ratings.filter(ratings.business_stars > 0)
ratings.printSchema()
ratings.show(5)

# 🔹 Index users and restaurants
user_indexer = StringIndexer(inputCol="user_id", outputCol="userIndex")
item_indexer = StringIndexer(inputCol="name", outputCol="itemIndex")
ratings = user_indexer.fit(ratings).transform(ratings)
ratings = item_indexer.fit(ratings).transform(ratings)
from pyspark.ml.feature import StringIndexer

(train_data, test_data) = ratings.randomSplit([0.8, 0.2], seed=42)

als = ALS(
    userCol="userIndex",
    itemCol="itemIndex",
    ratingCol="business_stars",
    coldStartStrategy="drop",   # drops NaNs during prediction
    nonnegative=True,
    implicitPrefs=False
)


model = als.fit(train_data)


predictions = model.transform(test_data)

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="business_stars",  # ✅ this matches your column
    predictionCol="prediction"
)


rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- business_stars: double (nullable = true)

+--------------------+--------------------+--------------+
|             user_id|                name|business_stars|
+--------------------+--------------------+--------------+
|sqkiFAnk4gmL1LYmZ...|Waterfront Gourme...|           4.0|
|7RuSAc-Mslk4aizXX...|Waterfront Gourme...|           4.0|
|xa0aM4h8FZYGHAFZ0...|Waterfront Gourme...|           4.0|
|QAVwfV8qy6meUjIZp...|Waterfront Gourme...|           4.0|
|nmmaBI8t0JN4Hkxay...|Waterfront Gourme...|           4.0|
+--------------------+--------------------+--------------+
only showing top 5 rows

Root-mean-square error = 0.2697022213796745


In [2]:
from pyspark.sql.functions import col, explode

def get_named_recommendations(user_recs_df, ratings_df, top_n=5):
    """
    Converts ALS recommendations to include business names.

    Parameters:
    - user_recs_df: DataFrame from ALS.recommendForAllUsers(n)
    - ratings_df: Original ratings DataFrame with 'itemIndex' and 'name'
    - top_n: Number of recommendations per user

    Returns:
    - DataFrame with columns: userIndex, itemIndex, predicted_rating, name
    """

    # Flatten recommendation array
    exploded = user_recs_df.select(
        "userIndex", explode("recommendations").alias("rec")
    )

    recs_flat = exploded.select(
        "userIndex",
        col("rec.itemIndex").alias("itemIndex"),
        col("rec.rating").alias("predicted_rating")
    )

    # Create lookup table for itemIndex -> name
    item_lookup = ratings_df.select("itemIndex", "name").dropDuplicates(["itemIndex"])

    # Join to get business names
    recs_named = recs_flat.join(item_lookup, on="itemIndex", how="left")

    return recs_named.orderBy("userIndex", col("predicted_rating").desc())


In [4]:
user_recs = model.recommendForAllUsers(10)
recs_named = get_named_recommendations(user_recs, ratings)
recs_named.show(truncate=False)

+---------+---------+----------------+---------------------------------------+
|itemIndex|userIndex|predicted_rating|name                                   |
+---------+---------+----------------+---------------------------------------+
|3879     |0        |5.5633          |Mommy Telly's Famous BBQ               |
|4414     |0        |5.2697215       |Dimitrio & Frida's Food Cart           |
|4981     |0        |5.184526        |Kaku Latin Food                        |
|4810     |0        |5.12578         |Jamils Cafe On Wheels                  |
|4371     |0        |5.1009526       |The Galley                             |
|4626     |0        |5.0736203       |Bravo's Pizza                          |
|4833     |0        |5.035798        |Maplewood Nutrition & Dietary Food Shop|
|4681     |0        |5.0175567       |Luigi's Pizzaria                       |
|5004     |0        |4.997393        |Meatheads Deli                         |
|4800     |0        |4.9683256       |George's Corne