In [1]:
# !hdfs dfs -put data/sample_movie_ratings.txt /sample_movie_ratings.txt

In [2]:
# Code from https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

lines = spark.read.option("header", "true").csv("data/interactions_train.csv").rdd
ratingsRDD = lines.map(lambda p: Row(userId=int(p[0]), recipeId=int(p[1]),
                                     rating=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

In [3]:
(unnorm_training, unnorm_test) = ratings.randomSplit([0.8, 0.2])
mean = unnorm_training.agg({'rating': 'mean'}).collect()[0][0]
std = unnorm_training.agg({'rating': 'std'}).collect()[0][0]
print(mean, std)
training = unnorm_training.withColumn("rating", (col("rating") - mean) / std)
test = unnorm_test.withColumn("rating", (col("rating") - mean) / std)

4.574004675881289 0.9598328873474449


In [5]:
best_model = None
best_rmse = 100
for reg in [0.05, 0.075, 0.1, 0.125]:
    # Build the recommendation model using ALS on the training data
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(rank=200, maxIter=20, regParam=reg, userCol="userId", itemCol="recipeId", ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    normalized_predictions = model.transform(test)
    predictions = normalized_predictions.withColumn(
        "rating",col("rating") * std + mean
    ).withColumn(
        "prediction",col("prediction") * std + mean
    )
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")

    rmse = evaluator.evaluate(predictions)
    print(alpha, reg, rmse)
    if rmse < best_rmse:
        #print(f"Best model -> rank:{rank}, reg:{reg}, rmse:{rmse}")
        predictions.show()
        best_model = model
        best_rmse = rmse

    # Generate top 10 movie recommendations for each user
    #userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie
    #recipeRecs = model.recommendForAllItems(10)

0.5 0.05 0.9398634118962444
+------+--------+------+------------------+
|userId|recipeId|rating|        prediction|
+------+--------+------+------------------+
|273254|      85|   5.0| 4.627255149299202|
|209255|     243|   5.0| 4.593806207282826|
| 54678|     271|   5.0|  4.68052124118315|
|107504|     271|   5.0| 4.550919640897223|
|296050|     271|   5.0| 4.761063281983651|
|111075|     271|   5.0|  4.61544770339828|
|133174|     375|   5.0| 4.665852312091676|
|  2625|     471|   3.0|  4.43284436485057|
|  6164|     593|   5.0| 4.100138299612664|
|364211|     593|   5.0| 4.531807864292713|
|599450|     626|   5.0| 4.777289366417691|
| 87023|     642|   5.0| 4.660708058506069|
|297913|     642|   3.0| 4.579805231318959|
| 34146|     642|   5.0| 4.438001562311417|
|232669|     673|   4.0|  4.61572467014749|
| 32772|     897|   5.0|   4.5604803581231|
|  4470|     897|   5.0|  4.50477812790688|
|  5060|     897|   5.0| 4.518532915475705|
|540346|     916|   5.0| 4.727687807186224|
| 12

In [None]:
best_model.rank