In [10]:
# !hdfs dfs -put data/sample_movie_ratings.txt /sample_movie_ratings.txt

In [26]:
# Code from https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()

# Read the raw interactions file. No schema is supplied or inferred, so Spark
# loads every column as a string and the casts below do the type conversion.
interactions_raw = spark.read.option("header", "true").csv("data/interactions_train.csv")

# Columns are selected by position (0 = user, 1 = recipe, 3 = rating) and
# converted inside Spark rather than row-by-row in a Python lambda, which avoids
# serialising the whole dataset through Python workers.
# Values that cannot be parsed become null instead of raising inside a worker.
raw_cols = interactions_raw.columns
ratings = interactions_raw.select(
    interactions_raw[raw_cols[0]].cast("long").alias("userId"),
    interactions_raw[raw_cols[1]].cast("long").alias("recipeId"),
    interactions_raw[raw_cols[3]].cast("double").alias("rating"),
)

# Lazy RDD views kept under their previous names in case a later cell still
# refers to them; creating them does not trigger a Spark job.
lines = interactions_raw.rdd
ratingsRDD = ratings.rdd

In [27]:
# Sanity check on the load: print the first rows of the parsed ratings
# (show() with no arguments prints the top 20 rows).
ratings.show()

+------+--------+------+
|userId|recipeId|rating|
+------+--------+------+
|  2046|    4684|   5.0|
|  2046|     517|   5.0|
|  1773|    7435|   5.0|
|  1773|     278|   4.0|
|  2046|    3431|   5.0|
|  2046|   13307|   5.0|
|  2312|     780|   5.0|
|  2312|   51964|   5.0|
|  2312|    1232|   4.0|
|  2312|    4397|   5.0|
|  2625|     471|   3.0|
|  2312|     164|   5.0|
|  2999|    3567|   5.0|
|  2178|    3704|   3.0|
|  2178|    4366|   5.0|
|  3794|    7508|   4.0|
|  3794|     191|   5.0|
|  3794|    3525|   5.0|
|  2312|    3651|   5.0|
|  2695|     350|   1.0|
+------+--------+------+
only showing top 20 rows



In [46]:
# Fixed seed so the train/test split and the ALS factor initialisation are
# repeatable across a fresh "Restart & Run All".
SEED = 42
# Regularisation strengths to try, from almost none to very strong.
REG_PARAMS = [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0]

# NOTE(review): regParam is chosen on `test`, so best_rmse is an optimistic
# estimate of generalisation error; consider a separate validation split.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SEED)

# The evaluator does not depend on the model, so build it once outside the loop.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

best_model = None
best_reg = None
# +inf guarantees the first finite RMSE is accepted, whatever the rating scale.
best_rmse = float("inf")
for reg in REG_PARAMS:
    # Build the recommendation model using ALS on the training data
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=reg, userCol="userId", itemCol="recipeId", ratingCol="rating",
              coldStartStrategy="drop", seed=SEED)
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    rmse = evaluator.evaluate(predictions)
    print(reg, "Root-mean-square error = " + str(rmse))
    # A NaN RMSE never compares as smaller, so such a model is never selected.
    if rmse < best_rmse:
        best_model = model
        best_reg = reg
        best_rmse = rmse

    predictions.show()

if best_model is None:
    raise RuntimeError("No ALS model produced a finite RMSE; check the train/test split.")

# Generate top 10 recipe recommendations for each user from the best-scoring
# model (previously this was rebuilt every iteration and ended up tied to the
# last regParam tried rather than the best one).
userRecs = best_model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each recipe
#recipeRecs = best_model.recommendForAllItems(10)

0.001 Root-mean-square error = 6.406360808236339
+----------+--------+------+-----------+
|    userId|recipeId|rating| prediction|
+----------+--------+------+-----------+
|    744898|     271|   5.0|  1.9026216|
|    296050|     271|   5.0|  5.3071175|
|     54678|     271|   5.0|-0.44181085|
|    133174|     375|   5.0| -4.0599713|
|    125458|     375|   5.0|  1.7465479|
|     11297|     580|   4.0|  0.3529343|
|   1366128|     580|   4.0|-0.51955426|
|    127113|     683|   5.0|  3.7365239|
|    171423|     897|   0.0|  -1.157497|
|     32772|     897|   5.0| -0.9658788|
|    234222|     897|   5.0| -12.036407|
|    550834|     916|   1.0|   5.607461|
|    388905|     916|   0.0|-0.36894047|
|    813584|     916|   5.0|   2.509241|
|    909166|     916|   0.0| 0.75496614|
|    934824|     916|   5.0|  3.5210094|
|   1319812|     916|   5.0|  7.6934333|
|    579826|     916|   3.0|  1.5847261|
|2001453193|     916|   0.0| 0.28084683|
|     56463|    1133|   5.0|  0.8118108|
+-------