In [1]:
import os

In [12]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [3]:
# Obtain the context and session through their get-or-create entry points so
# that re-running this cell reuses what is already running instead of wrapping
# the context in a brand-new SparkSession object each time.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [4]:
# Read the ratings CSV (first row is the header, column types are inferred),
# mark the frame for persistence, then print the row count and a 5-row preview.
ratings_path = os.path.join("data", "userId_movieId_title_ratings.csv")
ratings = (
    ss.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(ratings_path)
)

ratings.persist()
row_count = ratings.count()
print("{0:,}".format(row_count))
ratings.show(5)

2,020,165
+--------+------+-------+--------------------+------+------------------+
|ratingId|userId|movieId|               title|rating|   reliable_rating|
+--------+------+-------+--------------------+------+------------------+
|  690318|  4581|      1|    Toy Story (1995)|   3.5|3.4679791510869786|
|  690319|  4581|      2|      Jumanji (1995)|   2.5|2.4771279650621274|
|  690320|  4581|     17|Sense and Sensibi...|   4.0| 3.963404744099404|
|  690321|  4581|     19|Ace Ventura: When...|   3.5|3.4679791510869786|
|  690322|  4581|     32|Twelve Monkeys (a...|   4.0| 3.963404744099404|
+--------+------+-------+--------------------+------+------------------+
only showing top 5 rows



### ALS model

<img src="misc/matrix_factorization.png" alt="Matrix factorization">

In [5]:
def evaluate_ALS(train, test, kwargs):
    """Tune an ALS model on ``train`` and print its RMSE on both splits.

    A grid over ``rank`` (12, 13, 14) and ``regParam`` (0.01 to 0.16 in steps
    of 0.05) is searched with ``TrainValidationSplit`` using RMSE as the
    selection metric. The winning model is then scored on ``train`` and
    ``test`` and the results are printed.

    Parameters
    ----------
    train, test
        DataFrames handed to ``fit`` / ``transform``.
    kwargs : dict
        Keyword arguments forwarded verbatim to ``ALS(**kwargs)``. Pass
        ``coldStartStrategy="drop"`` to drop any prediction rows that
        contain NaN values before they reach the evaluator.

    Returns
    -------
    The ``bestModel`` of the fitted ``TrainValidationSplit``.
    """
    als = ALS(**kwargs)
    param_grid = ParamGridBuilder()\
                    .addGrid(als.rank, list(range(12, 15)))\
                    .addGrid(als.regParam, [i*0.01 for i in range(1, 17, 5)])\
                    .build()

    # Take the label/prediction columns from the estimator itself rather than
    # from ``kwargs``: this keeps working when the caller relies on the ALS
    # default ``ratingCol`` or overrides ``predictionCol``.
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol=als.getRatingCol(),
                                    predictionCol=als.getPredictionCol())
    tvs = TrainValidationSplit(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator
    )
    model = tvs.fit(train).bestModel

    train_preds = model.transform(train)
    test_preds = model.transform(test)
    # NOTE(review): the regularisation value is read through the private
    # ``_java_obj`` handle of the fitted model; verify this after Spark upgrades.
    print("Best model: {0} rank, {1} reg param".format(model.rank,
                                                       model._java_obj.parent().getRegParam()))
    print("train RMSE = {0}".format(evaluator.evaluate(train_preds)))
    print("test RMSE = {0}".format(evaluator.evaluate(test_preds)))

    return model

In [6]:
# Shared ALS keyword arguments: the column names of the ratings frame plus the
# cold-start strategy.
als_params = dict(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
)

In [7]:
# Seed the 90/10 split so it does not change from one run of the notebook to
# the next.
train, test = ratings.randomSplit([0.9, 0.1], seed=42)
model = evaluate_ALS(train, test, als_params)

Best model: 14 rank, 0.06 reg param
train RMSE = 0.6833607541885203
test RMSE = 0.7541418796609234


In [8]:
# Derive a separate parameter dict instead of mutating ``als_params`` in place,
# so re-running the baseline cell afterwards still trains on "rating".
# NOTE: evaluate_ALS scores predictions against the configured rating column,
# so the RMSEs printed here are measured against "reliable_rating" and are not
# directly comparable with the baseline RMSEs measured against "rating".
reliable_als_params = dict(als_params, ratingCol="reliable_rating")
reliable_model = evaluate_ALS(train, test, reliable_als_params)

Best model: 14 rank, 0.06 reg param
train RMSE = 0.5679290097616302
test RMSE = 0.6198449900878487


In [9]:
# Write through an overwriting writer so this cell can be re-run; a plain
# ``save`` fails once the target directory exists from an earlier run.
model.write().overwrite().save("models/bl_als_model")  # LOADING > model = ALSModel.load("models/bl_als_model")
reliable_model.write().overwrite().save("models/als_model")

In [None]:
def get_recommendations(model, moviesDf, numRecs=10):
    """Return top-N recommendations per user and per movie.

    Parameters
    ----------
    model
        Fitted model exposing ``recommendForAllUsers`` and
        ``recommendForAllItems``.
    moviesDf
        Currently unused; kept so existing calls keep working.
    numRecs : int, optional
        Number of recommendations to generate per user / per movie
        (defaults to 10).

    Returns
    -------
    tuple
        ``(userRecs, movieRecs)``: the results of ``recommendForAllUsers``
        and ``recommendForAllItems`` respectively.
    """
    # Top movie recommendations for each user
    userRecs = model.recommendForAllUsers(numRecs)

    # Top user recommendations for each movie
    movieRecs = model.recommendForAllItems(numRecs)

    # Hand both results back; without a return they are lost when the call ends.
    return userRecs, movieRecs

In [27]:
# Compute the per-user top-10 list explicitly so this cell runs on a fresh
# kernel; ``userRecs`` is otherwise never defined at notebook scope.
# NOTE(review): assumes the recommendations should come from ``reliable_model`` — confirm.
userRecs = reliable_model.recommendForAllUsers(10).cache()

In [28]:
# Preview a single row of the per-user recommendations.
userRecs.show(1)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  7880|[[83829, 7.211159...|
+------+--------------------+
only showing top 1 row

