In [1]:
import os

In [128]:
import numpy as np
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark import Row
from pyspark.sql.functions import desc, explode

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [3]:
# Reuse the running SparkContext if there is one, then wrap it in a session
# so the DataFrame reader used below is available.
sc = SparkContext.getOrCreate()
ss = SparkSession(sparkContext=sc)

In [4]:
# Read the ratings CSV (first row is the header, column types are inferred).
ratings_path = os.path.join("data", "userId_movieId_title_ratings.csv")
ratings = (
    ss.read
      .format("csv")
      .option("header", "true")
      .load(ratings_path, inferSchema="true")
)

# Mark the frame for caching; the count below is the first action that runs on it.
ratings.persist()
n_ratings = ratings.count()
print("{0:,}".format(n_ratings))
ratings.show(5)

2,020,165
+--------+------+-------+--------------------+------+------------------+
|ratingId|userId|movieId|               title|rating|   reliable_rating|
+--------+------+-------+--------------------+------+------------------+
|  690318|  4581|      1|    Toy Story (1995)|   3.5|3.4679791510869786|
|  690319|  4581|      2|      Jumanji (1995)|   2.5|2.4771279650621274|
|  690320|  4581|     17|Sense and Sensibi...|   4.0| 3.963404744099404|
|  690321|  4581|     19|Ace Ventura: When...|   3.5|3.4679791510869786|
|  690322|  4581|     32|Twelve Monkeys (a...|   4.0| 3.963404744099404|
+--------+------+-------+--------------------+------+------------------+
only showing top 5 rows



In [95]:
# Distinct (movieId, title) pairs, used later to attach titles to recommendations.
movies = ratings.select("movieId", "title").distinct()

### ALS model

<img src="misc/matrix_factorization.png">

In [5]:
def evaluate_ALS(train, test, kwargs):
    """Tune an ALS recommender on ``train`` and report RMSE on ``train`` and ``test``.

    A grid over ``rank`` (12, 13, 14) and ``regParam`` (0.01, 0.06, 0.11, 0.16)
    is searched with ``TrainValidationSplit`` using RMSE as the metric. The
    best model is then used to score both data sets, and the chosen
    hyper-parameters and both RMSE values are printed.

    Parameters
    ----------
    train : DataFrame
        Ratings used for fitting and for the internal train/validation split.
    test : DataFrame
        Held-out ratings, only used for the final RMSE report.
    kwargs : dict
        Keyword arguments forwarded unchanged to ``ALS(...)``. The rating
        column configured here (or ALS's default when ``ratingCol`` is not
        given) is also used as the evaluator's label column.
        Set ``coldStartStrategy`` to ``"drop"`` so that rows whose prediction
        is NaN are dropped before the RMSE is computed.

    Returns
    -------
    ALSModel
        The best model found by the search.
    """
    als = ALS(**kwargs)
    param_grid = ParamGridBuilder()\
                    .addGrid(als.rank, list(range(12, 15)))\
                    .addGrid(als.regParam, [i*0.01 for i in range(1, 17, 5)])\
                    .build()

    # Ask the estimator for its rating column instead of indexing kwargs, so a
    # caller that relies on the ALS default does not hit a KeyError.
    evaluator = RegressionEvaluator(metricName="rmse", labelCol=als.getRatingCol(),
                                    predictionCol="prediction")
    tvs = TrainValidationSplit(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator
    )
    tvs_model = tvs.fit(train)
    model = tvs_model.bestModel

    # validationMetrics is aligned with param_grid; pick the winning entry with
    # the evaluator's own direction instead of reaching into the private
    # `_java_obj` handle of the fitted model.
    pick_best = np.argmax if evaluator.isLargerBetter() else np.argmin
    best_params = param_grid[int(pick_best(tvs_model.validationMetrics))]

    train_preds = model.transform(train)
    test_preds = model.transform(test)
    print("Best model: {0} rank, {1} reg param".format(model.rank,
                                                       best_params[als.regParam]))
    print("train RMSE = {0}".format(evaluator.evaluate(train_preds)))
    print("test RMSE = {0}".format(evaluator.evaluate(test_preds)))

    return model

In [6]:
# Column mapping and cold-start handling passed to ALS(...) by evaluate_ALS.
als_params = dict(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
)

In [7]:
# Fixed seed so the 90/10 split is the same on every run of the notebook;
# without it the rows in train/test change each time this cell executes.
SPLIT_SEED = 42
train, test = ratings.randomSplit([0.9, 0.1], seed=SPLIT_SEED)
model = evaluate_ALS(train, test, als_params)

Best model: 14 rank, 0.06 reg param
train RMSE = 0.6833607541885203
test RMSE = 0.7541418796609234


In [8]:
# Build a separate parameter dict instead of mutating `als_params` in place:
# re-running the baseline cell after this one must still train on "rating".
reliable_als_params = dict(als_params, ratingCol="reliable_rating")
reliable_model = evaluate_ALS(train, test, reliable_als_params)

Best model: 14 rank, 0.06 reg param
train RMSE = 0.5679290097616302
test RMSE = 0.6198449900878487


In [9]:
# write().overwrite() keeps this cell re-runnable: plain save() raises when the
# target directory already exists from an earlier run.
model.write().overwrite().save("models/bl_als_model")  # LOADING > model = ALSModel.load("models/bl_als_model")
reliable_model.write().overwrite().save("models/als_model")

In [119]:
# Peek at the movieId -> title lookup.
movies.show(n=5)

+-------+--------------------+
|movieId|               title|
+-------+--------------------+
|   2076|  Blue Velvet (1986)|
|   1490|      B*A*P*S (1997)|
|   2657|Rocky Horror Pict...|
|   4085|Beverly Hills Cop...|
|   6548|  Bad Boys II (2003)|
+-------+--------------------+
only showing top 5 rows



In [136]:
def _flatten_recommendations(recs, key_col):
    """Expand the ``recommendations`` array into one row per (key, recommendation)."""
    return recs.select(key_col, explode("recommendations").alias("rec"))\
               .select(key_col, "rec.*")


def get_recommendations(model, movies, n=10):
    """Build flat, titled top-``n`` recommendation tables from a fitted ALS model.

    The nested output of ``recommendForAllUsers`` / ``recommendForAllItems`` is
    flattened with the DataFrame ``explode`` function, so rows stay inside
    Spark instead of being round-tripped through a Python RDD. As a result the
    id and rating columns keep the types produced by the model rather than
    types re-inferred from Python objects.

    Parameters
    ----------
    model : ALSModel
        Fitted model whose user and item columns are ``userId`` and ``movieId``.
    movies : DataFrame
        Lookup with ``movieId`` and ``title`` columns, joined in to add titles.
    n : int, optional
        Number of recommendations per user and per movie (default 10).

    Returns
    -------
    (DataFrame, DataFrame)
        ``userRecs`` with columns (userId, movieId, title, rating), sorted by
        userId then descending rating, and ``movieRecs`` with columns
        (movieId, userId, title, rating), sorted by movieId then descending rating.
    """
    # Top-n movie recommendations for each user
    userRecs = _flatten_recommendations(model.recommendForAllUsers(n), "userId")\
                                    .join(movies, on="movieId")\
                                    .select("userId", "movieId", "title", "rating")\
                                    .sort("userId", desc("rating"))

    # Top-n user recommendations for each movie
    movieRecs = _flatten_recommendations(model.recommendForAllItems(n), "movieId")\
                                    .join(movies, on="movieId")\
                                    .select("movieId", "userId", "title", "rating")\
                                    .sort("movieId", desc("rating"))

    return userRecs, movieRecs

In [137]:
# Recommendations from the model trained on the reliability-adjusted ratings.
userRecs, movieRecs = get_recommendations(model=reliable_model, movies=movies)

In [138]:
# First few per-user recommendations.
userRecs.show(n=5)

+------+-------+--------------------+-----------------+
|userId|movieId|               title|           rating|
+------+-------+--------------------+-----------------+
|    65| 117909|     The Kiss (1900)| 5.06435489654541|
|    65| 104803|    Holocaust (1978)|4.824891567230225|
|    65| 103022|Eu Não Quero Volt...|4.806967735290527|
|    65| 106048|Four Days in July...|4.800962924957275|
|    65| 112423|I Belong (Som du ...|4.800962924957275|
+------+-------+--------------------+-----------------+
only showing top 5 rows



In [139]:
# First few per-movie recommendations.
movieRecs.show(n=5)

+-------+------+----------------+------------------+
|movieId|userId|           title|            rating|
+-------+------+----------------+------------------+
|      1| 72714|Toy Story (1995)| 4.885940074920654|
|      1|117942|Toy Story (1995)|4.8285441398620605|
|      1|  8527|Toy Story (1995)| 4.736027717590332|
|      1| 40617|Toy Story (1995)|4.7006330490112305|
|      1| 53413|Toy Story (1995)|4.6796793937683105|
+-------+------+----------------+------------------+
only showing top 5 rows

