In [1]:
import sys
# Add ../src to the module search path so that modules stored there can be
# imported from this notebook (presumably where `Utils` lives — confirm).
sys.path.append("../src")

In [2]:
# NOTE(review): star import. `os`, `SparkContext`, `SparkSession`,
# `evaluate_ALS` and `get_recommendations` are used in later cells without any
# other visible import, so they are expected to come from here — confirm
# against Utils and consider importing them explicitly.
from Utils import *

In [3]:
# Obtain the Spark entry points through the builder, which returns the
# already-running session when there is one and starts a new one otherwise.
# This keeps the cell idempotent: re-running it does not construct another
# session object on top of the existing context.
ss = SparkSession.builder.getOrCreate()
# Keep `sc` available under its original name; it is the context backing `ss`.
sc = ss.sparkContext

In [4]:
# Read the ratings CSV (first row is the header, column types are inferred).
ratings = (
    ss.read
      .format("csv")
      .option('header', 'true')
      .option('inferSchema', 'true')
      .load(os.path.join("../data", "userId_movieId_title_ratings.csv"))
)

# Mark the DataFrame for caching; it is reused by several later cells.
ratings.persist()

# Row count with thousands separators, followed by a five-row preview.
print("{0:,}".format(ratings.count()))
ratings.show(n=5)

2,020,165
+--------+------+-------+--------------------+------+------------------+
|ratingId|userId|movieId|               title|rating|   reliable_rating|
+--------+------+-------+--------------------+------+------------------+
|  690318|  4581|      1|    Toy Story (1995)|   3.5|3.4679791510869786|
|  690319|  4581|      2|      Jumanji (1995)|   2.5|2.4771279650621274|
|  690320|  4581|     17|Sense and Sensibi...|   4.0| 3.963404744099404|
|  690321|  4581|     19|Ace Ventura: When...|   3.5|3.4679791510869786|
|  690322|  4581|     32|Twelve Monkeys (a...|   4.0| 3.963404744099404|
+--------+------+-------+--------------------+------+------------------+
only showing top 5 rows



In [5]:
# One row per distinct (movieId, title) pair found in the ratings data.
movies = (
    ratings
    .select("movieId", "title")
    .dropDuplicates()
)

### ALS model

<img src="../misc/matrix_factorization.png" alt="Matrix factorization diagram">

In [6]:
# Keyword arguments for the ALS estimator, handed to `evaluate_ALS` below.
als_params = {
    "userCol": "userId",
    "itemCol": "movieId",
    "ratingCol": "rating",
    # Value "drop" for ALS's cold-start handling option.
    "coldStartStrategy": "drop",
}

In [7]:
# Preview the first five rating rows (same table as printed after loading).
ratings.show(n=5)

+--------+------+-------+--------------------+------+------------------+
|ratingId|userId|movieId|               title|rating|   reliable_rating|
+--------+------+-------+--------------------+------+------------------+
|  690318|  4581|      1|    Toy Story (1995)|   3.5|3.4679791510869786|
|  690319|  4581|      2|      Jumanji (1995)|   2.5|2.4771279650621274|
|  690320|  4581|     17|Sense and Sensibi...|   4.0| 3.963404744099404|
|  690321|  4581|     19|Ace Ventura: When...|   3.5|3.4679791510869786|
|  690322|  4581|     32|Twelve Monkeys (a...|   4.0| 3.963404744099404|
+--------+------+-------+--------------------+------+------------------+
only showing top 5 rows



In [8]:
# 90/10 train/test split. The seed is fixed so that the split, and therefore
# the RMSE values reported below, are reproducible across kernel restarts;
# without it every run evaluates on a different random partition.
train, test = ratings.randomSplit([0.9, 0.1], seed=42)

# Baseline model: ALS on the column named by als_params["ratingCol"].
# `evaluate_ALS` prints the selected rank / regularisation and the train and
# test RMSE (see the cell output) and returns the model.
model = evaluate_ALS(train, test, als_params)

Best model: 14 rank, 0.06 reg param
train RMSE = 0.6832028325789606
test RMSE = 0.7556552849087378


In [9]:
# Second model: same settings, but trained on the "reliable_rating" column.
# A copy of the parameter dict is used instead of mutating `als_params`, so
# the baseline cell still trains on its original rating column if it is
# re-run after this one.
reliable_als_params = dict(als_params, ratingCol="reliable_rating")
reliable_model = evaluate_ALS(train, test, reliable_als_params)
# NOTE(review): the RMSE printed here is presumably measured against
# "reliable_rating" rather than "rating", so it may not be directly comparable
# with the baseline figures — confirm in evaluate_ALS.

Best model: 14 rank, 0.06 reg param
train RMSE = 0.5678966599273602
test RMSE = 0.6221816592808722


In [10]:
# Persist both fitted models. `write().overwrite()` replaces an existing model
# directory, so this cell can be re-run; a plain `save()` raises an error when
# the target path already exists.
# To reload later: model = ALSModel.load("models/bl_als_model")
model.write().overwrite().save("models/bl_als_model")
reliable_model.write().overwrite().save("models/als_model")

In [11]:
# Preview five rows of the movieId/title lookup table.
movies.show(n=5)

+-------+--------------------+
|movieId|               title|
+-------+--------------------+
|   2076|  Blue Velvet (1986)|
|   1490|      B*A*P*S (1997)|
|   2657|Rocky Horror Pict...|
|   4085|Beverly Hills Cop...|
|   6548|  Bad Boys II (2003)|
+-------+--------------------+
only showing top 5 rows



In [12]:
# Build recommendation tables from the model trained on "reliable_rating".
# `movies` is passed along, presumably so titles can be attached to the
# recommended movieIds (both result tables shown below carry a title column)
# — confirm in get_recommendations.
userRecs, movieRecs = get_recommendations(reliable_model, movies)

In [13]:
# Preview five rows of the per-user recommendations.
userRecs.show(n=5)

+------+-------+--------------------+-----------------+
|userId|movieId|               title|           rating|
+------+-------+--------------------+-----------------+
|    65| 101880|Siberian Educatio...|5.092074394226074|
|    65| 117909|     The Kiss (1900)| 5.02252197265625|
|    65| 106048|Four Days in July...|4.845987319946289|
|    65| 112423|I Belong (Som du ...|4.845987319946289|
|    65| 104803|    Holocaust (1978)|4.796901702880859|
+------+-------+--------------------+-----------------+
only showing top 5 rows



In [14]:
# Preview five rows of the per-movie recommendations.
movieRecs.show(n=5)

+-------+------+----------------+-----------------+
|movieId|userId|           title|           rating|
+-------+------+----------------+-----------------+
|      1| 72714|Toy Story (1995)|4.891761302947998|
|      1|117942|Toy Story (1995)|4.771142959594727|
|      1| 76958|Toy Story (1995)|4.708400249481201|
|      1|  5024|Toy Story (1995)|4.706437587738037|
|      1|  8527|Toy Story (1995)|4.705178260803223|
+-------+------+----------------+-----------------+
only showing top 5 rows

