In [23]:
import sys
sys.path.append("../src")

In [24]:
from Utils import *

In [25]:
sc = SparkContext.getOrCreate()
ss = SparkSession(sc)

In [26]:
def sample_data(df, movies_count=5000, users_count=1000):
    return df.select("movieId").dropDuplicates().limit(movies_count)\
            .join(df, on="movieId").join(df.select("userId").dropDuplicates().limit(users_count), on="userId")
            

In [27]:
ratings = ss.read\
            .format("csv")\
            .option('header', 'true')\
            .load(os.path.join("../data","userId_movieId_title_ratings.csv"), inferSchema='true')
    
ratings = sample_data(ratings)
ratings.persist()
print("Users: {0:,}".format(get_count(ratings,"userId")))
print("Movies: {0:,}".format(get_count(ratings,"movieId")))
print("Ratings: {0:,}".format(ratings.count()))
ratings.show(5)

Users: 1,000
Movies: 3,306
Ratings: 80,453
+------+-------+--------+--------------------+--------------------+------+------------------+
|userId|movieId|ratingId|               title|              genres|rating|   reliable_rating|
+------+-------+--------+--------------------+--------------------+------+------------------+
|  5936|    111|  887646|  Taxi Driver (1976)|Crime|Drama|Thriller|   4.5|3.7691303537482317|
|  5936|    223|  887648|       Clerks (1994)|              Comedy|   4.0|3.3503380922206505|
|  5936|    296|  887650| Pulp Fiction (1994)|Comedy|Crime|Dram...|   5.0| 4.187922615275813|
|  5936|    471|  887655|Hudsucker Proxy, ...|              Comedy|   4.5|3.7691303537482317|
|  5936|    858|  887660|Godfather, The (1...|         Crime|Drama|   4.0|3.3503380922206505|
+------+-------+--------+--------------------+--------------------+------+------------------+
only showing top 5 rows



In [31]:
sum_ratings_per_item = lambda item: ratings.select([item, "rating", "reliable_rating"])\
                                               .groupBy(item).max()\
                                               .select(
    [item, col("max(rating)").alias("#rating"),col("max(reliable_rating)").alias("#reliable_rating")])

sum_ratings_per_movie = sum_ratings_per_item("movieId")
print("{0:,} movies".format(sum_ratings_per_movie.count()))
sum_ratings_per_movie.show()

3,306 movies
+-------+-------+------------------+
|movieId|#rating|  #reliable_rating|
+-------+-------+------------------+
|    471|    5.0| 4.971667471256922|
|  31528|    4.5|4.4840060064508505|
|   4900|    5.0| 4.962749463706647|
|   1580|    5.0| 4.986457469239084|
|  44022|    5.0| 4.853260910176726|
|   8638|    5.0| 4.983877537450501|
|  68135|    4.5| 4.093356760815128|
|   1645|    5.0|  4.98749736361837|
|   6620|    5.0| 4.987562612014458|
|   1591|    5.0| 4.045173444033664|
|   3794|    4.0|3.9848537060329736|
|   4519|    5.0| 4.389483628043458|
|   5300|    4.5| 3.988684011538107|
|  54190|    5.0| 4.981067132541217|
|  36525|    5.0| 4.986117032403464|
|   3175|    5.0|  4.98749736361837|
|   1088|    5.0| 4.981067132541217|
|  96488|    5.0| 4.904379816245556|
|   5803|    4.0| 3.969224519756879|
|   3997|    4.0|3.4451974187633945|
+-------+-------+------------------+
only showing top 20 rows



In [32]:
sum_ratings_per_user = sum_ratings_per_item("userId")
print("{0:,} users".format(sum_ratings_per_user.count()))
sum_ratings_per_user.show()

1,000 users
+------+-------+------------------+
|userId|#rating|  #reliable_rating|
+------+-------+------------------+
|  4935|    5.0|2.7136193837258067|
|  7880|    5.0| 4.164683584426196|
|  7993|    5.0| 4.229870885587098|
| 10817|    5.0|3.5244141738300354|
| 13285|    5.0|3.9029876146221727|
| 23364|    5.0|3.8724385431988533|
| 23571|    5.0| 4.857474084534058|
| 25591|    5.0| 4.274397466145984|
| 34234|    5.0| 3.373803231428388|
| 34239|    5.0| 4.980996271352866|
| 36224|    5.0|3.6712075078887816|
| 36538|    5.0| 4.916121796121436|
| 40335|    5.0|  4.98426285678417|
| 45011|    5.0| 4.889117547317986|
| 65867|    5.0|   3.2552881540007|
| 73683|    5.0|3.7494290887090465|
| 83693|    4.0| 3.813770605093409|
| 97092|    4.5|4.3566009363481255|
|101055|    5.0| 4.505570816450706|
|102594|    5.0|3.3354571051185906|
+------+-------+------------------+
only showing top 20 rows



In [33]:
u_ratings = sc.broadcast({r["userId"]: (r["#rating"], r["#reliable_rating"]) for r in sum_ratings_per_user.collect()})
i_ratings = sc.broadcast({r["movieId"]: (r["#rating"], r["#reliable_rating"]) for r in sum_ratings_per_movie.collect()})

In [34]:
# probs = ratings.select(["userId", "movieId", "rating", "reliable_rating"]).rdd\
probs = ratings.select(["userId", "movieId", "reliable_rating"]).rdd\
                .map(lambda x: Row(userId=x["userId"], movieId=x["movieId"],
                                   
#                         p_u_i_r=x["rating"]/i_ratings.value[x["movieId"]][0],
                        p_u_i=x["reliable_rating"]/i_ratings.value[x["movieId"]][1],
                                   
#                         p_i_u_r=x["rating"]/u_ratings.value[x["userId"]][0],
                        p_i_u=x["reliable_rating"]/u_ratings.value[x["userId"]][1]))\
                .toDF()
probs.show(5)

+-------+-----+------------------+------+
|movieId|p_i_u|             p_u_i|userId|
+-------+-----+------------------+------+
|    111|  0.9|0.7558908881866953|  5936|
|    223|  0.8|0.6717385530459459|  5936|
|    296|  1.0|0.8396731913074322|  5936|
|    471|  0.9|0.7581219732693287|  5936|
|    858|  0.8|0.6719428805596535|  5936|
+-------+-----+------------------+------+
only showing top 5 rows



In [35]:
del sum_ratings_per_user, sum_ratings_per_movie, u_ratings, i_ratings

$$p(e_a|u_i)=\frac{\text{# posts by }u_i\text{ in } e_a}{\text{# posts by } u_i}$$

<br>

$$p(u_i|e_a)=\frac{\text{# posts by }u_i\text{ in } e_a}{\text{# posts in } e_a}$$

<br>

$$\textbf{MLE Similarity}=p(u_i|u_j) = 1-\prod_{\forall u_i, u_j \in users, \forall e \in elements} (1-
p(e|u_j)p(u_i|e))$$

In [36]:
def users_mle_similarities_per_movie(probs):
    """
    probs is an iterable [( p_i_u, p_u_i, userId)] where i is a movie m_i
    """
#     """
#     probs is an iterable [( p_i_u_r, p_i_u_relr, p_u_i_r, p_u_i_relr, userId)] where i is a movie m_i
#     """
    d = dict()  # {(ui, uj): (1-(p(i_ui) * p(uj_i)), repeat for 'reliable' probs)}
    for i, j in product(probs, probs):
        ui, uj = i[-1], j[-1]
        if ui == uj: continue
#         p_i_ui, p_i_ui_rel = i[0], i[1]
#         p_uj_i, p_uj_i_rel = j[2], j[3]
        p_i_ui = i[0]
        p_uj_i = j[1]
#         d[(ui, uj)] = (1 - p_i_ui*p_uj_i, 1 - p_i_ui_rel*p_uj_i_rel)
        d[(ui, uj)] = 1 - p_i_ui*p_uj_i
    return list(d.items())

In [37]:
# users_similar_to_ui = probs.rdd.map(lambda r: (r["movieId"],
#                          (r["p_i_u_r"], r["p_i_u_relr"], r["p_u_i_r"], r["p_u_i_relr"], r["userId"])))\
ml_similarities = probs.rdd.map(lambda r: (r["movieId"],
                         (r["p_i_u"], r["p_u_i"], r["userId"])))\
        .groupByKey()\
        .mapValues(lambda x: users_mle_similarities_per_movie(x))\
        .flatMapValues(lambda x: x)\
        .map(lambda x: x[1])\
        .reduceByKey(lambda x1, x2: x1 * x2)\
        .map(lambda r: Row(ui=r[0][0], uj=r[0][1], sim=1-r[1]))\
        .toDF()\
        .sort(desc("sim"))\

#         .reduceByKey(lambda x1, x2: (x1[0] * x2[0], x1[1]*x2[1]))\
#         .map(lambda r: Row(ui=r[0][0], uj=r[0][1], sim=1-r[1][0], sim_rel=1-r[1][1]))\
ml_similarities.write.mode('overwrite').csv("../data/similarities.csv", header=True)
ml_similarities.show(5)
del ml_similarities

+---+-----+------+
|sim|   ui|    uj|
+---+-----+------+
|1.0|13665| 59914|
|1.0|13665| 57218|
|1.0| 5936| 68905|
|1.0|13665|138186|
|1.0|13665| 70346|
+---+-----+------+
only showing top 5 rows



In [38]:
def users_W_similarities_per_movie(rates, max_rating=5.):
    """
    rates is a list [(ui, rating) for every user rating that movie]
    """
    d = dict()
    for (u1, r1), (u2, r2) in product(rates, rates):
        if u1 == u2: continue
        d[(u1, u2)] = (abs(r1 - r2), max_rating)
    return list(d.items())

$$W^{sim}(u, v) = 1-\sum_{m\in {M_u \cap M_v}}{\frac{{abs(r_u(m) - r_v(m))}}{R}}$$

$R$ is the maximum rating (i.e 5.0)

In [39]:
weighted_similarities = ratings.rdd.map(lambda r: (r["movieId"], (r["userId"], r["reliable_rating"])))\
                                    .groupByKey()\
                                    .mapValues(lambda x: users_W_similarities_per_movie(x))\
                                    .flatMapValues(lambda x:x)\
                                    .map(lambda x:x[1])\
                                    .reduceByKey(lambda x1, x2: (x1[0]+x2[0], x1[1]+x2[1]))\
                                    .map(lambda x: Row(ui=x[0][0], uj=x[0][1], sim=1-float(x[1][0])/x[1][1]))\
                                    .toDF()\
                                    .sort(desc("sim"))
weighted_similarities.write.mode('overwrite').csv("../data/weighted_similarities.csv", header=True)
weighted_similarities.show(5)

+------------------+------+------+
|               sim|    ui|    uj|
+------------------+------+------+
|0.9999943499883635|104397| 24994|
|0.9999943499883635| 24994|104397|
|0.9999937846416633| 40335| 12373|
|0.9999937846416633| 12373| 40335|
|0.9999927547972378|  6108| 75808|
+------------------+------+------+
only showing top 5 rows



### Get recommendations for a user

In [40]:
ui = 48838
user_recs = weighted_similarities.filter(weighted_similarities.ui == ui)\
                                        .join(ratings, weighted_similarities.uj==ratings.userId)\
                                        .select([col("ui").alias("userId"), "movieId", "title",
                                                 col("reliable_rating").alias("rating"), "sim"])

user_recs = user_recs.withColumn("pred", user_recs.rating*user_recs.sim)\
                                        .groupBy(["userId", "movieId"]).avg("pred")\
                                        .select(["userId", "movieId", col("avg(pred)").alias("pred")])\


user_recs = user_recs.join(ratings, on=["userId", "movieId"])\
                                        .select(["userId", "movieId", "title",
                                                 col("reliable_rating").alias("rating"), "pred"])
user_recs = user_recs.withColumn("error", (user_recs.rating-user_recs.pred)**2).sort(desc("pred"))
user_recs.show(5)

+------+-------+--------------------+------------------+------------------+--------------------+
|userId|movieId|               title|            rating|              pred|               error|
+------+-------+--------------------+------------------+------------------+--------------------+
| 48838|    296| Pulp Fiction (1994)| 2.727917135391917|2.7763397347457524|0.002344748128182062|
| 48838|   8015|Phantom Tollbooth...|3.5073220312181785| 2.717367809991806|  0.6240276716333648|
| 48838|   1197|Princess Bride, T...|  3.89702447913131|2.7035503156823006|   1.424380578820312|
| 48838|   2542|Lock, Stock & Two...|  3.89702447913131| 2.684786111663989|   1.469521859559835|
| 48838|   1210|Star Wars: Episod...|  3.89702447913131| 2.666324690276045|   1.514621970288393|
+------+-------+--------------------+------------------+------------------+--------------------+
only showing top 5 rows



In [41]:
def get_user_recs_error(ui, similarities, recs_size=10):
    user_recs = similarities.filter(similarities.ui == ui)\
                                            .join(ratings, similarities.uj==ratings.userId)\
                                            .select([col("ui").alias("userId"), "movieId", "title",
                                                     "genres", col("reliable_rating").alias("rating"), "sim"])

    user_recs = user_recs.withColumn("pred", user_recs.rating*user_recs.sim)\
                                            .groupBy(["userId", "movieId", "title", "genres"]).avg("pred")\
                                            .sort(desc("avg(pred)"))\
                                            .select(["userId", "movieId", "title", "genres",
                                                     col("avg(pred)").alias("pred")])

    errors = user_recs.join(ratings, on=["userId", "movieId"])\
                                            .select([col("reliable_rating").alias("rating"), "pred"])
    errors = errors.withColumn("error", (errors.rating-errors.pred)**2)
    
    error = errors.rdd.map(lambda x: x["error"]).reduce(lambda x1, x2: x1+x2)
    return error, user_recs.limit(recs_size), errors.count()

In [42]:
def get_recommendations(users, similarities, recs_size=10):
    error = 0
    recs = None
    count = 0
    for u in users:
        print("Predicting for user: {0}".format(u))
        e, r, c = get_user_recs_error(u, similarities, recs_size)
        recs = recs.union(r) if recs else r
        error += e
        count += c
        print("MSE: ", round(e/c, 3))
        print()
    error /= count
    return recs, error**0.5

In [43]:
users = ratings.select(["userId"]).dropDuplicates().rdd.map(lambda r: r[0]).collect()[:10]
recs, error = get_recommendations(users, weighted_similarities, recs_size=10)
print("RMSE: ", error)
recs = recs.cache()
recs.show()

Predicting for user: 4935
MSE:  0.337

Predicting for user: 7880
MSE:  1.011

Predicting for user: 7993
MSE:  1.157

Predicting for user: 10817
MSE:  0.402

Predicting for user: 13285
MSE:  0.481

Predicting for user: 23364
MSE:  0.734

Predicting for user: 23571
MSE:  1.432

Predicting for user: 25591
MSE:  0.952

Predicting for user: 34234
MSE:  0.334

Predicting for user: 34239
MSE:  3.659

RMSE:  0.9341897029596626
+------+-------+--------------------+--------------------+------------------+
|userId|movieId|               title|              genres|              pred|
+------+-------+--------------------+--------------------+------------------+
|  4935|   8025|Thief, The (Vor) ...|               Drama| 4.225135465870284|
|  4935| 107627|Physician, The (2...|     Adventure|Drama|3.6975812898193454|
|  4935|  46083|Drawing Restraint...|             Fantasy| 3.691640469826862|
|  4935| 118338|Hard to Be a God ...|              Sci-Fi|3.6095736372850182|
|  4935| 101862|50 Children: Th

In [44]:
del weighted_similarities

In [45]:
ml_similarities = ss.read\
            .format("csv")\
            .option('header', 'true')\
            .load(os.path.join("../data","similarities.csv"), inferSchema='true')

In [46]:
ml_similarities.show()

+---+-----+------+
|sim|   ui|    uj|
+---+-----+------+
|1.0| 5936| 52462|
|1.0|13665| 48909|
|1.0|13665| 60005|
|1.0|13665| 57053|
|1.0|13665|119197|
|1.0|28599|130987|
|1.0|28599| 93547|
|1.0|28599|128323|
|1.0|28599| 22963|
|1.0|28599| 80987|
|1.0|28599|108403|
|1.0|38347| 81487|
|1.0|38347|121535|
|1.0|38347| 87135|
|1.0|38347| 99871|
|1.0|46880|  2062|
|1.0|46880| 56342|
|1.0|46880| 60398|
|1.0|46880| 79734|
|1.0|46880| 39766|
+---+-----+------+
only showing top 20 rows



In [47]:
users = ratings.select(["userId"]).dropDuplicates().rdd.map(lambda r: r[0]).collect()[:10]
mle_recs, error = get_recommendations(users, ml_similarities, recs_size=10)
print("RMSE: ", error)
mle_recs.show()

Predicting for user: 4935
MSE:  1.313

Predicting for user: 7880
MSE:  0.503

Predicting for user: 7993
MSE:  0.471

Predicting for user: 10817
MSE:  0.215

Predicting for user: 13285
MSE:  0.561

Predicting for user: 23364
MSE:  0.636

Predicting for user: 23571
MSE:  1.099

Predicting for user: 25591
MSE:  0.538

Predicting for user: 34234
MSE:  0.096

Predicting for user: 34239
MSE:  1.941

RMSE:  0.8151495854796782
+------+-------+--------------------+--------------------+------------------+
|userId|movieId|               title|              genres|              pred|
+------+-------+--------------------+--------------------+------------------+
|  4935| 100617|Patton Oswalt: No...|              Comedy| 4.981067128142341|
|  4935| 120815|Patton Oswalt: We...|              Comedy| 4.981067128142341|
|  4935|    406| Federal Hill (1994)|               Drama| 4.976210295183103|
|  4935|  71433|Black God, White ...|Adventure|Crime|D...|  4.87340685641366|
|  4935| 118338|Hard to Be a Go

### Notice that RMSE is not suitable for evaluating this approach, because predicted ratings are scaled by users similarities, which are typically very small. What matters is the rank of the prediction.

In [48]:
u1, u2 = recs.select(["userId"]).dropDuplicates().rdd.collect()[:2]
u1, u2 = u1[0], u2[0]

In [49]:
# .withColumn("rank", F.dense_rank().over(Window.partitionBy("userId").orderBy(desc("reliable_rating"))))
ratings.filter(ratings.userId==u1).sort(desc("reliable_rating")).show()

+------+-------+--------+--------------------+--------------------+------+------------------+
|userId|movieId|ratingId|               title|              genres|rating|   reliable_rating|
+------+-------+--------+--------------------+--------------------+------+------------------+
|  4935|   4973|  738692|Amelie (Fabuleux ...|      Comedy|Romance|   5.0|2.7136193837258067|
|  4935|   2791|  738683|    Airplane! (1980)|              Comedy|   5.0|2.7136193837258067|
|  4935|   2502|  738678| Office Space (1999)|        Comedy|Crime|   5.0|2.7136193837258067|
|  4935|   1288|  738661|This Is Spinal Ta...|              Comedy|   4.5| 2.442257445353226|
|  4935|   8464|  738705|Super Size Me (2004)|Comedy|Documentar...|   4.5| 2.442257445353226|
|  4935|  93855|  738737|God Bless America...|        Comedy|Drama|   4.0|2.1708955069806453|
|  4935|   3421|  738684| Animal House (1978)|              Comedy|   4.0|2.1708955069806453|
|  4935|   1274|  738660|        Akira (1988)|Action|Adventu

In [50]:
ratings.filter(ratings.userId==u2).sort(desc("reliable_rating")).show()

+------+-------+----------+--------------------+--------------------+------+-----------------+
|userId|movieId|  ratingId|               title|              genres|rating|  reliable_rating|
+------+-------+----------+--------------------+--------------------+------+-----------------+
| 23571|    296|8590793923| Pulp Fiction (1994)|Comedy|Crime|Dram...|   5.0|4.857474084534058|
| 23571|   1466|8590794051|Donnie Brasco (1997)|         Crime|Drama|   4.5|4.371726676080652|
| 23571|   2231|8590794131|     Rounders (1998)|               Drama|   4.5|4.371726676080652|
| 23571|    111|8590793897|  Taxi Driver (1976)|Crime|Drama|Thriller|   4.5|4.371726676080652|
| 23571|  49530|8590794497|Blood Diamond (2006)|Action|Adventure|...|   4.5|4.371726676080652|
| 23571|    593|8590793966|Silence of the La...|Crime|Horror|Thri...|   4.5|4.371726676080652|
| 23571|  86377|8590794602|Louis C.K.: Shame...|              Comedy|   4.5|4.371726676080652|
| 23571|   1265|8590794033|Groundhog Day (1993)|Co