In [1]:
sc

In [10]:
import pandas as pd
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *



In [1]:
#lines = spark.read.csv("hdfs:/user/data/ratings.csv", header="true",inferSchema="true").rdd
#lines = spark.read.csv("gs://dataset-rs/ml-20m/ratings.csv", header="true",inferSchema="true").rdd
lines = spark.read.csv("/home/aleja/Documentos/datasets/ml-20m/ratings.csv", header="true",inferSchema="true").rdd

lines.take(2)

[Row(userId=1, movieId=2, rating=3.5, timestamp=1112486027),
 Row(userId=1, movieId=29, rating=3.5, timestamp=1112484676)]

In [2]:

df_ratings = spark.createDataFrame(lines) 

df_ratings

DataFrame[userId: bigint, movieId: bigint, rating: double, timestamp: bigint]

In [3]:
df_ratings.rdd.getNumPartitions()

4

In [4]:
#newdf = ratings.limit(20000)
ratings = df_ratings.sample(False,fraction=0.5, seed=1)
ratings.select('userId').count()

9998448

In [5]:
ratings = ratings.repartition(1)
ratings.rdd.getNumPartitions()

1

In [6]:
ratings.select('userId').distinct().count()

138493

In [7]:
ratings.select('movieId').distinct().count()

24066

In [8]:
(training, test) = ratings.randomSplit([0.8, 0.2])
training.printSchema()


root
 |-- userId: long (nullable = true)
 |-- movieId: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)



In [16]:
training.show(2)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     32|   3.5|1112484819|
+------+-------+------+----------+
only showing top 2 rows



In [17]:
training.filter(training['rating'] > 4).show(10)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|   4993|   5.0|1112484682|
|     1|   7153|   5.0|1112484633|
|     1|   8507|   5.0|1094786027|
|     2|     62|   5.0| 974820598|
|     2|     70|   5.0| 974820691|
|     2|    480|   5.0| 974820720|
|     2|    541|   5.0| 974821014|
|     2|    589|   5.0| 974820658|
|     2|    924|   5.0| 974821014|
|     2|   1214|   5.0| 974821014|
+------+-------+------+----------+
only showing top 10 rows



ALS
-numBlocks is the number of blocks the users and items will be partitioned into in order to parallelize computation (defaults to 10).
-rank is the number of latent factors in the model (defaults to 10).
-maxIter is the maximum number of iterations to run (defaults to 10).
-regParam specifies the regularization parameter in ALS (defaults to 1.0).
-implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false which means using explicit feedback).
-alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0).
-nonnegative specifies whether or not to use nonnegative constraints for least squares (defaults to false).


In [11]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop")


In [12]:
model = als.fit(training)
model.rank

10

In [20]:
#para usar despues
predictions_Data= test.select(test.columns[:2])

In [21]:
predictions_Data = sorted(predictions_Data.collect(), key=lambda r:r[0])

In [22]:
predictions_Data[0]

Row(userId=1, movieId=253)

In [13]:
predictions = model.transform(test)

In [14]:
predictions.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
| 97435|    148|   4.0|1042483722| 2.8819597|
| 88527|    148|   2.0| 965659724| 2.1215599|
| 91782|    148|   3.0| 846406692| 2.7529137|
| 36445|    148|   4.5|1419358555| 2.2832372|
| 20344|    148|   2.0| 965940170| 3.0843344|
| 68242|    148|   3.0|1047397251|  2.868523|
|108141|    148|   3.0| 837773116| 2.3494816|
| 28361|    148|   4.0| 828873686| 3.9624681|
| 80886|    148|   2.0| 944246202| 3.1821542|
|107802|    148|   3.0| 834487594| 3.0518832|
| 22584|    148|   2.0| 835094487| 3.1753657|
| 85166|    148|   3.0| 944274839| 2.9969263|
|  5186|    148|   2.0| 962906606| 2.5781078|
| 67698|    148|   3.0| 945124706| 3.4344025|
|  5814|    148|   3.0| 859547410| 3.1405952|
| 12539|    148|   3.0| 956789580|  3.057478|
| 80952|    148|   3.0| 833159835|  2.866095|
| 28478|    148|   5.0| 836529818|   4.15996|
|136182|    148|   4.0| 963673787|

In [15]:
# Evaluate the model by computing the RMSE on the test data


evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions.na.drop())
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.86619277371824


In [16]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show(truncate=False)

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                                     |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|148   |[[33110,10.416933], [82759,10.416395], [81443,10.244753], [103721,10.024731], [92122,9.99485], [65667,9.954976], [58898,9.8829], [97779,9.841513], [5280,9.768883], [52104,9.751188]]               |
|463   |[[69858,10.931751], [94951,9.545645], [81443,9.342561], [82051,9.308655], [73529,9.19513], [87065,8.934564], [87040,8.456299], [101862,8.43072], [91632,8.430481], [6220

In [17]:
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.show(truncate=False)

+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                                                    |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1580   |[[40343,6.001258], [70175,5.901202], [101476,5.80225], [96967,5.802183], [122237,5.7978315], [91209,5.787822], [67233,5.741064], [24590,5.7010384], [101259,5.6936216], [53993,5.6929655]]         |
|4900   |[[39244,10.003544], [93965,8.550216], [24829,8.339062], [41266,8.324963], [6949,8.277994], [122237,8.192054], [61315,8.164588], [124294,8.164522], [90503,8.013707], [9

In [18]:
movies_data = spark.read.csv("/home/aleja/Documentos/datasets/ml-20m/movies.csv", header="true",inferSchema="true").rdd
df_movies = spark.createDataFrame(movies_data)
df_movies

DataFrame[movieId: bigint, title: string, genres: string]

In [19]:
df_movies.select('movieId').count()

27278

In [20]:
movies = df_movies.sample(False,fraction=0.5, seed=1)
movies.select('movieId').count()

13710

In [21]:
movies.show(truncate=False)

+-------+----------------------------------------------------+-------------------------------------------+
|movieId|title                                               |genres                                     |
+-------+----------------------------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                                    |Adventure|Animation|Children|Comedy|Fantasy|
|3      |Grumpier Old Men (1995)                             |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)                            |Comedy|Drama|Romance                       |
|7      |Sabrina (1995)                                      |Comedy|Romance                             |
|9      |Sudden Death (1995)                                 |Action                                     |
|12     |Dracula: Dead and Loving It (1995)                  |Comedy|Horror                              |
|13     |Balto (1995)                

In [22]:
movies = movies.repartition(1)
movies.rdd.getNumPartitions()

1

In [23]:
data = ratings.select("movieId").distinct().withColumn("userId", lit(148))
data.show()


+-------+------+
|movieId|userId|
+-------+------+
|   2529|   148|
|    474|   148|
|  45726|   148|
|     29|   148|
|  60756|   148|
|   1950|   148|
| 106002|   148|
| 106100|   148|
|    964|   148|
|   2927|   148|
|   3091|   148|
|   1806|   148|
|   1677|   148|
|   3764|   148|
|     26|   148|
|   5385|   148|
|  51709|   148|
|  96829|   148|
|  51418|   148|
|   2453|   148|
+-------+------+
only showing top 20 rows



In [24]:
datamv = ratings.filter(ratings.userId == 148).select("movieId", "userId")
datamv.show()

+-------+------+
|movieId|userId|
+-------+------+
|     17|   148|
|     18|   148|
|     39|   148|
|     46|   148|
|     86|   148|
|    222|   148|
|    224|   148|
|    252|   148|
|    342|   148|
|    353|   148|
|    356|   148|
|    362|   148|
|    468|   148|
|    597|   148|
|    708|   148|
|    902|   148|
|    914|   148|
|    916|   148|
|    933|   148|
|   1057|   148|
+-------+------+
only showing top 20 rows



In [25]:
data_pred = model.transform(data.subtract(datamv)).dropna().orderBy("prediction",ascending=False).limit(5).select("movieId", "prediction")
data_pred.show()

+-------+----------+
|movieId|prediction|
+-------+----------+
|  33110| 10.416933|
|  82759| 10.416395|
|  81443| 10.244753|
| 103721| 10.024731|
|  92122|   9.99485|
+-------+----------+



In [26]:
data_pred

DataFrame[movieId: bigint, prediction: float]

In [27]:
movies

DataFrame[movieId: bigint, title: string, genres: string]

In [28]:
rec_mv = data_pred.join(movies, data_pred.movieId == movies.movieId)
#select(data_pred.movieId, movies.title, data_pred.prediction)
rec_mv.show()

+-------+----------+-------+--------------------+------------+
|movieId|prediction|movieId|               title|      genres|
+-------+----------+-------+--------------------+------------+
|  81443| 10.244753|  81443|Counsellor at Law...|       Drama|
| 103721| 10.024731| 103721|         Love (2011)|Drama|Sci-Fi|
+-------+----------+-------+--------------------+------------+



In [29]:
from pyspark.sql.functions import lit



def recommendMovies(model, user, nbRecommendations):
     # Create a Spark DataFrame with the specified user and all the movies listed in the ratings DataFrame
    dataSet = ratings.select("movieId").distinct().withColumn("userId", lit(user))

    # Create a Spark DataFrame with the movies that have already been rated by this user
    moviesAlreadyRated = ratings.filter(ratings.userId == user).select("movieId", "userId")

    # Apply the recommender system to the data set without the already rated movies to predict ratings
    predictions = model.transform(dataSet.subtract(moviesAlreadyRated)).dropna().orderBy("prediction", ascending=False).limit(nbRecommendations).select("movieId", "prediction")
    
    # Join with the movies DataFrame to get the movies titles and genres
    recommendations = predictions.join(movies, predictions.movieId == movies.movieId).select(predictions.movieId, movies.title, movies.genres, predictions.prediction)

    recommendations.show(truncate=False)

In [30]:
print ("Recommendations for user 148:")
recommendMovies(model,148,10)

Recommendations for user 148:
+-------+------------------------------+------------------------------------------+----------+
|movieId|title                         |genres                                    |prediction|
+-------+------------------------------+------------------------------------------+----------+
|5280   |Salmonberries (1991)          |Drama                                     |9.768883  |
|58898  |Aerial, The (La antena) (2007)|Adventure|Fantasy|Film-Noir|Mystery|Sci-Fi|9.8829    |
|81443  |Counsellor at Law (1933)      |Drama                                     |10.244753 |
|103721 |Love (2011)                   |Drama|Sci-Fi                              |10.024731 |
+-------+------------------------------+------------------------------------------+----------+



Now we can use the various prediction functions on the model variable.

*predict*: - Return a single floating point value
*predictAll*: -Returns RDD of Rating Objects
*recommendUsers*: -Returns a List of Ratings in Descending Order by Rating
*recommendProducts*: -Returns a List of Ratings
*recommendProductsForUsers*:-Returns RDD with(UserID, (RatingObj, RatingObj, …) ) where RatingObj is sorted descending by rating
*recommendUsersforProducts*:-Returns RDD with(ProductID, (RatingObj, RatingObj, …) ) where RatingObj is sorted descending by rating
