In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
# Obtain the SparkSession used by every later cell; getOrCreate() hands back
# an already-running session when one exists instead of starting a new one.
spark = (
    SparkSession.builder
    .appName('Recommender system')
    .getOrCreate()
)

24/02/04 12:17:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
# Read the MovieLens ratings CSV. The first row supplies the column names and
# the column types are inferred from the values rather than declared.
data = spark.read.csv(
    path='data/ML/recommender_system/movielens_ratings.csv',
    inferSchema=True,
    header=True,
)

In [8]:
# Preview the first rows of the loaded ratings (movieId, rating, userId).
data.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+


In [9]:
# Check the inferred column types before handing the frame to ALS.
data.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)


In [10]:
# Summary statistics (count, mean, stddev, min, max) for each column of the full dataset.
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+


In [15]:
# Hold out roughly 20% of the ratings for evaluation.
# A fixed seed makes the split reproducible, so the model, the RMSE and the
# per-user recommendations below are stable under "Restart & Run All".
train_data, test_data = data.randomSplit([0.80, 0.20], seed=42)

In [16]:
# Summary statistics of the held-out split, for comparison with the full dataset.
test_data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|               318|               318|               318|
|   mean|48.930817610062896|1.8584905660377358|13.720125786163521|
| stddev| 29.12198200539353|1.2364496791454895| 8.574058739295234|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+


In [38]:
# Configure the ALS collaborative-filtering estimator on explicit ratings.
# - coldStartStrategy="drop": a random split can leave a test user or movie
#   with no training rows; the default ("nan") then emits NaN predictions,
#   which turns the RMSE computed later into NaN. Dropping those rows keeps
#   the metric well-defined.
# - seed: fixes the random factor initialisation so repeated fits on the same
#   training data give the same model.
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

In [39]:
# Fit the ALS estimator on the training split only; the test split stays unseen.
model = als.fit(train_data)

In [40]:
# Score the held-out rows; the result keeps the input columns and adds `prediction`.
predictions = model.transform(test_data)

In [41]:
# Inspect predicted vs. actual ratings for the held-out rows.
# NOTE(review): the recorded output contains negative predictions although
# the observed ratings range from 1.0 to 5.0 — confirm whether the model
# needs constraining or tuning.
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|      0|   3.0|    28|  1.2109452|
|      2|   1.0|    26|  1.0189596|
|      4|   4.0|    26| 0.96820885|
|      4|   1.0|    12|  0.7779279|
|      2|   2.0|     1| 0.51243174|
|      3|   1.0|     1|   1.525861|
|      5|   1.0|    13|  1.7642727|
|      1|   1.0|     3|  2.6784935|
|      2|   2.0|    20| 0.26189977|
|      4|   2.0|    20|  1.9301673|
|      0|   1.0|    19| 0.28556177|
|      0|   1.0|    15|-0.40665403|
|      1|   4.0|    15|-0.46970066|
|      2|   1.0|    15|  1.1296862|
|      2|   4.0|     8|  2.0162854|
|      4|   2.0|     8|  1.4718423|
|      3|   1.0|     7|  0.8495594|
|      4|   3.0|    10|  0.8538911|
|      5|   1.0|    29|  1.5374838|
|      6|   1.0|     2|  1.0139494|
+-------+------+------+-----------+


In [42]:
# Evaluator comparing the `prediction` column with the true `rating` column
# using root-mean-squared error.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

In [43]:
# RMSE of the model's predictions on the held-out test rows.
rmse = evaluator.evaluate(predictions)

In [44]:
# Display the test RMSE (same units as the rating).
# NOTE(review): the recorded value (~1.71) is larger than the test-set rating
# stddev (~1.24) reported by test_data.describe(), which suggests this run
# did worse than always predicting the mean — consider tuning regParam/rank.
rmse

1.7128230393361559

In [45]:
# Take the held-out (movieId, userId) pairs of user 11, leaving the true
# rating out so that the model can score them as candidates.
single_user = (
    test_data
    .where(test_data["userId"] == 11)
    .select("movieId", "userId")
)

In [46]:
# Preview the candidate movies selected for user 11.
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|     13|    11|
|     16|    11|
|     19|    11|
|     20|    11|
|     23|    11|
|     27|    11|
|     35|    11|
|     36|    11|
|     41|    11|
|     45|    11|
|     48|    11|
|     50|    11|
|     51|    11|
|     66|    11|
|     69|    11|
|     70|    11|
|     71|    11|
|     72|    11|
|     75|    11|
|     78|    11|
+-------+------+


In [47]:
# Predict a rating for each candidate (movieId, userId) pair of the selected user.
recommendation = model.transform(single_user)

In [48]:
# Rank the candidate movies from highest to lowest predicted rating.
recommendation.sort(recommendation['prediction'].desc()).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     50|    11|  2.3682444|
|     23|    11|  2.0582218|
|     51|    11|  1.9993123|
|     19|    11|  1.7393816|
|     13|    11|  1.6138629|
|     66|    11|  1.6098038|
|     79|    11|  1.4421184|
|     36|    11|  1.2603539|
|     45|    11|   1.054325|
|     48|    11|  0.7943057|
|     97|    11|  0.6927949|
|     78|    11| 0.66300637|
|     69|    11|  0.5107675|
|     75|    11| 0.44824302|
|     72|    11| 0.35342517|
|     20|    11| 0.28437197|
|     16|    11| 0.24411146|
|     41|    11| 0.23283258|
|     86|    11|-0.14355831|
|     35|    11|-0.30964977|
+-------+------+-----------+
