In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('recommend').getOrCreate()

In [3]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

## Import data

In [7]:
data = spark.read.csv('movielens_ratings.csv', inferSchema=True, header=True)

In [13]:
print('No. of row: %d' % data.count())
data.show(5)

No. of row: 1501
+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
+-------+------+------+
only showing top 5 rows



In [38]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [39]:
train_data, test_data = data.randomSplit([0.8, 0.2])

## Build a model

In [42]:
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')

In [43]:
model = als.fit(train_data)

In [67]:
print('Factorized user matrix with rank = %d' % model.rank)
model.userFactors.show(5)

print('-'*50)

print('Factorized item matrix with rank = %d' % model.rank)
model.itemFactors.show(5)

Factorized user matrix with rank = 10
+---+--------------------+
| id|            features|
+---+--------------------+
|  0|[-0.34714437, 0.0...|
| 10|[-0.80319965, -0....|
| 20|[-0.21793956, 0.3...|
|  1|[0.8587597, -0.43...|
| 11|[-0.25588533, -0....|
+---+--------------------+
only showing top 5 rows

--------------------------------------------------
Factorized item matrix with rank = 10
+---+--------------------+
| id|            features|
+---+--------------------+
|  0|[-1.8353459, 0.18...|
| 10|[0.7837627, 0.791...|
| 20|[1.853389, 0.8566...|
| 30|[-4.281721, -1.20...|
| 40|[-1.2146126, -1.0...|
+---+--------------------+
only showing top 5 rows



In [74]:
print('Recommended top users (e.g. 1 top user) for all items with the corresponding predicted ratings:')
model.recommendForAllItems(1).show(5)

print('-'*50)

print('Recommended top items (e.g. 1 top item) for all users with the corresponding predicted ratings:')
model.recommendForAllUsers(1).show(5)

Recommended top users (e.g. 1 top user) for all items with the corresponding predicted ratings:
+-------+----------------+
|movieId| recommendations|
+-------+----------------+
|     31| [[8,3.0300753]]|
|     85| [[8,4.6582036]]|
|     65|[[26,2.6746302]]|
|     53| [[8,5.1094904]]|
|     78|[[11,1.3186768]]|
+-------+----------------+
only showing top 5 rows

--------------------------------------------------
Recommended top items (e.g. 1 top item) for all users with the corresponding predicted ratings:
+------+----------------+
|userId| recommendations|
+------+----------------+
|    28| [[32,5.294339]]|
|    26|  [[38,6.77293]]|
|    27|[[49,5.0444503]]|
|    12|[[27,5.2244005]]|
|    22|[[75,5.1005692]]|
+------+----------------+
only showing top 5 rows



## Make predictions on test_data

In [75]:
predictions = model.transform(test_data)

In [76]:
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    27|-0.04621993|
|     31|   4.0|    12| -0.9708025|
|     31|   1.0|     5| 0.41849458|
|     31|   1.0|    29| 0.64804256|
|     85|   5.0|    16|   -0.62506|
|     85|   1.0|     4|  1.2008178|
|     85|   1.0|    23|  1.0882655|
|     85|   4.0|     7|  2.3168426|
|     65|   5.0|    23| 0.72033644|
|     65|   1.0|     2|  0.3198831|
|     53|   1.0|    23|-0.43821186|
|     53|   5.0|    21|  3.3953247|
|     78|   1.0|     1|  1.1885641|
|     78|   1.0|    20|  1.1257566|
|     78|   1.0|    17|  1.2175034|
|     78|   1.0|    24|  0.6432372|
|     34|   1.0|    16|  2.0533729|
|     34|   1.0|    17|   1.565777|
|     34|   1.0|     4|  1.2765543|
|     34|   4.0|     2|-0.10228021|
+-------+------+------+-----------+
only showing top 20 rows



## Evaluate the predictions

In [79]:
# check the root mean squared error
evaluator = RegressionEvaluator(metricName='rmse', predictionCol='prediction', labelCol='rating')

In [80]:
rmse = evaluator.evaluate(predictions)

In [83]:
print('Root mean squared error of the test_data: %.4f' % rmse)

Root mean squared error of the test_data: 1.8931


## Note that the rating system is from 1 to 5, rmse of 1.89 is not good
## Generally a large dataset is required to build a decent recommender system

## Previously we make a prediction on a group of users (several users in test_data), let's see how to make predictions on single user (with some records e.g. rated some movies in the train_data)

In [89]:
# see historical rating of the user
user_history = train_data.filter(train_data['userId']==11)
user_history.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      0|   1.0|    11|
|      6|   2.0|    11|
|      9|   1.0|    11|
|     10|   1.0|    11|
|     11|   1.0|    11|
|     13|   4.0|    11|
|     18|   5.0|    11|
|     19|   4.0|    11|
|     20|   1.0|    11|
|     21|   1.0|    11|
|     22|   1.0|    11|
|     23|   5.0|    11|
|     25|   1.0|    11|
|     27|   5.0|    11|
|     32|   5.0|    11|
|     35|   3.0|    11|
|     36|   2.0|    11|
|     37|   2.0|    11|
|     38|   4.0|    11|
|     39|   1.0|    11|
+-------+------+------+
only showing top 20 rows



In [91]:
# a list of movies we are thinking to offer
user_suggest = test_data.filter(train_data['userId']==11).select(['movieId', 'userId'])
user_suggest.show()

+-------+------+
|movieId|userId|
+-------+------+
|     12|    11|
|     16|    11|
|     30|    11|
|     48|    11|
|     61|    11|
|     70|    11|
|     75|    11|
|     77|    11|
|     79|    11|
|     81|    11|
|     89|    11|
+-------+------+



In [98]:
# offer movies with a high predicted rating
user_offer = model.transform(user_suggest)
user_offer.orderBy('prediction', ascending=False).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     89|    11|   4.517553|
|     77|    11|   3.707769|
|     48|    11|  3.5568771|
|     12|    11|   3.282252|
|     81|    11|  3.0455773|
|     70|    11|   3.040622|
|     79|    11|  2.4697256|
|     75|    11|  0.9392534|
|     61|    11|  0.5906359|
|     30|    11| 0.26028907|
|     16|    11|-0.15461397|
+-------+------+-----------+

