In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start one named 'ALS'.
spark = (
    SparkSession.builder
    .appName('ALS')
    .getOrCreate()
)

In [2]:
from pyspark.sql import Row

def parse_rating_line(line):
    """Turn one comma-separated line of the ratings file into a Row.

    The first three fields are read as user id (int), movie id (int) and
    rating (float); any further fields on the line are ignored.
    """
    fields = line.split(',')
    return Row(user_id=int(fields[0]),
               movie_id=int(fields[1]),
               rating=float(fields[2]))


# Read the raw text file and parse every line into a typed Row.
raw_lines = spark.sparkContext.textFile('../data/ratings.csv')
ratings_rdd = raw_lines.map(parse_rating_line)

# Build a DataFrame from the parsed rows and preview the first rows.
ratings_df = spark.createDataFrame(ratings_rdd)
ratings_df.show()

+--------+------+-------+
|movie_id|rating|user_id|
+--------+------+-------+
|      31|   2.5|      1|
|    1029|   3.0|      1|
|    1061|   3.0|      1|
|    1129|   2.0|      1|
|    1172|   4.0|      1|
|    1263|   2.0|      1|
|    1287|   2.0|      1|
|    1293|   2.0|      1|
|    1339|   3.5|      1|
|    1343|   2.0|      1|
|    1371|   2.5|      1|
|    1405|   1.0|      1|
|    1953|   4.0|      1|
|    2105|   4.0|      1|
|    2150|   3.0|      1|
|    2193|   2.0|      1|
|    2294|   2.0|      1|
|    2455|   2.5|      1|
|    2968|   1.0|      1|
|    3671|   3.0|      1|
+--------+------+-------+
only showing top 20 rows



In [5]:
from pyspark.ml.recommendation import ALS

# Fixed seed shared by the train/test split and the ALS initialisation so that
# a Restart & Run All reproduces the same split, model and metrics.
# Without a seed both steps pick a fresh random one on every run.
SEED = 42

# 80% of the ratings are used for fitting, 20% are held out for evaluation.
(training, test) = ratings_df.randomSplit([0.8, 0.2], seed=SEED)

# ALS model with rank=8, maxIter=10 and nonnegative=True
als = ALS(rank=8, maxIter=10, nonnegative=True, seed=SEED,
          userCol='user_id', itemCol='movie_id', ratingCol='rating')

# Fit the factorisation on the training split only.
als_model = als.fit(training)

In [12]:
# Apply the fitted model to the training split itself; this adds a
# `prediction` column next to the known ratings.
prediction = als_model.transform(dataset=training)

In [13]:
# Preview the first 20 rows of ratings alongside their predictions.
prediction.show(n=20)

+--------+------+-------+----------+
|movie_id|rating|user_id|prediction|
+--------+------+-------+----------+
|     148|   4.0|    575|  3.885124|
|     463|   4.0|    232| 3.8574328|
|     463|   2.0|    452| 2.3836129|
|     463|   3.0|    380| 3.0937757|
|     463|   4.0|    242| 3.7774878|
|     463|   4.0|     30| 3.5408144|
|     471|   3.0|     85| 3.0263326|
|     471|   5.0|    126| 3.9464946|
|     471|   3.0|    350| 3.5577257|
|     471|   3.0|    602|  3.979167|
|     471|   5.0|    285|  4.015639|
|     471|   5.0|    274| 3.8304222|
|     471|   3.0|    440| 3.0133495|
|     471|   4.0|     86| 4.2924595|
|     471|   3.0|    306| 3.2781227|
|     471|   3.0|     19| 4.0349836|
|     471|   4.0|     92|   3.93749|
|     471|   4.5|    299|  4.564701|
|     471|   4.0|    309| 4.2943807|
|     471|   4.0|    607| 3.4679222|
+--------+------+-------+----------+
only showing top 20 rows



In [18]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

# Make predictions for the held-out test data.
prediction = als_model.transform(dataset=test)

# RMSE between the true `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(predictionCol='prediction',
                                labelCol='rating',
                                metricName='rmse')

# Records with isnan('prediction') == True are removed before evaluating,
# so they do not take part in the RMSE computation.
valid_prediction = prediction.filter(~isnan('prediction'))
rmse = evaluator.evaluate(valid_prediction)
print(rmse)

0.917013558202422


In [17]:
# Inspect the rows for which the model produced a NaN prediction.
prediction.where(isnan('prediction')).show()

+--------+------+-------+----------+
|movie_id|rating|user_id|prediction|
+--------+------+-------+----------+
|    4101|   4.5|    580|       NaN|
|   32460|   5.0|    298|       NaN|
|  141422|   2.5|    624|       NaN|
|   51927|   3.0|    584|       NaN|
|    2776|   3.0|    436|       NaN|
|    6559|   4.0|     73|       NaN|
|   34435|   4.5|    572|       NaN|
|   61250|   5.0|    599|       NaN|
|     251|   3.0|    516|       NaN|
|    2923|   4.0|    597|       NaN|
|    4000|   3.0|    444|       NaN|
|   36533|   2.5|    384|       NaN|
|      53|   5.0|    344|       NaN|
|    3790|   4.0|    407|       NaN|
|    4092|   2.5|     77|       NaN|
|  134158|   1.5|    624|       NaN|
|   26350|   4.5|    547|       NaN|
|    1133|   3.0|    318|       NaN|
|    2625|   4.0|    346|       NaN|
|    4725|   4.5|     17|       NaN|
+--------+------+-------+----------+
only showing top 20 rows

