In [18]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('movielens_ratings.csv')
    .getOrCreate()
)


In [5]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [6]:
# Load the ratings CSV: the header row supplies the column names and
# Spark infers each column's type from the values.
data = spark.read.csv(
    'movielens_ratings.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Peek at the first row to confirm the file parsed into the expected columns.
data.head()

Row(movieId=2, rating=3.0, userId=0)

In [8]:
# Show the inferred column names and types.
data.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [9]:
# Summary statistics (count, mean, stddev, min, max) for every column.
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [12]:
# Hold out roughly 20% of the ratings for evaluation; the fixed seed keeps
# the split repeatable across runs.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [34]:
# Build the recommendation model using ALS on the training data.
#
# coldStartStrategy="drop" discards rows whose user or movie was not seen
# during fit(). By default those rows receive NaN predictions, and a single
# NaN is enough to turn the RMSE computed by RegressionEvaluator into NaN.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
)
model = als.fit(train_data)

In [35]:
# Score the held-out test ratings with the fitted ALS model; the result
# carries an extra `prediction` column alongside the original columns.
predictions = model.transform(test_data)

In [37]:
# Inspect the first 10 predictions next to the actual ratings.
predictions.show(10)

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|      2|   4.0|    28|  2.5897458|
|      0|   1.0|    27|   2.126813|
|      0|   1.0|     6|  1.1123132|
|      1|   1.0|     6|  0.6465726|
|      2|   1.0|     3|  1.2940022|
|      0|   1.0|    20| -0.4973111|
|      1|   1.0|    19|-0.02451837|
|      0|   1.0|    15|   -1.15178|
|      2|   1.0|    23| -1.8205764|
|      2|   4.0|    10|  1.0216205|
+-------+------+------+-----------+
only showing top 10 rows



In [46]:
# Configure an RMSE evaluator that compares the `prediction` column
# against the true `rating` column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)


In [47]:
# RMSE summarizes how far the predicted ratings are from the actual ones;
# lower values indicate a better fit.
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.5848124771517471


In [48]:
# Pick out the held-out (movieId, userId) pairs for user 11 so the model
# can score them for that one user.
single_user = (
    test_data
    .where(test_data.userId == 11)
    .select('movieId', 'userId')
)

In [49]:
# List the movies that will be scored for this user.
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|     11|    11|
|     32|    11|
|     59|    11|
|     62|    11|
|     66|    11|
|     67|    11|
+-------+------+



In [50]:
# Predict this user's rating for each of the selected movies.
# NOTE(review): `reccomendations` is misspelled; left unchanged because the
# next cell refers to it by this name.
reccomendations = model.transform(single_user)

In [51]:
# Rank the candidate movies from highest to lowest predicted rating.
(
    reccomendations
    .orderBy('prediction', ascending=False)
    .show()
)

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|     32|    11|  4.472936|
|     66|    11| 3.2859223|
|     59|    11|  2.859868|
|     62|    11|  2.851833|
|     67|    11| 1.0817742|
|     11|    11|-0.8392538|
+-------+------+----------+

