In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is not tied to one machine's directory layout; fall back to the
# original hard-coded location when the variable is not set.
SPARK_HOME = os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
findspark.init(SPARK_HOME)

# These imports are kept after findspark.init(), which is what makes the
# pyspark package from the chosen Spark installation importable.
import pyspark
from pyspark.sql import SparkSession

# Create the session for this notebook, or reuse one that is already running.
spark = SparkSession.builder.appName('recom').getOrCreate()

In [2]:
from pyspark.ml.recommendation import ALS

In [3]:
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
# Read the ratings CSV: the first row supplies the column names and the column
# types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('movielens_ratings.csv')
)

In [5]:
# Preview the first 20 rows of the ratings table.
data.show(n=20)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [6]:
# Summary statistics (count, mean, stddev, min, max) for the ratings columns.
summary_stats = data.describe()
summary_stats.show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [7]:
# Hold out roughly 20% of the ratings for evaluation. The seed is fixed so that
# the same rows land in each split on every run; without it the split (and
# every metric computed from it) changes each time the notebook is executed.
training, test = data.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Configure the ALS recommender on the (userId, movieId, rating) columns.
# An explicit seed is passed so the random initialisation used by ALS is pinned
# rather than left to the library default; the hyper-parameters are unchanged.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    seed=42,
)

In [9]:
# Train the ALS model on the training split only.
als_model = als.fit(dataset=training)

In [10]:
# Score the held-out ratings; the model adds a 'prediction' column.
pred_test = als_model.transform(dataset=test)

In [11]:
# Compare actual ratings with predicted ratings for the first 20 test rows.
pred_test.show(n=20)

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    13|-0.22576779|
|     31|   1.0|     5| -0.5225444|
|     31|   1.0|     4| 0.18168804|
|     31|   3.0|    14| 0.25251493|
|     85|   1.0|    13|-0.80420864|
|     85|   2.0|    20|-0.44871017|
|     85|   1.0|     5| -3.4787598|
|     85|   1.0|    25| -0.8043405|
|     53|   3.0|    13|-0.25549573|
|     53|   1.0|    25| 0.40353042|
|     78|   1.0|    28|  1.3839146|
|     78|   1.0|    17|  1.0791323|
|     78|   1.0|     4| 0.81540424|
|     78|   1.0|    11|  0.7843313|
|     34|   1.0|     4|  2.9641347|
|     34|   1.0|     0|  0.2660641|
|     81|   1.0|    22|  2.9559643|
|     81|   1.0|     1|   2.780429|
|     81|   2.0|     5|  4.1439023|
|     81|   1.0|    19|  1.7642806|
+-------+------+------+-----------+
only showing top 20 rows



In [12]:
# RMSE evaluator comparing the 'prediction' column against the true 'rating'.
elav = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)

In [13]:
# RMSE on the held-out ratings.
# With Spark 2.1, ALS produces NaN predictions for users or movies that did not
# appear in the training split, and a single NaN makes the RMSE NaN. Drop those
# rows before evaluating so the metric is always computed on scored rows only.
elav.evaluate(pred_test.na.drop(subset=['prediction']))

1.7082580600897657

## Predicting for a single user

In [14]:
# Take user 11's rows from the test split, keeping only the two columns the
# model needs as input.
single_user = (
    test
    .where(test['userId'] == 11)
    .select('movieId', 'userId')
)

In [15]:
# List the (movieId, userId) pairs that will be scored for this user.
single_user.show(n=20)

+-------+------+
|movieId|userId|
+-------+------+
|      0|    11|
|     51|    11|
|     61|    11|
|     72|    11|
|     78|    11|
|     86|    11|
|     99|    11|
+-------+------+



In [16]:
# Predict this user's rating for each of the selected movies.
recomendation = als_model.transform(dataset=single_user)

In [17]:
# Show the movies ranked from highest to lowest predicted rating.
recomendation.sort(recomendation['prediction'].desc()).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     72|    11|  1.8268257|
|     99|    11|  1.3539603|
|     78|    11|  0.7843313|
|     51|    11| 0.19157623|
|      0|    11|-0.28648537|
|     86|    11| -0.3908227|
|     61|    11| -1.1996689|
+-------+------+-----------+

