In [1]:
from pyspark.sql import SparkSession, functions

In [2]:
spark = SparkSession.builder.enableHiveSupport().appName('Recommendation_System').getOrCreate()
sc = spark.sparkContext

In [3]:
data = spark.read.csv('/user/akarshsahu/data/Netflix_Data/processed_data_final.csv', header=True)

In [4]:
data.show(5)

+---+-------+---+---------+-------+----------+----------+-----+---+-------+---+-------------------+---+-------+-----------+------------------------+--------------+---------------+----+----+----+----+----+----+-------------+----+----+----+----+----+
|  3|1025579|4.0|tt0119448|1997.04|Character5|Character6|movie|122|1997.09|  0|Crime,Drama,Mystery|7.7|10128.0|Jan Decleir|Pavlik Jansen op de Haar|Fedja van Huêt|Betty Schuurman|_c18|_c19|_c20|_c21|_c22|_c23|Mike van Diem|_c25|_c26|_c27|_c28|_c29|
+---+-------+---+---------+-------+----------+----------+-----+---+-------+---+-------------------+---+-------+-----------+------------------------+--------------+---------------+----+----+----+----+----+----+-------------+----+----+----+----+----+
|  3| 712664|5.0|tt0119448| 1997.0| Character| Character|movie|122| 1997.0|  0|Crime,Drama,Mystery|7.7|10128.0|Jan Decleir|    Pavlik Jansen op ...|Fedja van Huêt|Betty Schuurman|null|null|null|null|null|null|Mike van Diem|null|null|null|null|null|
|  3

### Creating ALS model

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [13]:
nd=data.select(data['_c1'],data['_c2'],data['_c3'])
nd.show()

+-------+---+---------+
|    _c1|_c2|      _c3|
+-------+---+---------+
|1025579|4.0|tt0119448|
| 712664|5.0|tt0119448|
|1331154|4.0|tt0119448|
|2632461|3.0|tt0119448|
|  44937|5.0|tt0119448|
| 656399|4.0|tt0119448|
| 439011|1.0|tt0119448|
|1436762|3.0|tt0119448|
|1644750|3.0|tt0119448|
|2031561|4.0|tt0119448|
| 616720|4.0|tt0119448|
|2467008|4.0|tt0119448|
| 975874|5.0|tt0119448|
| 701730|2.0|tt0119448|
|1614320|4.0|tt0119448|
| 115498|3.0|tt0119448|
| 931626|2.0|tt0119448|
| 699878|4.0|tt0119448|
|1694958|3.0|tt0119448|
|  66414|5.0|tt0119448|
+-------+---+---------+
only showing top 20 rows



In [17]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in list(set(nd.columns)-set(['_c2'])) ]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(nd).transform(nd)
transformed.show()

+-------+---+---------+---------+---------+
|    _c1|_c2|      _c3|_c3_index|_c1_index|
+-------+---+---------+---------+---------+
|1025579|4.0|tt0119448|   3529.0| 234003.0|
| 712664|5.0|tt0119448|   3529.0|   1927.0|
|1331154|4.0|tt0119448|   3529.0|  11994.0|
|2632461|3.0|tt0119448|   3529.0| 125373.0|
|  44937|5.0|tt0119448|   3529.0| 138656.0|
| 656399|4.0|tt0119448|   3529.0| 103925.0|
| 439011|1.0|tt0119448|   3529.0|  43941.0|
|1436762|3.0|tt0119448|   3529.0| 159265.0|
|1644750|3.0|tt0119448|   3529.0|   9420.0|
|2031561|4.0|tt0119448|   3529.0|  30787.0|
| 616720|4.0|tt0119448|   3529.0|  90446.0|
|2467008|4.0|tt0119448|   3529.0|  38565.0|
| 975874|5.0|tt0119448|   3529.0| 284124.0|
| 701730|2.0|tt0119448|   3529.0|  18448.0|
|1614320|4.0|tt0119448|   3529.0|  24964.0|
| 115498|3.0|tt0119448|   3529.0|  60588.0|
| 931626|2.0|tt0119448|   3529.0|  13520.0|
| 699878|4.0|tt0119448|   3529.0|  75755.0|
|1694958|3.0|tt0119448|   3529.0|  69137.0|
|  66414|5.0|tt0119448|   3529.0

In [18]:
transformed = transformed.withColumn('_c2', transformed['_c2'].cast('int'))

Train and test data

In [19]:
(training,test)=transformed.randomSplit([0.8, 0.2])

ALS model

In [20]:
als=ALS(maxIter=5,regParam=0.09,rank=25,userCol="_c3_index",itemCol="_c1_index",ratingCol="_c2",coldStartStrategy="drop",nonnegative=True)
model=als.fit(training)

### Generat predictions and evaluate RMSE

In [21]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="_c2",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))
predictions.show()

RMSE=0.8658419359735077
+-------+---+---------+---------+---------+----------+
|    _c1|_c2|      _c3|_c3_index|_c1_index|prediction|
+-------+---+---------+---------+---------+----------+
|1272379|  1|tt0050212|    564.0|    148.0|  1.771366|
|1272379|  3|tt0113326|   1641.0|    148.0|    1.6803|
|1272379|  4|tt0165929|   1130.0|    148.0| 1.8520545|
|1272379|  1|tt0116242|   1934.0|    148.0| 1.1835804|
|1272379|  1|tt0133059|   6579.0|    148.0| 1.2428589|
|1272379|  1|tt0120670|   3672.0|    148.0| 1.5757805|
|1272379|  1|tt0099260|   6073.0|    148.0| 1.0831131|
|1272379|  1|tt0069994|   6814.0|    148.0|0.87752444|
|1272379|  2|tt0230030|   1320.0|    148.0| 1.7446809|
|1272379|  2|tt0077928|   1342.0|    148.0| 1.7112339|
|1272379|  1|tt0031971|   2122.0|    148.0|  1.472496|
|1272379|  2|tt0112819|   2142.0|    148.0| 1.5807616|
|1272379|  1|tt0100196|   3749.0|    148.0| 1.4964224|
|1272379|  1|tt0104627|   4900.0|    148.0| 1.6303498|
|1272379|  4|tt0091042|     67.0|    148.

### Providing Recommendations

In [22]:
user_recs=model.recommendForAllUsers(20).show(10)

+---------+--------------------+
|_c3_index|     recommendations|
+---------+--------------------+
|      471|[[462874, 6.39594...|
|     1591|[[471204, 6.22308...|
|     4101|[[475043, 6.07330...|
|     1645|[[462874, 6.04655...|
|     3175|[[475043, 5.85560...|
|     4935|[[477570, 4.57740...|
|      496|[[466645, 5.70809...|
|     2366|[[471204, 6.12524...|
|     2866|[[465951, 5.31449...|
|     5156|[[460741, 5.52778...|
+---------+--------------------+
only showing top 10 rows

