# Big Data Platforms

## Movie Recommender System - Collaborative Filtering


In [2]:
# Spark entry point plus the ML building blocks used by the recommender.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

# getOrCreate() hands back the running session if the kernel already has one.
spark = (
    SparkSession.builder
    .appName('MovieRecommender')
    .getOrCreate()
)

In [3]:
from pyspark.sql import HiveContext

# Build the Hive context from the SparkContext owned by the `spark` session
# created earlier in this notebook. The previous version relied on an implicit
# `sc` global, which is only present when the kernel is started through the
# pyspark shell and is never defined by the notebook itself.
hive_context = HiveContext(spark.sparkContext)

In [4]:
# Load the preprocessed Hive table (ratings joined with title metadata) as a DataFrame.
df = hive_context.table("akarshsahu.processed_dataset_v3")

In [5]:
# List the available columns to see which fields can feed the recommender.
df.columns

['Movie_Id',
 'Cust_Id',
 'Rating',
 'tconst',
 'Year',
 'Name',
 'primaryTitle',
 'titleType',
 'runtimeMinutes',
 'startYear',
 'isAdult',
 'genres',
 'averageRating',
 'numVotes',
 'Actor_0',
 'Actor_1',
 'Actor_2',
 'Actor_3',
 'Actor_4',
 'Actor_5',
 'Actor_6',
 'Actor_7',
 'Actor_8',
 'Actor_9',
 'Director_0',
 'Director_1',
 'Director_2',
 'Director_3',
 'Director_4',
 'Director_5',
 'genres_split',
 'Act_Cri_Thr_Myst_Hor',
 'Ad_Fan_Sci_Ani',
 'Bio_War_Hist_Doc',
 'Mus_Musi_Fam',
 'Noir_West',
 'Ad_Rom',
 'Drama_n',
 'Comedy_n',
 'TopActor',
 'TopDirector']

In [None]:
# Movie id -> title lookup: one row per distinct (Movie_Id, Name) pair.
# The original statement stopped at a dangling `.grou` attribute, which raises
# AttributeError when the cell is executed.
# NOTE(review): the intended aggregation is not visible; distinct() is assumed
# to be what the grouping was meant to achieve — confirm.
movie_title = df.select(['Movie_Id', 'Name']).distinct()

In [38]:
# TODO: add cluster column here
# Keep the user id, item id and rating used by ALS, plus the title for display.
ratings = df.select('Cust_Id', 'Movie_Id', 'Name', 'Rating')
ratings.show()

+-------+--------+---------+------+
|Cust_Id|Movie_Id|     Name|Rating|
+-------+--------+---------+------+
| 770129|    7706|Silverado|   4.0|
|1931185|    7706|Silverado|   5.0|
| 250166|    7706|Silverado|   5.0|
| 328654|    7706|Silverado|   3.0|
|2414873|    7706|Silverado|   4.0|
|1239283|    7706|Silverado|   2.0|
|1057021|    7706|Silverado|   4.0|
|2633985|    7706|Silverado|   3.0|
|2419562|    7706|Silverado|   5.0|
|1047788|    7706|Silverado|   2.0|
| 701678|    7706|Silverado|   5.0|
|1474480|    7706|Silverado|   3.0|
| 415540|    7706|Silverado|   3.0|
| 396505|    7706|Silverado|   1.0|
|2371038|    7706|Silverado|   1.0|
| 372730|    7706|Silverado|   3.0|
|  32668|    7706|Silverado|   4.0|
|2140045|    7706|Silverado|   4.0|
| 933207|    7706|Silverado|   4.0|
|2362542|    7706|Silverado|   4.0|
+-------+--------+---------+------+
only showing top 20 rows



In [39]:
# Summary statistics (count, mean, stddev, min, max) for each selected column.
# NOTE(review): 'Name' holds titles, so its mean/stddev rows (Infinity / NaN in
# the recorded output) carry no meaning; only count/min/max are informative there.
ratings.describe().show()

+-------+------------------+-----------------+---------------+------------------+
|summary|           Cust_Id|         Movie_Id|           Name|            Rating|
+-------+------------------+-----------------+---------------+------------------+
|  count|          79000155|         79000155|       79000155|          79000155|
|   mean|1322168.4876444482|9120.794157517792|       Infinity| 3.573533608889755|
| stddev| 764548.8841645754|5148.096802537221|            NaN|1.0637526570249234|
|    min|                 6|                3|'Round Midnight|               1.0|
|    max|           2649429|            17770|           Zulu|               5.0|
+-------+------------------+-----------------+---------------+------------------+



In [9]:
# Split dataset into train (80%) and test (20%).
# A fixed seed makes the split reproducible, so the RMSE and the per-user
# examples further down do not change every time the notebook is re-run.
train_data, test_data = ratings.randomSplit([0.8, 0.2], seed=42)

In [10]:
# Configure ALS collaborative filtering and fit it on the training split.
als = ALS(
    userCol='Cust_Id',
    itemCol='Movie_Id',
    ratingCol='Rating',
    rank=8,
    maxIter=10,
    regParam=0.1,
    nonnegative=True,
    coldStartStrategy="drop",
)
model = als.fit(train_data)

In [19]:
# Save model
# Go through the writer with overwrite() so this cell can be re-run: a plain
# model.save(path) raises once the target directory already exists.
model.write().overwrite().save("/user/akarshsahu/data/Netflix_Data/spark_als_model")

In [24]:
# Preview the learned latent factors: first the user matrix, then the item matrix.
user_factors = model.userFactors
print('Factorized user matrix with rank = %d' % model.rank)
user_factors.show(5)

print('-' * 50)

item_factors = model.itemFactors
print('Factorized item matrix with rank = %d' % model.rank)
item_factors.show(5)

Factorized user matrix with rank = 8
+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[1.1271898, 0.191...|
|130|[1.8679975, 0.305...|
|250|[0.7332476, 0.0, ...|
|310|[1.2313715, 0.767...|
|330|[1.4153334, 0.164...|
+---+--------------------+
only showing top 5 rows

--------------------------------------------------
Factorized item matrix with rank = 8
+---+--------------------+
| id|            features|
+---+--------------------+
| 20|[1.3715138, 0.759...|
| 30|[1.3613921, 0.287...|
| 50|[1.2634494, 0.448...|
| 80|[1.3023789, 0.434...|
| 90|[1.4615467, 0.334...|
+---+--------------------+
only showing top 5 rows



In [25]:
# Top-1 recommendation in both directions: best user per item, then best item per user.
top_user_per_item = model.recommendForAllItems(1)
print('Recommended top users (e.g. 1 top user) for all items with the corresponding predicted ratings:')
top_user_per_item.show(5)

print('-' * 50)

top_item_per_user = model.recommendForAllUsers(1)
print('Recommended top items (e.g. 1 top item) for all users with the corresponding predicted ratings:')
top_item_per_user.show(5)

Recommended top users (e.g. 1 top user) for all items with the corresponding predicted ratings:
+--------+--------------------+
|Movie_Id|     recommendations|
+--------+--------------------+
|    2122|[[1058808, 7.2608...|
|    7982|[[1058808, 4.8454...|
|    8592|[[1058808, 5.1052...|
|    9852|[[1058808, 5.3239...|
|   14832|[[1058808, 6.2738...|
+--------+--------------------+
only showing top 5 rows

--------------------------------------------------
Recommended top items (e.g. 1 top item) for all users with the corresponding predicted ratings:
+-------+--------------------+
|Cust_Id|     recommendations|
+-------+--------------------+
|    471| [[12293, 4.247688]]|
|   1088|[[12293, 4.5709414]]|
|   2122|[[12293, 4.8451962]]|
|   2142|[[14103, 4.5778913]]|
|   2659| [[12293, 4.880624]]|
+-------+--------------------+
only showing top 5 rows



In [26]:
# Make predictions
# Score the held-out test split to see how the model performs; transform()
# appends a 'prediction' column to the columns already present in test_data.
predictions = model.transform(test_data)
predictions.show()

+--------+-------+------+---------+----+--------------+--------------+---------+--------------+---------+-------+-------------+-------------+--------+------------+---------------+------------+------------+-------+-------+-------+-------+-------+-------+------------+----------+----------+----------+----------+----------+----------------+--------------------+--------------+----------------+------------+---------+------+-------+--------+--------+-----------+----------+
|Movie_Id|Cust_Id|Rating|   tconst|Year|          Name|  primaryTitle|titleType|runtimeMinutes|startYear|isAdult|       genres|averageRating|numVotes|     Actor_0|        Actor_1|     Actor_2|     Actor_3|Actor_4|Actor_5|Actor_6|Actor_7|Actor_8|Actor_9|  Director_0|Director_1|Director_2|Director_3|Director_4|Director_5|    genres_split|Act_Cri_Thr_Myst_Hor|Ad_Fan_Sci_Ani|Bio_War_Hist_Doc|Mus_Musi_Fam|Noir_West|Ad_Rom|Drama_n|Comedy_n|TopActor|TopDirector|prediction|
+--------+-------+------+---------+----+--------------+---

In [28]:
# Model evaluation

# Compare predicted and actual ratings on the test split using RMSE.
evaluator = RegressionEvaluator(
    labelCol='Rating',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)
print('Root mean squared error of the test_data: %.4f' % rmse)

Root mean squared error of the test_data: 0.8654


In [34]:
# Ratings that customer 770129 contributed to the training split.
user_history = (
    train_data
    .where(train_data['Cust_Id'] == 770129)
    .select('Cust_Id', 'Movie_Id', 'Name', 'Rating')
)
user_history.show()

+-------+--------+--------------------+------+
|Cust_Id|Movie_Id|                Name|Rating|
+-------+--------+--------------------+------+
| 770129|     571|     American Beauty|   5.0|
| 770129|    4640|            Rain Man|   5.0|
| 770129|    7706|           Silverado|   4.0|
| 770129|    8840|      A Time to Kill|   5.0|
| 770129|    8904|   Good Will Hunting|   5.0|
| 770129|    9121|              Apache|   5.0|
| 770129|    9702|              Tarzan|   4.0|
| 770129|    9788|                Antz|   5.0|
| 770129|   12309|         The Postman|   5.0|
| 770129|   12546|          Home Alone|   3.0|
| 770129|   13082|           Chinatown|   5.0|
| 770129|   14815|       Hang 'Em High|   4.0|
| 770129|   14909|      City of Angels|   5.0|
| 770129|   16552|           GoldenEye|   5.0|
| 770129|   16668|      A Few Good Men|   5.0|
| 770129|    2456|A Fistful of Dollars|   4.0|
| 770129|    4100|            Dinosaur|   4.0|
| 770129|    4123|         Patch Adams|   4.0|
| 770129|    

In [35]:
# A list of movies we are thinking of offering: the customer's rows in the test split.
# The predicate must use test_data's own column. The original built it from
# train_data['Cust_Id'], i.e. a column reference belonging to a different
# DataFrame than the one being filtered.
user_suggest = test_data.filter(test_data['Cust_Id'] == 770129).select(['Movie_Id', 'Name', 'Cust_Id'])
user_suggest.show()

+--------+--------------------+-------+
|Movie_Id|                Name|Cust_Id|
+--------+--------------------+-------+
|    3684|          Goldfinger| 770129|
|    6692|          Entrapment| 770129|
|   10886|          Waterworld| 770129|
|   16879|             Titanic| 770129|
|    6972|          Armageddon| 770129|
|    8357|The Bridges of Ma...| 770129|
|   14358|Monty Python and ...| 770129|
|   15922|        The Graduate| 770129|
+--------+--------------------+-------+



In [36]:
# Score the candidate movies for this customer and list them best-first.
user_offer = model.transform(user_suggest)
user_offer.sort(user_offer['prediction'].desc()).show()

+--------+--------------------+-------+----------+
|Movie_Id|                Name|Cust_Id|prediction|
+--------+--------------------+-------+----------+
|   16879|             Titanic| 770129| 4.3558435|
|   14358|Monty Python and ...| 770129| 4.3242383|
|    3684|          Goldfinger| 770129|  4.267573|
|    6972|          Armageddon| 770129|  4.124872|
|   15922|        The Graduate| 770129|  4.101463|
|    8357|The Bridges of Ma...| 770129| 3.9335575|
|    6692|          Entrapment| 770129| 3.9262018|
|   10886|          Waterworld| 770129| 3.2920074|
+--------+--------------------+-------+----------+



In [37]:
## RECOMMENDATIONS FOR ALL USERS - TOP 5

# Generate top 5 movie recommendations for each user
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# no result was captured. show(10) still has to score users against the item
# catalogue before it can print anything — presumably too slow at this data
# size; consider restricting to a subset of users. TODO confirm.
userRecs = model.recommendForAllUsers(5)
userRecs.show(10)

KeyboardInterrupt: 