In [1]:
# import the required libraries
import time  
import pyspark  
from pyspark.sql import SparkSession  
# Create the Spark session used by every cell below; getOrCreate() hands back
# an already-active session if one exists instead of building a new one.
spark = (
    SparkSession.builder
    .appName("recommendation")
    .config("spark.driver.memory", "24g")
    .config("spark.executor.memory", "38g")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/18 12:52:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/18 12:52:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# read each raw CSV file into a Spark DataFrame, taking column names from the header row
movies = spark.read.csv("../data/raw/movies.csv", header=True)
ratings = spark.read.csv("../data/raw/ratings.csv", header=True)
links = spark.read.csv("../data/raw/links.csv", header=True)
tags = spark.read.csv("../data/raw/tags.csv", header=True)
# preview the first rows of the ratings data
ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|     17|   4.0|944249077|
|     1|     25|   1.0|944250228|
|     1|     29|   2.0|943230976|
|     1|     30|   5.0|944249077|
|     1|     32|   5.0|943228858|
|     1|     34|   2.0|943228491|
|     1|     36|   1.0|944249008|
|     1|     80|   5.0|944248943|
|     1|    110|   3.0|943231119|
|     1|    111|   5.0|944249008|
|     1|    161|   1.0|943231162|
|     1|    166|   5.0|943228442|
|     1|    176|   4.0|944079496|
|     1|    223|   3.0|944082810|
|     1|    232|   5.0|943228442|
|     1|    260|   5.0|943228696|
|     1|    302|   4.0|944253272|
|     1|    306|   5.0|944248888|
|     1|    307|   5.0|944253207|
|     1|    322|   4.0|944053801|
+------+-------+------+---------+
only showing top 20 rows


In [3]:
# keep only the three columns used from here on, then check their types
ratings = ratings.select(["userId", "movieId", "rating"])
ratings.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)



In [4]:
# cast the two id columns to int and the rating column to float
df = (
    ratings
    .withColumn("userId", ratings.userId.cast("int"))
    .withColumn("movieId", ratings.movieId.cast("int"))
    .withColumn("rating", ratings.rating.cast("float"))
)
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [5]:
# randomly partition the ratings 60/20/20 into train / validation / test with a fixed seed
train, validation, test = df.randomSplit([0.6, 0.2, 0.2], seed=0)
# count the three splits in order: train, validation, test
set_sizes = [part.count() for part in (train, validation, test)]
print("The number of ratings in each set: {}, {}, {}".format(*set_sizes))



The number of ratings in each set: 19195595, 6403105, 6401504


                                                                                

In [6]:
from pyspark.sql.functions import col, sqrt
def RMSE(predictions):
    """Return the root-mean-square error between `rating` and `prediction`.

    Parameters
    ----------
    predictions : DataFrame
        Spark DataFrame with a `rating` column and a `prediction` column
        (for example the result of `model.transform(...)` on rated rows).

    Returns
    -------
    float
        Square root of the mean of (rating - prediction)^2.

    Raises
    ------
    ValueError
        If the mean squared error comes back as NULL, which happens when
        there is no row to average over (e.g. every row was dropped).
    """
    # Use the Column ** operator so the squaring is explicitly a Spark
    # expression rather than depending on Python's built-in pow() dispatch.
    squared_diff = predictions.withColumn(
        "squared_diff", (col("rating") - col("prediction")) ** 2
    )
    mse = squared_diff.selectExpr("mean(squared_diff) as mse").first().mse
    if mse is None:
        # Without this guard the next line fails with an opaque
        # "unsupported operand type(s) for **: 'NoneType' and 'float'".
        raise ValueError(
            "cannot compute RMSE: no rows with both 'rating' and 'prediction'"
        )
    return mse ** 0.5

In [7]:
# implement the model using ALS algorithm and find the right hyperparameters using Grid Search
from pyspark.ml.recommendation import ALS

def GridSearch(train, valid, num_iterations, reg_param, n_factors, seed=0):
    """Fit one ALS model per (rank, regularization) pair and return the best.

    Every combination of `n_factors` x `reg_param` is trained on `train` and
    scored on `valid` with RMSE; the model with the lowest validation RMSE is
    kept. Progress and the final summary are printed.

    Parameters
    ----------
    train : DataFrame
        Training ratings with `userId`, `movieId` and `rating` columns.
    valid : DataFrame
        Validation ratings with the same columns.
    num_iterations : int
        Passed to ALS as `maxIter`.
    reg_param : iterable of float
        Regularization values to try (ALS `regParam`).
    n_factors : iterable of int
        Numbers of latent factors to try (ALS `rank`).
    seed : int, optional
        Random seed passed to ALS; defaults to 0, the value previously
        hard-coded.

    Returns
    -------
    The fitted model with the lowest validation RMSE.

    Raises
    ------
    ValueError
        If no model could be selected (empty grid, or no candidate produced
        an RMSE below infinity).
    """
    min_rmse = float('inf')
    best_n = -1
    best_reg = 0
    best_model = None
    # run Grid Search for all the parameter defined in the range in a loop
    for n in n_factors:
        for reg in reg_param:
            als = ALS(rank=n,
                      maxIter=num_iterations,
                      seed=seed,
                      regParam=reg,
                      userCol="userId",
                      itemCol="movieId",
                      ratingCol="rating",
                      # drop rows whose user/movie was unseen in training so
                      # NaN predictions do not poison the RMSE
                      coldStartStrategy="drop")
            model = als.fit(train)
            predictions = model.transform(valid)
            rmse = RMSE(predictions)
            print('{} latent factors and regularization = {}: validation RMSE is {}'.format(n, reg, rmse))
            # track the best model using RMSE
            if rmse < min_rmse:
                min_rmse = rmse
                best_n = n
                best_reg = reg
                best_model = model

    if best_model is None:
        # Previously this fell through to best_model.transform(...) and died
        # with an AttributeError on None.
        raise ValueError(
            "GridSearch selected no model: the parameter grid is empty or "
            "no candidate produced a finite validation RMSE"
        )

    pred = best_model.transform(train)
    train_rmse = RMSE(pred)
    # best model and its metrics
    print('\nThe best model has {} latent factors and regularization = {}:'.format(best_n, best_reg))
    print('training RMSE is {}; validation RMSE is {}'.format(train_rmse, min_rmse))
    return best_model

In [8]:
# build the model using different ranges for Grid Search
from pyspark.sql.functions import col, sqrt
# hyperparameter grid explored by the search
num_iterations = 10
ranks = [6, 8, 10, 12]
reg_params = [0.05, 0.1, 0.2, 0.4, 0.8]

# time the whole search with a monotonic clock, so the elapsed figure cannot
# be skewed by system clock adjustments during the (long) run
start_time = time.perf_counter()
final_model = GridSearch(train, validation, num_iterations, reg_params, ranks)
print('Total Runtime: {:.2f} seconds'.format(time.perf_counter() - start_time))

                                                                                

6 latent factors and regularization = 0.05: validation RMSE is 0.8040265470547072


                                                                                

6 latent factors and regularization = 0.1: validation RMSE is 0.8094152985472945


                                                                                

6 latent factors and regularization = 0.2: validation RMSE is 0.8543615543305509


                                                                                

6 latent factors and regularization = 0.4: validation RMSE is 0.9499406319977305


                                                                                

6 latent factors and regularization = 0.8: validation RMSE is 1.1753617359519688


                                                                                

8 latent factors and regularization = 0.05: validation RMSE is 0.79823323001624


                                                                                

8 latent factors and regularization = 0.1: validation RMSE is 0.8059870711087452


                                                                                

8 latent factors and regularization = 0.2: validation RMSE is 0.8538247514175864


                                                                                

8 latent factors and regularization = 0.4: validation RMSE is 0.9489515159238097


                                                                                

8 latent factors and regularization = 0.8: validation RMSE is 1.1753708059067451


                                                                                

10 latent factors and regularization = 0.05: validation RMSE is 0.794012040146468


                                                                                

10 latent factors and regularization = 0.1: validation RMSE is 0.8039336298180445


                                                                                

10 latent factors and regularization = 0.2: validation RMSE is 0.8540944166536334


                                                                                

10 latent factors and regularization = 0.4: validation RMSE is 0.9484889455872131


                                                                                

10 latent factors and regularization = 0.8: validation RMSE is 1.1753810324018505


                                                                                

12 latent factors and regularization = 0.05: validation RMSE is 0.7896918639834328


                                                                                

12 latent factors and regularization = 0.1: validation RMSE is 0.8018040624412729


                                                                                

12 latent factors and regularization = 0.2: validation RMSE is 0.8532478843569731


                                                                                

12 latent factors and regularization = 0.4: validation RMSE is 0.9491584885520659


                                                                                

12 latent factors and regularization = 0.8: validation RMSE is 1.1753641726235446





The best model has 12 latent factors and regularization = 0.05:
traning RMSE is 0.7099878658149557; validation RMSE is 0.7896918639834328
Total Runtime: 851.18 seconds


                                                                                

In [9]:
# score the held-out test set with the selected model and report its RMSE
pred_test = final_model.transform(test)
test_rmse = RMSE(pred_test)
print('The testing RMSE is ' + str(test_rmse))



The testing RMSE is 0.7896519280173803


                                                                                

In [11]:
# take the (movieId, userId) pairs that user 12 has in the test set
single_user = test.where(test.userId == 12).select('movieId', 'userId')
single_user.show()



+-------+------+
|movieId|userId|
+-------+------+
|     31|    12|
|    724|    12|
|   1022|    12|
|   1704|    12|
|   2858|    12|
|   5620|    12|
+-------+------+



                                                                                

In [12]:
# attach title and genres by matching each row to the movies table on movieId
same_movie = single_user.movieId == movies.movieId
single_user.join(movies, on=same_movie, how='inner').show()



+-------+------+-------+--------------------+--------------------+
|movieId|userId|movieId|               title|              genres|
+-------+------+-------+--------------------+--------------------+
|     31|    12|     31|Dangerous Minds (...|               Drama|
|    724|    12|    724|   Craft, The (1996)|Drama|Fantasy|Hor...|
|   1022|    12|   1022|   Cinderella (1950)|Animation|Childre...|
|   1704|    12|   1704|Good Will Hunting...|       Drama|Romance|
|   2858|    12|   2858|American Beauty (...|       Drama|Romance|
|   5620|    12|   5620|Sweet Home Alabam...|      Comedy|Romance|
+-------+------+-------+--------------------+--------------------+



                                                                                

In [13]:
# score this user's test-set movies with the model, highest predicted rating first
reccomendations = final_model.transform(single_user)
reccomendations.sort('prediction', ascending=False).show()



+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   5620|    12| 3.6705217|
|     31|    12| 3.4757984|
|   1022|    12|   3.20873|
|    724|    12| 3.0409725|
|   1704|    12| 3.0052028|
|   2858|    12| 2.5917907|
+-------+------+----------+



                                                                                

In [14]:
# attach title and genres to the predicted ratings by matching on movieId
same_movie = reccomendations.movieId == movies.movieId
reccomendations.join(movies, on=same_movie, how='inner').show()



+-------+------+----------+-------+--------------------+--------------------+
|movieId|userId|prediction|movieId|               title|              genres|
+-------+------+----------+-------+--------------------+--------------------+
|     31|    12| 3.4757984|     31|Dangerous Minds (...|               Drama|
|    724|    12| 3.0409725|    724|   Craft, The (1996)|Drama|Fantasy|Hor...|
|   1022|    12|   3.20873|   1022|   Cinderella (1950)|Animation|Childre...|
|   1704|    12| 3.0052028|   1704|Good Will Hunting...|       Drama|Romance|
|   2858|    12| 2.5917907|   2858|American Beauty (...|       Drama|Romance|
|   5620|    12| 3.6705217|   5620|Sweet Home Alabam...|      Comedy|Romance|
+-------+------+----------+-------+--------------------+--------------------+



                                                                                

In [15]:
from pyspark.sql.functions import col, lit

# select a single user from the test set
user_id = 12
single_user_ratings = test.filter(test['userId'] == user_id).select(['movieId', 'userId', 'rating'])

# display the movies this user rated in the test set, with the rating given
print("Movies liked by user with ID", user_id)
single_user_ratings.join(movies, 'movieId').select('movieId', 'title', 'rating').show()

# build the candidate list: every rated movie minus the ones this user has
# already rated. The user's history is read from the full ratings frame (df),
# not just the test split, so movies rated in the train/validation splits are
# excluded from the recommendations as well.
all_movies = df.select('movieId').distinct()
user_movies = df.filter(col('userId') == user_id).select('movieId').distinct()
movies_to_recommend = all_movies.subtract(user_movies)

# predict ratings for movies the user has not rated yet
recommendations = final_model.transform(movies_to_recommend.withColumn('userId', lit(user_id)))

# keep only strictly positive predicted ratings (already-rated movies were
# removed by the subtract above, not by this filter)
recommendations = recommendations.filter(col('prediction') > 0)

# display the recommendations with movie names
print("Recommended movies for user with ID", user_id)
recommended_movies = recommendations.join(movies, 'movieId').select('movieId', 'title', 'prediction')

# Sort recommended movies by prediction in descending order
ordered_recommendations = recommended_movies.orderBy(col('prediction').desc())

# Display the ordered recommendations
ordered_recommendations.show()

Movies liked by user with ID 12


                                                                                

+-------+--------------------+------+
|movieId|               title|rating|
+-------+--------------------+------+
|     31|Dangerous Minds (...|   3.0|
|    724|   Craft, The (1996)|   4.5|
|   1022|   Cinderella (1950)|   3.0|
|   1704|Good Will Hunting...|   3.0|
|   2858|American Beauty (...|   5.0|
|   5620|Sweet Home Alabam...|   4.5|
+-------+--------------------+------+

Recommended movies for user with ID 12




+-------+--------------------+----------+
|movieId|               title|prediction|
+-------+--------------------+----------+
| 200252|The Ethereal Mela...|  6.792939|
| 281144|Fantasy Football ...|  5.942049|
| 154921|Future My Love (2...| 5.9267683|
| 185337|Hans Zimmer: Live...| 5.9035535|
| 163783|  Summer of 8 (2016)|  5.875811|
|  94101|Crime After Crime...| 5.8409166|
| 128872|  Love is God (2003)|  5.832617|
| 110138|Aya of Yop City (...|  5.796935|
| 282387|Beyond the Univer...| 5.6659055|
| 149709|Barking at the St...| 5.6479445|
| 265582|The Whole Truth (...|  5.646632|
| 126941|Joni's Promise (2...| 5.6273894|
| 245740|   The Tangle (2021)| 5.6070194|
|  70661|Tyler Perry's Mee...| 5.5952744|
| 138684|         Waar (2013)|  5.594674|
| 162660|God's Not Dead 2 ...| 5.5944133|
| 158338|Tig Notaro: Boyis...| 5.5798864|
| 158518|Under the Gun (2016)| 5.5354614|
| 283549|Johan Falk: Blods...|  5.535195|
| 283553|Johan Falk: Slute...|  5.535195|
+-------+--------------------+----

                                                                                