In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [6]:
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz

In [8]:
!tar xf spark-3.3.2-bin-hadoop3.tgz

In [9]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

In [10]:
!pip install -q findspark

In [11]:
import findspark

# Initiate findspark
findspark.init()
# Check the location for Spark
findspark.find()

'/content/spark-3.3.2-bin-hadoop3'

In [30]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import col, explode

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import pandas as pd

### Initialize spark session

In [13]:
spark = SparkSession.builder.appName('Recommendation').getOrCreate()
sc = SparkContext

### Load dataset

In [14]:
movies = spark.read.csv('/content/movies.csv', header=True)
ratings = spark.read.csv('/content/ratings.csv', header=True)

ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [15]:
ratings.printSchema()

ratings = ratings.withColumn('userId', col('userId').cast('integer')).\
                  withColumn('movieId', col('movieId').cast('integer')).\
                  withColumn('rating', col('rating').cast('float')).\
                  drop('timestamp')

ratings.show()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



In [16]:
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

### Calculate sparsity

In [17]:
numerator = ratings.select('rating').count()

num_users = ratings.select('userId').distinct().count()
num_movies = ratings.select('movieId').distinct().count()

denominator = num_movies * num_users

sparsity = (1.0 - (numerator * 1.0)/denominator) * 100

print("The rating dataframe is ", "%.2f" % sparsity + "% empty.")

The rating dataframe is  98.30% empty.


### Data exploration

In [18]:
userId_ratings = ratings.groupBy('userId').count().orderBy('count', ascending=False)
userId_ratings.show()

+------+-----+
|userId|count|
+------+-----+
|   414| 2698|
|   599| 2478|
|   474| 2108|
|   448| 1864|
|   274| 1346|
|   610| 1302|
|    68| 1260|
|   380| 1218|
|   606| 1115|
|   288| 1055|
|   249| 1046|
|   387| 1027|
|   182|  977|
|   307|  975|
|   603|  943|
|   298|  939|
|   177|  904|
|   318|  879|
|   232|  862|
|   480|  836|
+------+-----+
only showing top 20 rows



In [19]:
movieId_ratings = ratings.groupBy('movieId').count().orderBy('count', ascending=False)
movieId_ratings.show()

+-------+-----+
|movieId|count|
+-------+-----+
|    356|  329|
|    318|  317|
|    296|  307|
|    593|  279|
|   2571|  278|
|    260|  251|
|    480|  238|
|    110|  237|
|    589|  224|
|    527|  220|
|   2959|  218|
|      1|  215|
|   1196|  211|
|   2858|  204|
|     50|  204|
|     47|  203|
|    780|  202|
|    150|  201|
|   1198|  200|
|   4993|  198|
+-------+-----+
only showing top 20 rows



### ALS Model build

In [20]:
train, test = ratings.randomSplit([0.8, 0.2], seed=1)

als = ALS(userCol='userId',
          itemCol='movieId',
          ratingCol='rating',
          nonnegative=True, implicitPrefs=False, coldStartStrategy='drop')

# type(als)

#### Hyperparameter tuning

In [21]:
param_grid = ParamGridBuilder() \
             .addGrid(als.rank, [10, 50, 100, 150]) \
             .addGrid(als.regParam, [.01, .05, .1, .15]) \
             .build()

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

print("Number of models to test: ", len(param_grid))

Number of models to test:  16


#### Cross-validation pipeline

In [22]:
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=5)

print(cv)

CrossValidator_a872538b2df6


#### Model training

In [23]:
model = cv.fit(train)

best_model = model.bestModel

### Select best model

In [24]:
print(type(best_model))

print("Best model ->")

print("  Rank: ", best_model._java_obj.parent().getRank())

print("  MaxIter: ", best_model._java_obj.parent().getMaxIter())

print("  RegParam: ", best_model._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
Best model ->
  Rank:  50
  MaxIter:  10
  RegParam:  0.15


### Test model

In [25]:
test_predictions = best_model.transform(test)

RMSE = evaluator.evaluate(test_predictions)

print(RMSE)

0.8629447957489936


In [26]:
test_predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   580|  44022|   3.5| 3.1228669|
|   362|   1591|   4.0| 2.8110528|
|    34|   1580|   2.5| 3.5601945|
|    34|   3997|   2.0| 1.1864082|
|   368|   2122|   2.0| 2.1366324|
|   368|   2366|   4.0| 3.0434473|
|   115|   3175|   4.0| 3.4754665|
|   332|   2366|   3.5| 3.4818144|
|   577|   1580|   3.0| 3.2790654|
|   577|   2366|   3.0| 3.3370287|
|   606|  44022|   4.0| 2.6731396|
|   388|  44022|   4.5| 3.0867321|
|    91|    471|   1.0| 2.8665233|
|   230|   1580|   3.5| 2.9090571|
|    93|   1591|   4.0| 3.0772955|
|   232|   1580|   3.5| 3.4207573|
|   232|  36525|   3.0| 2.4240954|
|   246|   1645|   4.0| 3.6306412|
|   599|   1591|   2.5| 2.0009441|
|   599|   3175|   3.0|  2.873227|
+------+-------+------+----------+
only showing top 20 rows



In [27]:
!mkdir models

In [28]:
model.save('/content/models/ALS_model')

### Model evaluation and recommendation

In [34]:
n_recommendations = best_model.recommendForAllUsers(10)
n_recommendations.limit(10).show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{3379, 5.736997}...|
|     3|[{6835, 4.835305}...|
|     5|[{6201, 4.4295135...|
|     6|[{33649, 4.888212...|
|     9|[{3379, 4.7077045...|
|    12|[{67618, 5.604050...|
|    13|[{3379, 5.1179504...|
|    15|[{7842, 4.4468236...|
|    16|[{3379, 4.443803}...|
|    17|[{3379, 5.170812}...|
+------+--------------------+



In [None]:
n_recommendations = n_recommendations.withColumn('rec_exp', explode('recommendations')) \
                                     .select('userId', col('rec_exp.movieId'), col("rec_exp.rating"))

n_recommendations.limit(10).show()

In [38]:
n_recommendations.join(movies, on='movieId').filter('userId = 100').show()

+-------+------+---------+--------------------+--------------------+
|movieId|userId|   rating|               title|              genres|
+-------+------+---------+--------------------+--------------------+
|  33649|   100|5.1769004|  Saving Face (2004)|Comedy|Drama|Romance|
|  67618|   100| 5.127819|Strictly Sexual (...|Comedy|Drama|Romance|
|  42730|   100|4.9616914|   Glory Road (2006)|               Drama|
|  74226|   100|4.9400554|Dream of Light (a...|   Documentary|Drama|
|  26073|   100|4.9400554|Human Condition I...|           Drama|War|
| 179135|   100|4.9400554|Blue Planet II (2...|         Documentary|
|   7071|   100|4.9400554|Woman Under the I...|               Drama|
| 134796|   100|4.9400554|  Bitter Lake (2015)|         Documentary|
| 138966|   100|4.9400554|Nasu: Summer in A...|           Animation|
| 184245|   100|4.9400554|De platte jungle ...|         Documentary|
+-------+------+---------+--------------------+--------------------+



In [39]:
ratings.join(movies, on='movieId').filter('userId = 100').sort('rating', ascending=False).limit(10).show()

+-------+------+------+--------------------+--------------------+
|movieId|userId|rating|               title|              genres|
+-------+------+------+--------------------+--------------------+
|   1101|   100|   5.0|      Top Gun (1986)|      Action|Romance|
|   1958|   100|   5.0|Terms of Endearme...|        Comedy|Drama|
|   2423|   100|   5.0|Christmas Vacatio...|              Comedy|
|   4041|   100|   5.0|Officer and a Gen...|       Drama|Romance|
|   5620|   100|   5.0|Sweet Home Alabam...|      Comedy|Romance|
|    368|   100|   4.5|     Maverick (1994)|Adventure|Comedy|...|
|    934|   100|   4.5|Father of the Bri...|              Comedy|
|    539|   100|   4.5|Sleepless in Seat...|Comedy|Drama|Romance|
|     16|   100|   4.5|       Casino (1995)|         Crime|Drama|
|    553|   100|   4.5|    Tombstone (1993)|Action|Drama|Western|
+-------+------+------+--------------------+--------------------+

