In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.ml import Pipeline

from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.recommendation import ALS

import numpy as np


In [2]:
spark = SparkSession \
    .builder \
    .appName("Recommender") \
    .getOrCreate()

# Importing and Exploring dataset

In [3]:
ratings= spark.read.csv('/Users/Abdelrahman/scrape/Recommender/ml-latest-small/ratings.csv', header= True)

In [4]:
ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [5]:
# calculating sparsity 
numerator= ratings.select('rating').count()

num_users= ratings.select('userId').distinct().count()
num_movies= ratings.select('movieId').distinct().count()

denominator= num_users * num_movies

sparsity=  ((numerator * 1.0)/denominator) * 100

sparsity

1.6999683055613624

In [6]:
# Min num ratings for movies
print("Movie with the fewest ratings: ")
ratings.groupBy("movieId").count().select(min("count")).show()

# Avg num ratings per movie
print("Avg num ratings per movie: ")
ratings.groupBy("movieId").count().select(avg("count")).show()

# Min num ratings for user
print("User with the fewest ratings: ")
ratings.groupBy("userId").count().select(min("count")).show()

# Avg num ratings per users
print("Avg num ratings per user: ")
ratings.groupBy("userId").count().select(avg("count")).show()

Movie with the fewest ratings: 
+----------+
|min(count)|
+----------+
|         1|
+----------+

Avg num ratings per movie: 
+------------------+
|        avg(count)|
+------------------+
|10.369806663924312|
+------------------+

User with the fewest ratings: 
+----------+
|min(count)|
+----------+
|        20|
+----------+

Avg num ratings per user: 
+------------------+
|        avg(count)|
+------------------+
|165.30491803278687|
+------------------+



# Prepairing Dataset

In [7]:
# examining column types
ratings.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [8]:
# converting columns to proper types
ratings = ratings.select(ratings.userId.cast("integer"), ratings.movieId.cast("integer"), ratings.rating.cast("double"))

# examining column types again
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [9]:
ratings.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



In [10]:
# splitting data
(train, test) = ratings.randomSplit([0.8, 0.2], seed = 1234)

In [11]:
train.count()

80847

# Creating check point spark context

In [12]:
sc = spark.sparkContext

sc.setCheckpointDir('checkpoint')

In [13]:
train.checkpoint()

DataFrame[userId: int, movieId: int, rating: double]

In [14]:
train.explain(extended=True)

== Parsed Logical Plan ==
Sample 0.0, 0.8, false, 1234
+- Sort [userId#156 ASC NULLS FIRST, movieId#157 ASC NULLS FIRST, rating#158 ASC NULLS FIRST], false
   +- Project [cast(userId#16 as int) AS userId#156, cast(movieId#17 as int) AS movieId#157, cast(rating#18 as double) AS rating#158]
      +- Relation[userId#16,movieId#17,rating#18,timestamp#19] csv

== Analyzed Logical Plan ==
userId: int, movieId: int, rating: double
Sample 0.0, 0.8, false, 1234
+- Sort [userId#156 ASC NULLS FIRST, movieId#157 ASC NULLS FIRST, rating#158 ASC NULLS FIRST], false
   +- Project [cast(userId#16 as int) AS userId#156, cast(movieId#17 as int) AS movieId#157, cast(rating#18 as double) AS rating#158]
      +- Relation[userId#16,movieId#17,rating#18,timestamp#19] csv

== Optimized Logical Plan ==
Sample 0.0, 0.8, false, 1234
+- Sort [userId#156 ASC NULLS FIRST, movieId#157 ASC NULLS FIRST, rating#158 ASC NULLS FIRST], false
   +- Project [cast(userId#16 as int) AS userId#156, cast(movieId#17 as int) AS m

# CrossValidation and Evaluation

In [15]:
# instantiate an als model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", nonnegative = True, implicitPrefs = False)

In [16]:
# Prameter grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100]) \
            .addGrid(als.maxIter, [5, 50, 100]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()
  

In [17]:
# instantiate an evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 


In [18]:
# instantiate a CrosssValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)


In [19]:
# Fit
model = cv.fit(train)

In [20]:
# Extract best model
best_model = model.bestModel

In [21]:
test_predictions = best_model.transform(test)

In [22]:
test_predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   597|    471|   2.0|  4.027004|
|   436|    471|   3.0|  3.496109|
|   218|    471|   4.0| 2.9909446|
|   387|    471|   3.0| 3.0409868|
|   217|    471|   2.0| 2.7600737|
|   287|    471|   4.5| 2.7864587|
|    32|    471|   3.0|  3.736732|
|   260|    471|   4.5| 3.2589998|
|   104|    471|   4.5| 3.4050958|
|   111|   1088|   3.0| 3.2291057|
|   177|   1088|   3.5| 3.5028057|
|    41|   1088|   1.5| 2.5586512|
|   387|   1088|   1.5| 2.6889942|
|   594|   1088|   4.5| 4.3272386|
|   307|   1088|   3.0| 2.7366014|
|   509|   1088|   3.0|  3.089476|
|   104|   1088|   3.0| 3.6422207|
|   268|   1238|   5.0| 3.9473112|
|   462|   1238|   3.5| 3.4617355|
|   307|   1342|   2.0| 1.9624944|
+------+-------+------+----------+
only showing top 20 rows



In [23]:
# evaluation
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

0.871160036601722


An RMSE of 0.87 means that on average the model predicts 0.87 above or below values of the original ratings matrix.