Import libraries

In [1]:
from pyspark.sql import SparkSession

In [2]:
import os

import findspark
#findspark.init()
import pandas as pd
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

Import SparkSession from pyspark.sql

Create an instance of SparkSession

In [3]:
# Start a Spark session for this notebook (or reuse one that is already
# running), registered under the application name "recommender".
spark = (
    SparkSession.builder
    .appName("recommender")
    .getOrCreate()
)

Print the tables in the catalog

In [4]:
# Show which tables/views are currently registered in the session catalog.
registered_tables = spark.catalog.listTables()
print(registered_tables)

[]


Load the file

In [5]:
# Location of the ml-20m ratings CSV. Set the ML20M_RATINGS_CSV environment
# variable to point at your own copy of the file; when it is unset, the
# original author's local path is used so existing setups keep working.
file_path = os.environ.get(
    "ML20M_RATINGS_CSV",
    "/home/alainkuiete/Documents/DATA612/ml-20m/ratings.csv",
)

Read the rating data

In [6]:
# Load the ratings file, using its first line as the column names. No schema
# is supplied here, so every column is read as a string and is cast later.
ratings = spark.read.option("header", True).csv(file_path)

Show the ratings data

In [7]:
# Preview the first 20 rows (the same number show() prints by default).
ratings.show(n=20)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
|     1|    112|   3.5|1094785740|
|     1|    151|   4.0|1094785734|
|     1|    223|   4.0|1112485573|
|     1|    253|   4.0|1112484940|
|     1|    260|   4.0|1112484826|
|     1|    293|   4.0|1112484703|
|     1|    296|   4.0|1112484767|
|     1|    318|   4.0|1112484798|
|     1|    337|   3.5|1094785709|
|     1|    367|   3.5|1112485980|
|     1|    541|   4.0|1112484603|
|     1|    589|   3.5|1112485557|
|     1|    593|   3.5|1112484661|
|     1|    653|   3.0|1094785691|
|     1|    919|   3.5|1094785621|
+------+-------+------+----------+
only showing top 20 rows



Add ratings to catalog

In [8]:
# Register the DataFrame as a temporary view so it appears in the catalog.
# The view is created from the frame as it exists at this point, i.e. before
# the column casts performed further down.
view_name = "ratings"
ratings.createOrReplaceTempView(view_name)

Look at the type of each column

In [9]:
# Inspect the column types as read from the CSV; the output below shows that
# all four columns arrived as strings.
ratings.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



Cast userId and movieId to integers and rating to float

In [10]:
# Convert the string columns to the numeric types the ALS model works with:
# integer ids and a float rating. The timestamp column is left untouched.
column_types = (
    ("userId", "integer"),
    ("movieId", "integer"),
    ("rating", "float"),
)
for column_name, spark_type in column_types:
    ratings = ratings.withColumn(column_name, ratings[column_name].cast(spark_type))

In [11]:
# Confirm the casts took effect: userId/movieId should now be integer and
# rating float, with timestamp still a string.
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: string (nullable = true)



Eliminate the timestamp column in ratings dataframe

In [12]:
# Keep only the three columns used for modelling; timestamp is discarded.
ratings = ratings.select("userId", "movieId", "rating")

Summary statistics of the ratings data

In [13]:
# Compute count / mean / stddev / min / max for each column and print them.
ratings_summary = ratings.describe()
ratings_summary.show()

+-------+-----------------+-----------------+------------------+
|summary|           userId|          movieId|            rating|
+-------+-----------------+-----------------+------------------+
|  count|         20000263|         20000263|          20000263|
|   mean|69045.87258292554|9041.567330339605|3.5255285642993797|
| stddev|40038.62665316182|19789.47744541297|  1.05198891929425|
|    min|                1|                1|               0.5|
|    max|           138493|           131262|               5.0|
+-------+-----------------+-----------------+------------------+



Splitting the data into train and test sets

In [14]:
# Hold out roughly 20% of the ratings for testing and train on the rest.
# randomSplit samples rows at random, so a fixed seed is passed to make the
# split repeatable across runs on the same input; without it every
# re-execution would train and evaluate on different rows.
training, test = ratings.randomSplit([0.8, 0.2], seed=42)

Create ALS model

In [15]:
# ALS estimator configured for the ratings columns.
#   coldStartStrategy="drop" - rows for which no prediction can be made are
#                              dropped from the transformed output.
#   nonnegative=True         - constrain the learned factors to be non-negative.
#   seed=42                  - fix the random initialisation so repeated fits
#                              on the same data give comparable results.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=42,
)


Tune model using paramGrid

In [23]:
# Hyperparameter grid to search: 3 ranks x 3 iteration counts x 3
# regularisation strengths = 27 candidate settings.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [12, 13, 14])
    .addGrid(als.maxIter, [18, 19, 20])
    .addGrid(als.regParam, [0.17, 0.18, 0.19])
    .build()
)

RMSE evaluator

In [27]:
# Score predictions by root-mean-squared error between the true "rating"
# column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:46765)
Traceback (most recent call last):
  File "/home/alainkuiete/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/alainkuiete/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:46765)

Build the train/validation split used for tuning

In [25]:
# Train/validation-split tuner: fits `als` once per entry in `param_grid` and
# scores each candidate with `evaluator`. The internal train/validation
# partition is random, so a fixed seed is supplied to keep model selection
# repeatable between runs.
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    seed=42,
)

Fit ALS to the training data

In [26]:
# Run the hyperparameter search on the training split. This fits one ALS
# model per grid entry, so expect it to be the slowest cell in the notebook.
model = tvs.fit(dataset=training)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:46765)
Traceback (most recent call last):
  File "/home/alainkuiete/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/alainkuiete/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:46765)

Extract the best model from the tuning exercise

In [None]:
# Pull the best-scoring fitted model out of the tuning result.
best_model = model.bestModel

Summary statistics of predictions


Generate predictions

In [None]:
# Apply the selected model to the held-out test split; the result carries a
# "prediction" column alongside the original columns.
predictions = best_model.transform(dataset=test)

In [None]:
# Summary statistics for the prediction frame (count, mean, stddev, min, max).
predictions_summary = predictions.describe()
predictions_summary.show()

Evaluate with RMSE

In [None]:
# RMSE of the best model on the held-out test predictions.
rmse = evaluator.evaluate(dataset=predictions)

Evaluation metric and model parameters

In [None]:
# Report the test RMSE together with the hyperparameters of the winning model.
# `rank` is read directly from the fitted model; maxIter and regParam are read
# from the JVM-side estimator that produced it, reached through the private
# `_java_obj` handle. The original cell called an undefined `RegParam()` name,
# which raises NameError; the estimator's own getter is used instead.
tuned_als = best_model._java_obj.parent()
print("RMSE = " + str(rmse))
print("Best Model")
print("RANK: ", best_model.rank)
print("MaxIter: ", tuned_als.getMaxIter())
print("RegParam", tuned_als.getRegParam())

Drop rows with missing values from the predictions

In [None]:
# Remove any rows that still contain nulls/NaNs, then re-check the summary
# statistics of what remains.
predictions = predictions.dropna()
cleaned_summary = predictions.describe()
cleaned_summary.show()

In [None]:

Evaluation

In [None]:
# Rebuild the RMSE evaluator (prediction column left at its default) and
# score the cleaned predictions; the bare last line displays the value.
evaluator = RegressionEvaluator(labelCol="rating", metricName="rmse")
rmse = evaluator.evaluate(dataset=predictions)
rmse