In [30]:
from pyspark import SparkConf, SparkContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Spark resource configuration.
# "num-executors", "total-executor-cores" and "executor-memory" are spark-submit
# command-line flags, not Spark configuration properties, so setting them on a
# SparkConf has no effect. Their property equivalents are used instead.
conf = SparkConf().setAppName("spark-cl").setAll([
    ("spark.executor.instances", "4"),
    # Property behind --total-executor-cores; per the Spark docs it only
    # applies to standalone/Mesos deployments.
    ("spark.cores.max", "16"),
    ("spark.executor.memory", "8g"),
    # Current name of the deprecated "spark.yarn.executor.memoryOverhead".
    # NOTE(review): 64g of overhead next to an 8g executor heap looks unusual - confirm.
    ("spark.executor.memoryOverhead", "64g"),
])

# getOrCreate reuses an already-running context, so re-executing this cell does
# not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [34]:
# Read the raw ratings file. No schema is supplied to the reader, so the
# columns are converted to their numeric types explicitly below.
lines = spark.read.csv("/data/movielens/ratings.csv", header=True)

# Target type for each column, applied in this order.
column_types = [
    ("userId", IntegerType()),
    ("movieId", IntegerType()),
    ("rating", DoubleType()),
    ("timestamp", LongType()),
]

ratings = lines
for column_name, target_type in column_types:
    # Replace the column in place with its cast version (same name, same position).
    ratings = ratings.withColumn(column_name, lines[column_name].cast(target_type))

In [35]:
# Peek at the first row to check the casts produced the expected types.
ratings.first()

Row(userId=1, movieId=296, rating=5.0, timestamp=1147880044)

In [36]:
# 80/20 train/test split. The seed is fixed so the same rows land in each
# partition on every run, which keeps the evaluation metric comparable
# between executions of the notebook.
training, test = ratings.randomSplit([0.8, 0.2], seed=42)

In [38]:
# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" removes predictions for users/movies that were not
# seen during training, so the evaluation metric does not come out as NaN.
# The seed is fixed so that the factor initialisation (and therefore the
# fitted model) is the same on every run.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

# Score the held-out test set and report the root-mean-square error between
# the actual "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Number of recommendations requested in every call below.
top_n = 10

# Recommendations across the whole data set:
# top movies for every user, and top users for every movie.
userRecs = model.recommendForAllUsers(top_n)
movieRecs = model.recommendForAllItems(top_n)

# Recommendations restricted to a small sample: three distinct users ...
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, top_n)

# ... and three distinct movies.
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, top_n)

Root-mean-square error = 0.7612155761973188
