In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *



In [None]:
# Read the ml-20m ratings CSV (first row is the header, column types are
# inferred) and keep the underlying RDD of rows.
# Alternative locations of the same file (HDFS / Google Cloud Storage):
#lines = spark.read.csv("hdfs:/user/data/ratings.csv", header="true",inferSchema="true").rdd
#lines = spark.read.csv("gs://dataset-rs/ml-20m/ratings.csv", header="true",inferSchema="true").rdd
lines = spark.read.csv("/home/aleja/Documentos/datasets/ml-20m/ratings.csv", header="true",inferSchema="true").rdd

# Sanity check: fetch the first two rows.
lines.take(2)

In [None]:

# Build the ratings DataFrame from the RDD of rows loaded from the CSV.
# NOTE(review): `lines` is the `.rdd` of a DataFrame that was just read, so this
# RDD round trip looks avoidable — confirm before simplifying.
ratings = spark.createDataFrame(lines) 

# Bare expression so the notebook echoes the DataFrame summary.
ratings

In [None]:
# Randomly partition the ratings with weights 0.8 / 0.2: the larger part is
# used for fitting, the smaller one for evaluation. No seed is given, so the
# partition changes from run to run.
training, test = ratings.randomSplit([0.8, 0.2])

# Print the column names and types of the training set.
training.printSchema()


In [None]:
# Print the first two rows of the training set.
training.show(2)

In [None]:
# Print ten training rows whose rating is strictly greater than 4.
training.where(training.rating > 4).show(10)

ALS parameters
- numBlocks is the number of blocks the users and items will be partitioned into in order to parallelize computation (defaults to 10).
- rank is the number of latent factors in the model (defaults to 10).
- maxIter is the maximum number of iterations to run (defaults to 10).
- regParam specifies the regularization parameter in ALS (defaults to 0.1 in `pyspark.ml.recommendation.ALS`).
- implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false, which means using explicit feedback).
- alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0).
- nonnegative specifies whether or not to use nonnegative constraints for least squares (defaults to false).


In [None]:
# Configure the ALS estimator for the ratings data: 5 iterations, light
# regularization, and the "drop" cold-start strategy.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
)


In [None]:
# Fit the ALS estimator on the training split.
model = als.fit(training)
# Bare expression so the notebook echoes the rank of the fitted model.
model.rank

In [None]:
# For later use: keep only the first two columns of the test set
# (presumably userId and movieId — verify against the printed schema).
predictions_Data= test.select(test.columns[:2])

In [None]:
# Materialize the projected test rows as a Python list ordered by their first
# field. NOTE(review): collect() brings every row to the driver, and this cell
# cannot be re-run because it rebinds the name from a DataFrame to a list.
_collected_rows = predictions_Data.collect()
_collected_rows.sort(key=lambda row: row[0])
predictions_Data = _collected_rows

In [None]:
# Inspect the first row of the sorted list.
predictions_Data[0]

In [None]:
# Score the held-out test split with the fitted model.
predictions = model.transform(test)

In [None]:
# Print the first rows of the scored test set.
predictions.show()

In [None]:
# Measure prediction quality on the test split: root-mean-square error between
# the observed "rating" and the model's "prediction".
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# Rows containing nulls/NaNs are discarded before scoring.
rmse = evaluator.evaluate(predictions.na.drop())

rmse_report = "Root-mean-square error = " + str(rmse)
print(rmse_report)

In [None]:
# Generate the top 10 movie recommendations for every user known to the model
# and print the first rows of the result.
userRecs = model.recommendForAllUsers(10)
userRecs.show()

In [None]:
# Generate the top 10 user recommendations for every movie known to the model
# and print the first rows of the result.
movieRecs = model.recommendForAllItems(10)
movieRecs.show()

In [None]:
# Load the ml-20m movie catalogue (header row, inferred column types) directly
# as a DataFrame. Going through an RDD and back would only re-infer the schema
# from Python rows and force every row through Python serialization.
movies = spark.read.csv(
    "/home/aleja/Documentos/datasets/ml-20m/movies.csv",
    header="true",
    inferSchema="true",
)

# Kept so that any cell still referring to the RDD view keeps working; `.rdd`
# is lazy, so this does not trigger a job by itself.
movies_data = movies.rdd

# Bare expression so the notebook echoes the DataFrame summary.
movies

In [None]:
from pyspark.sql.functions import lit



def recommendMovies(model, user, nbRecommendations):
    """Print the highest-scoring unseen movies for one user.

    Every distinct movie in the module-level ``ratings`` DataFrame that the
    user has not rated yet is scored with ``model``; the best
    ``nbRecommendations`` predictions are joined with the module-level
    ``movies`` DataFrame and printed, best prediction first.

    Args:
        model: Fitted model whose ``transform`` adds a ``prediction`` column
            to a DataFrame with ``movieId`` and ``userId`` columns.
        user: Id of the user to recommend for.
        nbRecommendations: Maximum number of movies to show.

    Returns:
        None. The result is printed with ``show(truncate=False)``.
    """
    # Candidate pairs: every distinct rated movie combined with the target user.
    candidates = ratings.select("movieId").distinct().withColumn("userId", lit(user))

    # Pairs the user has already rated. The column order matches `candidates`
    # because `subtract` compares rows by position.
    alreadyRated = ratings.filter(ratings.userId == user).select("movieId", "userId")

    # Score only the unseen movies and keep the best predictions.
    topPredictions = (
        model.transform(candidates.subtract(alreadyRated))
        .dropna()
        .orderBy("prediction", ascending=False)
        .limit(nbRecommendations)
        .select("movieId", "prediction")
    )

    # Attach titles and genres. A join gives no row-order guarantee, so the
    # result is sorted again to keep the best recommendation on top.
    recommendations = (
        topPredictions.join(movies, topPredictions.movieId == movies.movieId)
        .select(topPredictions.movieId, movies.title, movies.genres, topPredictions.prediction)
        .orderBy("prediction", ascending=False)
    )

    recommendations.show(truncate=False)

In [None]:
# Demo: show ten recommendations for user 133.
print("Recommendations for user 133:")
recommendMovies(model, user=133, nbRecommendations=10)

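Now we can use the various prediction functions on the model variable. Note that the functions listed below belong to the RDD-based `pyspark.mllib.recommendation.MatrixFactorizationModel`; the `ALSModel` fitted above with `pyspark.ml` offers `transform`, `recommendForAllUsers` and `recommendForAllItems` instead.

- *predict*: returns a single floating-point value.
- *predictAll*: returns an RDD of Rating objects.
- *recommendUsers*: returns a list of Ratings in descending order by rating.
- *recommendProducts*: returns a list of Ratings.
- *recommendProductsForUsers*: returns an RDD of (UserID, (RatingObj, RatingObj, …)) where the RatingObj entries are sorted in descending order by rating.
- *recommendUsersForProducts*: returns an RDD of (ProductID, (RatingObj, RatingObj, …)) where the RatingObj entries are sorted in descending order by rating.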
