In [4]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import col,isnan, when, count

In [5]:
# Create the SparkSession shared by every cell below.
# NOTE(review): the master URL and the memory/core limits are hard-coded for one cluster.
spark = (
    SparkSession.builder
    .master('spark://192.168.1.24:7077')
    .appName("ALS recommendation spark session")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "25g")
    .config('spark.cores.max', '16')
    .enableHiveSupport()
    .getOrCreate()
)

In [7]:
import os

# Handles derived from the session: the SparkContext is what the RDD-based
# cells below use to read the CSV files; the SQLContext wraps that same context.
# NOTE(review): sqlContext is not referenced by any other visible cell.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [8]:
# Directory holding rating.csv and anime.csv. The ANIME_DATASETS_PATH environment
# variable overrides the default, so the notebook can run on another machine
# without editing this cell; when it is unset the original location is used.
datasets_path = os.environ.get('ANIME_DATASETS_PATH', 'D:/MON_HOC/TLCN/data')
rating_file_path = os.path.join(datasets_path, 'rating.csv')
anime_file_path = os.path.join(datasets_path, 'anime.csv')

# Load each file as an RDD of raw text lines (one element per line).
rating_raw_RDD = sc.textFile(rating_file_path)
anime_raw_RDD = sc.textFile(anime_file_path)

In [None]:
# Remember the first line of each file so the parsing step can filter it out.
# first() returns the same element as take(1)[0] but raises a descriptive
# "RDD is empty" ValueError instead of a bare IndexError when a file has no lines.
rating_data_raw_header = rating_raw_RDD.first()
anime_data_raw_header = anime_raw_RDD.first()

In [None]:
def _parse_csv_line(line):
    """Split one CSV record into its fields, honouring double-quoted fields.

    A plain ``line.split(",")`` cuts a quoted field such as ``"a, b"`` into
    several tokens, which shifts every later column and makes the numeric
    conversions below fail or read the wrong column.
    """
    import csv  # local import keeps the helper self-contained when shipped to executors
    return next(csv.reader([line]))


# Ratings: skip the header and blank lines, then convert the first three columns to ints.
rating_RDD = rating_raw_RDD\
    .filter(lambda line: line.strip() != "" and line != rating_data_raw_header)\
    .map(lambda line: line.split(","))\
    .map(lambda tokens: (int(tokens[0]), int(tokens[1]), int(tokens[2])))

# Anime: same header/blank-line filter, but parsed with the quote-aware helper
# because text columns may contain commas inside quotes.
anime_fields_RDD = anime_raw_RDD\
    .filter(lambda line: line.strip() != "" and line != anime_data_raw_header)\
    .map(_parse_csv_line)

# Full typed record: columns 0, 4 and 6 as int, column 5 as float, the rest as text.
# NOTE(review): these conversions still raise on non-numeric placeholders — verify against the data.
anime_RDD = anime_fields_RDD\
    .map(lambda x: (int(x[0]), x[1], x[2], x[3], int(x[4]), float(x[5]), int(x[6])))

# (id, title) pairs come straight from the parsed fields, so the title lookup
# does not depend on the numeric columns converting cleanly.
anime_title_RDD = anime_fields_RDD.map(lambda x: (int(x[0]), x[1]))

In [None]:
# Preview the first five parsed rating tuples.
rating_RDD.take(5)

In [None]:
# Preview the first five (id, title) pairs.
anime_title_RDD.take(5)

In [None]:
def change_rating(rating):
    """Rescale an original rating of 6..10 onto 1..5.

    Returns the rescaled value (6 -> 1, 7 -> 2, 8 -> 3, 9 -> 4, 10 -> 5),
    or None when ``rating`` is not one of those five values.
    """
    rescaled = {6: 1, 7: 2, 8: 3, 9: 4, 10: 5}
    return rescaled.get(rating)

In [None]:
# Alternative (disabled): test with ratings from 1 to 10
#rating_RDD_data = rating_RDD.filter(lambda line: line!=rating_data_raw_header)\
#    .filter(lambda x: x[2] != -1)

In [None]:
# Test with ratings from 6 to 10: keep only those ratings and rescale them to 1-5.
# rating_RDD already has its header line removed and its fields parsed to ints,
# so the former comparison of each tuple with the header string was a no-op and
# is gone. The explicit range check also rejects any value change_rating cannot
# map (e.g. 0 or 11), which previously passed the filter and then failed on
# int(None).
rating_RDD_data = rating_RDD\
    .filter(lambda x: 6 <= x[2] <= 10)\
    .map(lambda x: (x[0], x[1], change_rating(x[2])))

In [None]:
# Preview the first ten filtered and rescaled rating tuples.
rating_RDD_data.take(10)

In [None]:
# Count the ratings that remain after filtering.
rating_RDD_data.count()

In [None]:
# Attach field names to the plain tuples by wrapping each record in a Row.
rating_RDD_complete_data = rating_RDD_data.map(
    lambda rec: Row(user_id=int(rec[0]), anime_id=int(rec[1]), rating=int(rec[2]))
)
anime_RDD_complete_data = anime_title_RDD.map(
    lambda rec: Row(anime_id=int(rec[0]), name=rec[1])
)

In [None]:
# Convert the Row RDDs into DataFrames; no schema is passed, so Spark derives
# the column names and types from the Row objects.
ratings_df = spark.createDataFrame(rating_RDD_complete_data)
anime_df = spark.createDataFrame(anime_RDD_complete_data)

In [None]:
# Display the first five rows of the ratings DataFrame.
ratings_df.show(5)

In [None]:
# Display the first five rows of the anime DataFrame.
anime_df.show(5)

In [None]:
# 80/20 train/test split. The fixed seed makes the split repeatable, so the
# reported RMSE can be compared between runs.
(training, testing) = ratings_df.randomSplit([0.8, 0.2], seed=42)

# Training model

In [92]:
# Build the recommendation model with the ALS algorithm on the training set
# and report how long fitting took.
from datetime import datetime
start_time = datetime.now()
# A fixed random seed keeps repeated fits on the same data comparable.
als = ALS(maxIter=10, regParam=0.1, userCol="user_id", itemCol="anime_id", ratingCol="rating",
          seed=42)
model = als.fit(training)
end_time = datetime.now()

print('Execute time {}'.format(end_time - start_time))

Execute time 0:01:01.982784


In [93]:
# Score the held-out ratings and report the root-mean-square error.
# The cold-start strategy is switched to "drop" before predicting (per the ALS
# documentation this discards rows whose prediction is NaN).
model.setColdStartStrategy("drop")
predictions = model.transform(testing)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.9366923035028061


In [94]:
#def get_anime_id_name_rating(recommendedDF):
#    animeDF = spark.createDataFrame(data = anime_title_RDD, schema = ["anime_id", "name"])
#    return recommendedDF.join(animeDF, recommendedDF.anime_id == animeDF.anime_id, "leftouter")

In [95]:
# Recommend the top 10 anime for selected users.
# Configure the ids of the users who need recommendations.
usersId = [215] #input
# Select by field name rather than position: the order of fields in a Row built
# from keyword arguments depends on the Spark version (they were sorted
# alphabetically before 3.0), so x[0] is not guaranteed to be user_id.
users_RDD = rating_RDD_complete_data.filter(lambda x: x.user_id in usersId)
# Build the user list from ratings_df instead of createDataFrame(users_RDD):
# schema inference fails on an empty RDD, whereas filtering the DataFrame simply
# yields an empty result when none of the ids has ratings.
users_df = ratings_df.filter(ratings_df[als.getUserCol()].isin(usersId))
users_df = users_df.select(als.getUserCol()).distinct()
users_df.show(5)

+-------+
|user_id|
+-------+
|    215|
+-------+



In [97]:
# Ask the fitted model for the top 10 recommendations for each user in users_df.
userSubsetRecs = model.recommendForUserSubset(users_df, 10)

In [98]:
# Pull the 'recommendations' column to the driver through pandas and keep it as
# a plain Python list with one entry per row of userSubsetRecs.
# NOTE(review): presumably the same rows could be produced inside Spark by
# exploding the array column, avoiding the pandas round-trip — confirm before changing.
list_user_predictions = list(userSubsetRecs.select('recommendations').toPandas()['recommendations'])

In [127]:
# Turn the first entry's recommendations back into an RDD.
# NOTE(review): only index 0 is used, so any further entries are ignored, and an
# empty list raises IndexError here.
rdd = sc.parallelize(list_user_predictions[0])

In [135]:
# Build a DataFrame from the recommendations, naming its two columns anime_id and rating.
# NOTE(review): "rating" here is presumably the model's predicted score rather
# than an observed rating — confirm.
user_predictions_df = spark.createDataFrame(rdd, schema=['anime_id', 'rating'])

In [140]:
# Display the first five recommendation rows.
user_predictions_df.show(5)

+--------+-----------------+
|anime_id|           rating|
+--------+-----------------+
|   32400|6.928781032562256|
|    7485|6.805344581604004|
|   30743|6.732007026672363|
|    8353|6.387811183929443|
|   29978|6.241982936859131|
+--------+-----------------+
only showing top 5 rows



In [None]:
# Attach the anime title to every recommendation.
# Joining on the column name keeps a single anime_id column (the expression form
# left two columns with that name, which makes later references ambiguous).
# A join does not guarantee row order, so the result is sorted by score to
# restore the best-first ranking.
user_predictions_join_df = user_predictions_df\
    .join(anime_df, on='anime_id', how='left')\
    .orderBy('rating', ascending=False)

In [None]:
# Fetch up to ten joined rows (recommendation plus title) as Row objects.
user_predictions_join_df.rdd.take(10)