# Anime Recommended System

## Team member
  - Nguyễn Quốc Bảo - 19133002
  - Võ Hoàng Khả Diệu - 19133014
  
  This notebook explains how to use the [Anime Datasets]() to build a movie recommender using [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) with [Spark's Alternating Least Saqures](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.recommendation.ALS.html) implementation. It is organised in two parts. The first one is about getting and parsing movies and ratings data into Spark RDDs. The second is about building and using the recommender and persisting it for later use in our on-line recommender system. 

## Getting and processing the data

In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import col,isnan, when, count
import pandas as pd


In [3]:
# Khởi tạo một sparkSession
#spark = SparkSession.builder \
#    .appName("ALS recommendation spark session") \
#    .config("spark.driver.memory", "16g") \
#    .config("spark.executor.memory", "25g") \
#    .config('spark.cores.max', '16') \
#    .enableHiveSupport() \
#    .getOrCreate()

#    .master('spark://192.168.1.171:7077') \
#    .config("spark.driver.host", "192.168.1.171") \
#    .config("spark.driver.port", "10027") \
#    .config("spark.submit.deployMode", "cluster") \
#    .config("spark.driver.bindAddress", "0.0.0.0") \
#    .config("spark.dynamicAllocation.enabled", False) \

In [2]:
spark = SparkSession.builder \
    .appName("Anime recommendation") \
    .getOrCreate()

In [3]:
sc = spark.sparkContext
sqlContext = SQLContext(sc)
import os

In [185]:
datasets_path = os.path.join('../data')
rating_file_path = os.path.join(datasets_path, 'rating.csv')
rating_change_file_path = os.path.join(datasets_path, 'rating_change.csv');
rating_raw_RDD = sc.textFile(rating_file_path)
anime_file_path = os.path.join(datasets_path, 'anime.csv')
anime_raw_RDD = sc.textFile(anime_file_path)

In [6]:
rating_data_raw_header = rating_raw_RDD.take(1)[0]
anime_data_raw_header = anime_raw_RDD.take(1)[0]

In [7]:
rating_RDD = rating_raw_RDD.filter(lambda line: line!=rating_data_raw_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]), int(tokens[1]), int(tokens[2]))).cache()#
anime_title_RDD = anime_raw_RDD.filter(lambda line: line!=anime_data_raw_header)\
    .map(lambda line: line.split(",")).map(lambda x: (int(x[0]), x[1])).cache()

In [11]:
rating_RDD.take(5)

[(1, 20, -1), (1, 24, -1), (1, 79, -1), (1, 226, -1), (1, 241, -1)]

In [12]:
anime_title_RDD.take(5)

[(32281, 'Kimi no Na wa.'),
 (5114, 'Fullmetal Alchemist: Brotherhood'),
 (28977, 'Gintama°'),
 (9253, 'Steins;Gate'),
 (9969, 'Gintama&#039;')]

In [13]:
def change_rating(rating):
    if(rating == 6):
        return 1
    if(rating == 7):
        return 2
    if(rating == 8):
        return 3
    if(rating == 9):
        return 4
    if(rating == 10):
        return 5

In [14]:
#Test rating from 1 to 10
#rating_RDD_data = rating_RDD.filter(lambda line: line!=rating_data_raw_header)\
#    .filter(lambda x: x[2] != -1)

In [15]:
#Test rating from 6 to 10
rating_RDD_data = rating_RDD.filter(lambda line: line!=rating_data_raw_header)\
    .filter(lambda x: x[2] != -1 and x[2] != 1 and x[2] != 2 and x[2] != 3 and x[2] != 4 and x[2] != 5)\
    .map(lambda x: (int(x[0]), int(x[1]), int(change_rating(x[2]))))

In [16]:
rating_RDD_data.take(10)

[(1, 8074, 5),
 (1, 11617, 5),
 (1, 11757, 5),
 (1, 15451, 5),
 (2, 11771, 5),
 (3, 20, 3),
 (3, 154, 1),
 (3, 170, 4),
 (3, 199, 5),
 (3, 225, 4)]

In [17]:
rating_RDD_data.count()

5868892

In [18]:
#create dataframe from rdd
ratings_df = spark.createDataFrame(data = rating_RDD_data, schema = ["user_id", "anime_id", "rating"])

In [19]:
ratings_df.show(5)

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      1|    8074|     5|
|      1|   11617|     5|
|      1|   11757|     5|
|      1|   15451|     5|
|      2|   11771|     5|
+-------+--------+------+
only showing top 5 rows



## Building and using the Recommended system

### Collaborative Filtering - ALS

#### Building and training the model

In [20]:
# Chia tập train & tập test theo tỉ lệ 8 : 2
(training, testing) = ratings_df.randomSplit([0.8, 0.2])


In [21]:
# Xây dựng mô hình recommendation sử dụng thuật toán ALS trên tập dữ liệu huấn luyện
from datetime import datetime
start_time = datetime.now()
als = ALS(maxIter=10, regParam=0.1, userCol="user_id", itemCol="anime_id", ratingCol="rating")
model = als.fit(training)
end_time = datetime.now()

print('Execute time {}'.format(end_time - start_time))

Execute time 0:00:52.676939


#### Turning hyperparameter

In [22]:
ALSExplicit = ALS(implicitPrefs=False, userCol="user_id", itemCol="anime_id", ratingCol="rating",
          coldStartStrategy="drop")
defaultModel = ALSExplicit.fit(training)

In [23]:
paramMapExplicit = ParamGridBuilder() \
                    .addGrid(ALSExplicit.rank, [ 8, 12]) \
                    .addGrid(ALSExplicit.maxIter, [5,10]) \
                    .addGrid(ALSExplicit.regParam, [0.01,0.001]) \
                    .addGrid(ALSExplicit.alpha, [2.0,3.0]) \
                    .build()

In [24]:
evaluatorR = RegressionEvaluator(metricName="rmse", labelCol="rating")

In [25]:
# Run cross-validation, and choose the best set of parameters.
CVALSExplicit = CrossValidator(estimator=ALSExplicit,
                            estimatorParamMaps=paramMapExplicit,
                            evaluator=evaluatorR,
                           numFolds=5)

In [None]:
CVModelEXplicit = CVALSExplicit.fit(training)

In [29]:
# Test
model.setColdStartStrategy("drop");
predictions = model.transform(testing)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                               predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.9363156782800602


#### Using the model for recommended system

In [58]:
#Recommended: top 10 movies for a user
user_subset = ratings_df.where(ratings_df.user_id == 215)
user_subset_recs = model.recommendForUserSubset(user_subset, 10)
list_user_predictions = list(user_subset_recs.select('recommendations').toPandas()['recommendations'])
user_prediction_rdd = sc.parallelize(list_user_predictions[0])

In [61]:
#Join name of Anime into a new list
list_complete_user_prediction = user_prediction_rdd.join(anime_title_RDD) \
    .map(lambda x: (x[0], x[1][1], x[1][0])) \
    .takeOrdered(10, key=lambda x: -x[2])

In [71]:
# Change list to dataframe
user_subset_recs_columns = ["anime_id","name", "rating"]
user_subset_recs_DF = spark.createDataFrame(data=list_complete_user_prediction, schema = user_subset_recs_columns)

In [125]:
# Show output
print("Top 10 anime recommended for userID is " + str(user_subset.collect()[0][0]))
print(user_subset_recs_DF.show(10))

Top 10 anime recommended for userID is 215
+--------+--------------------+-----------------+
|anime_id|                name|           rating|
+--------+--------------------+-----------------+
|    7416|              Socket|6.557953357696533|
|   32400|           KochinPa!|6.261995792388916|
|   29978|                 001|6.244216442108154|
|   29995|The Embryo Develo...|6.110090732574463|
|    7485|      Urashima Tarou|6.042207717895508|
|   22059|Kakumeiteki Broad...|5.638498306274414|
|   22615|Kero Kero Keroppi...|5.526596546173096|
|   22445|Hello Kitty no Ya...|5.526596546173096|
|   17985|Kero Kero Keroppi...|5.526596546173096|
|   22607|Ahiru no Pekkle n...|5.526596546173096|
+--------+--------------------+-----------------+

None


#### Persisting the model

In [122]:
model_path = os.path.join('model') + "/als_model"
model.save(model_path)

### Content-base Filtering

In [165]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from sklearn.linear_model import Ridge

In [186]:
#Reading anime file:
animeDF = pd.read_csv(anime_file_path, sep=',', encoding='latin-1')
#Reading rating file:
ratingDF = pd.read_csv(rating_file_path, sep=',', encoding='latin-1')

MemoryError: Unable to allocate 2.00 MiB for an array with shape (262144,) and data type int64

Process nan data

In [None]:
animeDF.dropna(inplace=True)
ratingDF.dropna(inplace=True)

In [None]:
#Get n_users
n_users = ratingDF["user_id"].unique().shape[0]
#Config ratingDF
ratingDF = ratingDF[ratingDF["rating"] > 5]
ratingDF.loc[ratingDF["rating"] == 6, "rating"] = 1
ratingDF.loc[ratingDF["rating"] == 7, "rating"] = 2
ratingDF.loc[ratingDF["rating"] == 8, "rating"] = 3
ratingDF.loc[ratingDF["rating"] == 9, "rating"] = 4
ratingDF.loc[ratingDF["rating"] == 10, "rating"] = 5

In [183]:
ratingDF

Unnamed: 0,user_id,anime_id,rating,gender
47,1,8074,10,5.0
81,1,11617,10,5.0
83,1,11757,10,5.0
101,1,15451,10,5.0
153,2,11771,10,5.0
...,...,...,...,...
7813732,73515,16512,7,2.0
7813733,73515,17187,9,4.0
7813734,73515,22145,10,5.0
7813735,73516,790,9,4.0


TF-IDF

In [168]:
tf = TfidfVectorizer(analyzer=lambda s: (c for i in range(1,2)
                                             for c in combinations(s.split(', '), r=i)))
tfidf_matrix = tf.fit_transform(animeDF['genre'])
tfidf_matrix.shape[1]

43

In [159]:
#tf.get_feature_names()

In [160]:
tdidf_matrix_df = pd.DataFrame(tfidf_matrix.todense(), columns=tf.get_feature_names(), index=animeDF.name)

In [161]:
tdidf_matrix_df

Unnamed: 0_level_0,"(Action,)","(Adventure,)","(Cars,)","(Comedy,)","(Dementia,)","(Demons,)","(Drama,)","(Ecchi,)","(Fantasy,)","(Game,)",...,"(Shounen Ai,)","(Slice of Life,)","(Space,)","(Sports,)","(Super Power,)","(Supernatural,)","(Thriller,)","(Vampire,)","(Yaoi,)","(Yuri,)"
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kimi no Na wa.,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.439008,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.545474,0.00000,0.0,0.0,0.0
Fullmetal Alchemist: Brotherhood,0.294537,0.315806,0.0,0.000000,0.0,0.0,0.334685,0.0,0.31968,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
GintamaÂ°,0.262021,0.000000,0.0,0.208687,0.0,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
Steins;Gate,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.90569,0.0,0.0,0.0
Gintama&#039;,0.262021,0.000000,0.0,0.208687,0.0,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Toushindai My Lover: Minami tai Mecha-Minami,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
Under World,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
Violence Gekiga David no Hoshi,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
Violence Gekiga Shin David no Hoshi: Inma Densetsu,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0


In [None]:
def get_items_rated_by_user(rate_matrix, user_id):
    y = rate_matrix[:,0]
    ids = np.where(y == user_id +1)[0]
    item_ids = rate_matrix[ids, 1] - 1
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

In [None]:
d = tfidf.shape[1] # data dimension
W = np.zeros((d, self.n_users))
b = np.zeros((1, self.n_users))
for n in range(self.n_users):    
    ids, scores = get_items_rated_by_user(self.Y, n)
    clf = Ridge(alpha= self.lamda, fit_intercept  = True)
    Xhat = tfidf[ids, :]

    clf.fit(Xhat, scores) 
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_