# **Problem STATEMENT**

In this part, our goal is bring recommendation book to user using Spark library. Let's started!

In [None]:
# Install spark
!pip install pyspark

In [2]:
# Preparing dataset
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d zygmunt/goodbooks-10k

Downloading goodbooks-10k.zip to /content
 43% 5.00M/11.6M [00:00<00:00, 50.4MB/s]
100% 11.6M/11.6M [00:00<00:00, 74.0MB/s]


In [3]:
# Unzipping dataset
!unzip "goodbooks-10k.zip"

Archive:  goodbooks-10k.zip
  inflating: book_tags.csv           
  inflating: books.csv               
  inflating: ratings.csv             
  inflating: sample_book.xml         
  inflating: tags.csv                
  inflating: to_read.csv             


In [5]:
# Import useful library
import pyspark as ps
from pyspark.sql import SQLContext, Row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import udf, col, when
import numpy as np


In [6]:
# Create Spark environment
spark = ps.sql.SparkSession.builder \
            .master("local") \
            .appName("Machine Learning Spark") \
            .getOrCreate()

In [7]:
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [8]:
# Read dataset
sdf_rating = spark.read.csv('ratings.csv', 
                            header=True, 
                            inferSchema=True)
sdf_rating.show(3, False)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|1      |314    |5     |
|1      |439    |3     |
|1      |588    |5     |
+-------+-------+------+
only showing top 3 rows



In [9]:
sdf_books = spark.read.csv('books.csv',
                           header=True,
                           inferSchema=True)
sdf_books.show(3, False)

+---+-------+------------+-------+-----------+---------+----------------+---------------------------+-------------------------+----------------------------------------+--------------------------------------------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+----------------------------------------------------------+----------------------------------------------------------+
|id |book_id|best_book_id|work_id|books_count|isbn     |isbn13          |authors                    |original_publication_year|original_title                          |title                                                   |language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|image_url                                                 |small_image_url                                           |
+---+-------+------------+-------+------

# **Modelling Phase**

In [10]:
# Split dataset
sdf_train, sdf_test = sdf_rating.randomSplit([.8, .2])

In [12]:
# Build ALS model
iterations = 10
regularization_parameter = 0.1

# rank 4
als = ALS(maxIter=iterations,
          regParam=regularization_parameter,
          rank = 4, userCol="user_id",
          itemCol="book_id", ratingCol="rating")
model = als.fit(sdf_train)
pred = model.transform(sdf_test).filter(col('prediction') != np.nan)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(pred)
print("Root-mean-square-error = " + str(rmse))

Root-mean-square-error = 0.8979673369961977


In [15]:
# rank 5
als = ALS(maxIter=iterations,
          regParam=regularization_parameter,
          rank = 5, userCol="user_id",
          itemCol="book_id", ratingCol="rating")
model = als.fit(sdf_train)
pred = model.transform(sdf_test).filter(col('prediction') != np.nan)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(pred)
print("Root-mean-square-error = " + str(rmse))

Root-mean-square-error = 0.8997429357828679


In [14]:
# rank 4-10
for rank in range(4,10):
  als = ALS(maxIter=iterations,
          regParam=regularization_parameter,
          rank = rank, userCol="user_id",
          itemCol="book_id", ratingCol="rating")
  model = als.fit(sdf_train)
  pred = model.transform(sdf_test).filter(col('prediction') != np.nan)
  evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
  rmse = evaluator.evaluate(pred)
  print("Rank :", rank ,"Root-mean-square-error = " + str(rmse))

Rank : 4 Root-mean-square-error = 0.8979673369961977
Rank : 5 Root-mean-square-error = 0.8997429357828679
Rank : 6 Root-mean-square-error = 0.8999864445486688
Rank : 7 Root-mean-square-error = 0.9033660150167959
Rank : 8 Root-mean-square-error = 0.9069299511944662
Rank : 9 Root-mean-square-error = 0.9055029841724308


In [23]:
als = ALS(maxIter=iterations, regParam=regularization_parameter,
          rank=rank, userCol="user_id", itemCol="book_id", 
          ratingCol="rating")
paramGrid = ParamGridBuilder() \
    .addGrid(als.regParam, [0.1, 0.01, 0.18]) \
    .addGrid(als.rank, range(4, 6)) \
    .build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
crossval = CrossValidator(estimator=als,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)
cvModel = crossval.fit(sdf_train)

In [24]:
# the best algoritma
cvModel_pred = cvModel.transform(sdf_test).filter(col('prediction') != np.nan)
rmse = evaluator.evaluate(cvModel_pred)
print("The optimal RMSE with cross validation is: {}".format(rmse))

The optimal RMSE with cross validation is: 0.8979673369961977


In [25]:
final_als = ALS(maxIter=10, regParam=0.1, rank=4, userCol="user_id", itemCol="book_id", ratingCol="rating")
final_model = final_als.fit(sdf_train)

In [26]:
preds = final_model.transform(sdf_test)
preds.show(5)

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|    148|  28767|     3| 2.8824027|
|    148|  24326|     5|  3.294993|
|    148|  13879|     3| 3.4587102|
|    148|  20967|     3| 3.9145834|
|    148|   8440|     3| 3.1117263|
+-------+-------+------+----------+
only showing top 5 rows



In [27]:
preds.join(sdf_books, "book_id").select("user_id","title","prediction").show(5)

+-------+----------+----------+
|user_id|     title|prediction|
+-------+----------+----------+
|  19526|Lysistrata|  4.499846|
|  41963|Lysistrata| 4.2382565|
|  38734|Lysistrata| 4.4324064|
|  21724|Lysistrata|  3.917287|
|  27512|Lysistrata|  3.522049|
+-------+----------+----------+
only showing top 5 rows



In [30]:
exp_recom = preds.filter(col("user_id")==35982).join(sdf_books, "book_id").select("user_id","title","image_url","prediction")
exp_recom.show()

+-------+--------------------+--------------------+----------+
|user_id|               title|           image_url|prediction|
+-------+--------------------+--------------------+----------+
|  35982|The Lost Continen...|https://images.gr...| 2.6817763|
|  35982|Another Bullshit ...|https://s.gr-asse...|  3.767463|
|  35982|Harry Potter and ...|https://images.gr...| 3.9121242|
|  35982|City of Glass (Th...|https://images.gr...| 3.8655407|
|  35982|The Iliad/The Ody...|https://s.gr-asse...| 3.3226616|
|  35982|           The Lover|https://images.gr...| 3.9339528|
+-------+--------------------+--------------------+----------+



In [32]:
# Generate top 5 books recommendations for each user
userRecomments = final_model.recommendForAllUsers(5)
# Generate top 5 user recommendations for each book
bookRecomments = final_model.recommendForAllItems(5)

In [33]:
userRecomments.select("user_id","recommendations.book_id").show(10, False)

+-------+------------------------------+
|user_id|book_id                       |
+-------+------------------------------+
|148    |[4868, 5207, 9076, 9842, 4483]|
|463    |[1338, 2, 4154, 6457, 6902]   |
|471    |[1338, 4154, 3239, 5753, 7550]|
|496    |[9842, 7947, 8854, 2236, 9182]|
|833    |[6590, 4154, 8286, 3498, 3248]|
|1088   |[4653, 3124, 4868, 3628, 6018]|
|1238   |[8326, 1577, 6990, 9946, 7988]|
|1342   |[4868, 8362, 3628, 2590, 4]   |
|1580   |[4868, 3628, 6590, 4653, 5207]|
|1591   |[6924, 8233, 4509, 9460, 3953]|
+-------+------------------------------+
only showing top 10 rows



In [34]:
bookRecomments.select("book_id","recommendations.user_id").show(10, False)

+-------+-----------------------------------+
|book_id|user_id                            |
+-------+-----------------------------------+
|1580   |[30776, 28818, 3655, 25796, 44978] |
|4900   |[38076, 43980, 22822, 33923, 50580]|
|5300   |[29218, 15449, 29748, 38076, 44225]|
|6620   |[38076, 46213, 28818, 26717, 34995]|
|7240   |[38076, 30816, 12621, 22822, 28953]|
|7340   |[38076, 25796, 28818, 44978, 3655] |
|7880   |[38076, 22822, 15449, 30757, 47145]|
|9900   |[38076, 11963, 12621, 3655, 38404] |
|471    |[38076, 43980, 50307, 22822, 19752]|
|1591   |[30776, 25796, 44978, 28818, 3655] |
+-------+-----------------------------------+
only showing top 10 rows



In [35]:
# Generate top 10 Book recommendations for a specified set of users
users = sdf_rating.select("user_id").distinct().limit(3);
users.show()

+-------+
|user_id|
+-------+
|  32592|
|  19984|
|  35982|
+-------+



In [36]:
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.select("user_id","recommendations.book_id").show(10, False)

+-------+-----------------------------------------------------------+
|user_id|book_id                                                    |
+-------+-----------------------------------------------------------+
|32592  |[5207, 4868, 562, 862, 6920, 1010, 3628, 3753, 3275, 5730] |
|35982  |[8109, 3124, 5202, 5207, 7283, 3628, 983, 1895, 7264, 7593]|
|19984  |[9076, 4868, 6902, 6920, 422, 5207, 862, 6590, 562, 9566]  |
+-------+-----------------------------------------------------------+



In [37]:
# Generate top 10 user recommendations for a specified set of books
book_id = sdf_rating.select("book_id").distinct().limit(3)
book_id.show()

+-------+
|book_id|
+-------+
|    148|
|    463|
|    471|
+-------+



In [39]:
bookSubSetRecs = model.recommendForItemSubset(book_id, 10)
book_ids = [860,1524,2885,2914,5297,7397,8802,9506]
user_ids = [4917,4917,4917,4917,4917,4917,4917,4917]
new_user_preds = sqlContext.createDataFrame(zip(book_ids, user_ids), schema=['book_id','user_id'])
new_user_preds.show()

+-------+-------+
|book_id|user_id|
+-------+-------+
|    860|   4917|
|   1524|   4917|
|   2885|   4917|
|   2914|   4917|
|   5297|   4917|
|   7397|   4917|
|   8802|   4917|
|   9506|   4917|
+-------+-------+



In [40]:
new_predictions = model.transform(new_user_preds)
new_predictions.show()

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|   2914|   4917| 3.4217443|
|    860|   4917|  3.525492|
|   2885|   4917| 3.4023051|
|   7397|   4917| 3.5740724|
|   8802|   4917|  3.468423|
|   9506|   4917| 3.5452433|
|   1524|   4917| 3.5558898|
|   5297|   4917|  3.639408|
+-------+-------+----------+

