In [1]:
import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import udf, col, when
import numpy as np

In [2]:

from IPython.display import Image
from IPython.display import display


spark = ps.sql.SparkSession.builder \
                .master("local") \
                .appName("Book Rec") \
                .getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)


In [3]:
ratings_df = spark.read.csv('Goodbooks-10k/ratings.csv', header=True, inferSchema=True)
ratings_df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [4]:
ratings_df.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



In [5]:
books_df = spark.read.csv('Goodbooks-10k/books.csv', header=True, inferSchema=True)
books_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- best_book_id: integer (nullable = true)
 |-- work_id: integer (nullable = true)
 |-- books_count: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: double (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: double (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- work_ratings_count: string (nullable = true)
 |-- work_text_reviews_count: string (nullable = true)
 |-- ratings_1: double (nullable = true)
 |-- ratings_2: integer (nullable = true)
 |-- ratings_3: integer (nullable = true)
 |-- ratings_4: integer (nullable = true)
 |-- ratings_5: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- small_image_url: string (nullable = true)


In [6]:
books_df.show(2)

+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------+--------------------+
| id|book_id|best_book_id|work_id|books_count|     isbn|          isbn13|             authors|original_publication_year|      original_title|               title|language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|           image_url|     small_image_url|
+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------

In [7]:
training_df, validation_df = ratings_df.randomSplit([.8, .2])

In [8]:
iterations = 10
regularization_parameter = 0.1 #lambda
rank = 4
errors = []
err = 0

In [36]:
for rank in range(4, 7):
    als = ALS(maxIter=iterations, regParam=regularization_parameter,
              rank=rank, userCol="user_id", itemCol="book_id", ratingCol = "rating")
    model = als.fit(training_df)
    predictions = model.transform(validation_df)
    new_predictions = predictions.filter(col('prediction') != np.nan)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(new_predictions)
    print("Root Mean Square Error value = " + str(rmse))


Root Mean Square Error value = 0.8952427791886635
Root Mean Square Error value = 0.89792841095617
Root Mean Square Error value = 0.9047418660370171


In [11]:
als = ALS(maxIter=iterations, regParam=regularization_parameter,
          rank=7, userCol="user_id", itemCol="book_id", ratingCol = "rating")
model = als.fit(training_df)
predictions = model.transform(validation_df)
new_predictions = predictions.filter(col('prediction') != np.nan)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(new_predictions)
print("Root Mean Square Error value = " + str(rmse))


Root Mean Square Error value = 0.9077230458318135


In [12]:
predictions = model.transform(validation_df)
predictions.show( n = 10)

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|    148|   9731|     3| 3.0708532|
|    148|   3922|     3| 3.7186213|
|    148|  12466|     4|  3.371048|
|    148|  22164|     3| 3.2223485|
|    148|   5461|     4| 3.6156464|
|    148|  13407|     4| 3.9823918|
|    148|  10111|     3| 3.5474524|
|    148|  27934|     4| 3.1013389|
|    148|   8510|     3|  3.286783|
|    148|  11239|     2| 2.4927168|
+-------+-------+------+----------+
only showing top 10 rows



In [13]:
predictions.join(books_df, "book_id").select("user_id","title","prediction").show(5)

+-------+----------+----------+
|user_id|     title|prediction|
+-------+----------+----------+
|  19526|Lysistrata|  4.306162|
|   6213|Lysistrata| 3.0298452|
|  41963|Lysistrata| 4.0285654|
|   3721|Lysistrata| 3.8255231|
|  10136|Lysistrata| 3.7900023|
+-------+----------+----------+
only showing top 5 rows



In [14]:
for_one_user = predictions.filter(col("user_id")==35982).join(books_df, "book_id")
.select("user_id","title","image_url", "prediction")
for_one_user.show()

+-------+--------------------+--------------------+----------+
|user_id|               title|           image_url|prediction|
+-------+--------------------+--------------------+----------+
|  35982|Stranger in a Str...|https://images.gr...|  3.769382|
|  35982|The Language Inst...|https://s.gr-asse...| 2.5349014|
|  35982|Harry Potter and ...|https://images.gr...|  4.054604|
|  35982|City of Glass (Th...|https://images.gr...| 4.0813217|
|  35982|The Lord of the R...|https://s.gr-asse...| 3.8985984|
+-------+--------------------+--------------------+----------+



In [15]:
for book in for_one_user.take(10):
    print(book.title)
    display(Image(url=book.image_url))

Stranger in a Strange Land


The Language Instinct: How the Mind Creates Language


Harry Potter and the Goblet of Fire (Harry Potter, #4)


City of Glass (The New York Trilogy, #1)


The Lord of the Rings: Weapons and Warfare


In [16]:
userRecomments = model.recommendForAllUsers(5)
bookRecomments = model.recommendForAllItems(5)


In [17]:
userRecomments.select("user_id","recommendations.book_id").show(10, False)

+-------+------------------------------+
|user_id|book_id                       |
+-------+------------------------------+
|148    |[5207, 6920, 3628, 4483, 7593]|
|463    |[4067, 7767, 6296, 7364, 4881]|
|471    |[9157, 4344, 4963, 7327, 4403]|
|496    |[971, 2080, 9578, 9024, 1109] |
|833    |[4154, 1338, 4868, 6457, 5376]|
|1088   |[2209, 3628, 5880, 1788, 7254]|
|1238   |[8455, 4336, 5880, 2209, 3628]|
|1342   |[7275, 4706, 7789, 6971, 3722]|
|1580   |[4609, 4653, 709, 3628, 8522] |
|1591   |[4154, 8249, 6457, 5376, 4868]|
+-------+------------------------------+
only showing top 10 rows



In [20]:
bookRecomments.select("book_id","recommendations.user_id").show(10, False)

+-------+-----------------------------------+
|book_id|user_id                            |
+-------+-----------------------------------+
|1580   |[39869, 27969, 28956, 11548, 41388]|
|4900   |[52593, 12353, 33923, 11296, 15787]|
|5300   |[45674, 3054, 23124, 15449, 52593] |
|6620   |[10485, 1560, 21212, 27079, 30446] |
|7240   |[48324, 27969, 39869, 41388, 26346]|
|7340   |[30446, 21212, 25191, 39357, 10485]|
|7880   |[33211, 10664, 8753, 33923, 34995] |
|9900   |[52593, 33211, 15449, 17428, 33923]|
|471    |[25191, 21212, 30446, 51193, 28800]|
|1591   |[11548, 10485, 21212, 30446, 39357]|
+-------+-----------------------------------+
only showing top 10 rows



In [22]:
users = ratings_df.select("user_id").distinct().limit(3)
users.show()

+-------+
|user_id|
+-------+
|  32592|
|  19984|
|  35982|
+-------+



In [24]:
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|  32592|[{862, 4.74918}, ...|
|  35982|[{4609, 4.9298873...|
|  19984|[{862, 6.1798744}...|
+-------+--------------------+



In [25]:
userSubsetRecs.select("user_id","recommendations.book_id").show(10, False)

+-------+-----------------------------------------------------------+
|user_id|book_id                                                    |
+-------+-----------------------------------------------------------+
|32592  |[862, 5207, 6590, 5344, 3191, 267, 1342, 7305, 8109, 4483] |
|35982  |[4609, 4653, 3830, 8522, 3628, 2731, 4638, 125, 5146, 7710]|
|19984  |[862, 8109, 5207, 6920, 6590, 3628, 8187, 5580, 7401, 4483]|
+-------+-----------------------------------------------------------+



In [26]:

movies = ratings_df.select("book_id").distinct().limit(3)
movies.show()

+-------+
|book_id|
+-------+
|    148|
|    463|
|    471|
+-------+



In [28]:
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
movieSubSetRecs.select("book_id","recommendations.user_id").show(10, False)

+-------+----------------------------------------------------------------------+
|book_id|user_id                                                               |
+-------+----------------------------------------------------------------------+
|471    |[25191, 21212, 30446, 51193, 28800, 32426, 11548, 50138, 32996, 32967]|
|463    |[25191, 14637, 40439, 12759, 14870, 50062, 38866, 16605, 21576, 44952]|
|148    |[11548, 39357, 25191, 30446, 21212, 38866, 38099, 17128, 10485, 22551]|
+-------+----------------------------------------------------------------------+



In [30]:
book_ids = [860,1524,2914,5297,7397,8802,9506]
user_ids = [4917,4917,4917,4917,4917,4917,4917]
new_user_preds = sqlContext.createDataFrame(zip(book_ids, user_ids),
                                            schema=['book_id', 'user_id'])

In [31]:
new_user_preds.show()

+-------+-------+
|book_id|user_id|
+-------+-------+
|    860|   4917|
|   1524|   4917|
|   2914|   4917|
|   5297|   4917|
|   7397|   4917|
|   8802|   4917|
|   9506|   4917|
+-------+-------+



In [32]:
new_predictions = model.transform(new_user_preds)
new_predictions.show()

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|   2914|   4917| 3.3407302|
|    860|   4917| 3.3595057|
|   7397|   4917| 3.1630487|
|   8802|   4917| 3.3526936|
|   9506|   4917| 3.3080804|
|   1524|   4917| 3.2027965|
|   5297|   4917| 3.3230429|
+-------+-------+----------+

