In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.prediction_algorithms import knns
from surprise import accuracy
from surprise.prediction_algorithms import SVD
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import tensorflow as tf


In [2]:
spark = SparkSession\
        .builder\
        .appName('ALSExample').config('spark.driver.host', 'localhost')\
        .getOrCreate()

This part is getting the Goodreads data from json files into csv formats with only the important columns and rows. It takes around 30 minutes therefore I saved the csv files to data folder, project starts from there. The json files are more than 10GB therefore not included in the repo. IF needed they can be found at:https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home

# From json files into csv files

In [2]:
df_books1 = pd.read_json('goodreads_books_1.json', lines=True)
df_books2 = pd.read_json('goodreads_books_2.json', lines=True)
df_books3 = pd.read_json('goodreads_books_3.json', lines=True)
df_books4 = pd.read_json('goodreads_books_4.json', lines=True)
df_books5 = pd.read_json('goodreads_books_5.json', lines=True)
df_books6 = pd.read_json('goodreads_books_6.json', lines=True)
df_books7 = pd.read_json('goodreads_books_7.json', lines=True)

In [3]:
df_books1 = df_books1[['book_id','title','url','image_url']]
df_books2 = df_books2[['book_id','title','url','image_url']]
df_books3 = df_books3[['book_id','title','url','image_url']]
df_books4 = df_books4[['book_id','title','url','image_url']]
df_books5 = df_books5[['book_id','title','url','image_url']]
df_books6 = df_books6[['book_id','title','url','image_url']]
df_books7 = df_books7[['book_id','title','url','image_url']]

In [4]:
df_books = pd.concat([df_books1, df_books2, df_books3, df_books4, df_books5, df_books6, df_books7], ignore_index=True)

In [5]:
df_review = pd.read_json('goodreads_reviews_spoiler_raw.json', lines=True)

In [6]:
reviews_df = df_review[['user_id','book_id','rating']]

In [7]:
reviewed_books = set(reviews_df.book_id)
len(reviewed_books)

25475

In [8]:
reviewed_books = list(set(reviews_df.book_id))
len(reviewed_books)

25475

In [9]:
books = list(set(df_books.book_id))
len(books)

2360655

### This means all reviewed books' information is available in df_books

In [15]:
intersection = list(set(reviewed_books) & set(books))
set(intersection) == set(reviewed_books)

True

### We do not need all the books only those we have reviews about

In [19]:
df_books = df_books[df_books['book_id'].isin(intersection)]

In [42]:
users = list(set(reviews_df.user_id))
users_new = np.arange(len(users))
user_dict = dict(zip(users, users_new))
reviews_df['user_id'] = reviews_df['user_id'].map(user_dict)


In [8]:
genres_df = pd.read_json('data_book/genres.json', lines=True)

In [10]:
genres_df['genres_list'] = genres_df['genres'].map(lambda x: x.values())
genres_df.head()

Unnamed: 0,book_id,genres,genres_list
0,5333265,"{'history, historical fiction, biography': 1}",(1)
1,1333909,"{'fiction': 219, 'history, historical fiction,...","(219, 5)"
2,7327624,"{'fantasy, paranormal': 31, 'fiction': 8, 'mys...","(31, 8, 1, 1)"
3,6066819,"{'fiction': 555, 'romance': 23, 'mystery, thri...","(555, 23, 10)"
4,287140,{'non-fiction': 3},(3)


In [12]:
genres_df.genres[2]

{'fantasy, paranormal': 31,
 'fiction': 8,
 'mystery, thriller, crime': 1,
 'poetry': 1}

In [44]:
reviews_df.to_csv('data_book/reviews.csv', index=False)
df_books.to_csv('data_book/books.csv', index=False)

# Project Start

In [3]:
reviews_df = pd.read_csv('data_book/reviews.csv')
books_df = pd.read_csv('data_book/books.csv')

In [22]:
reviews_df.head()

Unnamed: 0,user_id,book_id,rating
0,13974,18245960,5
1,13974,16981,3
2,13974,28684704,3
3,13974,27161156,0
4,13974,25884323,4


In [23]:
books_df.head()

Unnamed: 0,book_id,title,url,image_url
0,22642971,The Body Electric,https://www.goodreads.com/book/show/22642971-t...,https://images.gr-assets.com/books/1406979059m...
1,32336119,Worth the Wait (Guthrie Brothers #2),https://www.goodreads.com/book/show/32336119-w...,https://images.gr-assets.com/books/1481574438m...
2,2741853,Slow Hands,https://www.goodreads.com/book/show/2741853-sl...,https://s.gr-assets.com/assets/nophoto/book/11...
3,12077902,Solaris: The Definitive Edition,https://www.goodreads.com/book/show/12077902-s...,https://images.gr-assets.com/books/1397505604m...
4,7843586,"More (More, #1)",https://www.goodreads.com/book/show/7843586-more,https://s.gr-assets.com/assets/nophoto/book/11...


In [30]:
reviews_tfds = tf.data.Dataset.from_tensor_slices(dict(reviews_df)).map(lambda x: {
    "user_id": str(x["user_id"]),
    "book_id": str(x["book_id"]),
    "rating": float(x["rating"])
})
reviews_tfds

<MapDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'book_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'rating': TensorSpec(shape=(), dtype=tf.float32, name=None)}>

In [None]:
from datasets import load_dataset


In [41]:
dataset = load_dataset("tweet_eval", "emotion")


100%|██████████| 3/3 [00:00<00:00, 433.21it/s]


In [45]:
dataset.to_list()

AttributeError: 'DatasetDict' object has no attribute 'to_list'

In [6]:
books_df.head()

Unnamed: 0,book_id,title,url,image_url
0,22642971,The Body Electric,https://www.goodreads.com/book/show/22642971-t...,https://images.gr-assets.com/books/1406979059m...
1,32336119,Worth the Wait (Guthrie Brothers #2),https://www.goodreads.com/book/show/32336119-w...,https://images.gr-assets.com/books/1481574438m...
2,2741853,Slow Hands,https://www.goodreads.com/book/show/2741853-sl...,https://s.gr-assets.com/assets/nophoto/book/11...
3,12077902,Solaris: The Definitive Edition,https://www.goodreads.com/book/show/12077902-s...,https://images.gr-assets.com/books/1397505604m...
4,7843586,"More (More, #1)",https://www.goodreads.com/book/show/7843586-more,https://s.gr-assets.com/assets/nophoto/book/11...


In [3]:
reviews = spark.read.csv('./data_book/reviews.csv', header='true', inferSchema='true')
books = spark.read.csv('./data_book/books.csv', header='true', inferSchema='true')

In [5]:
ratings = tfds.load("movielens/100k-ratings", split="train")


[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /Users/ferityikar/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.36 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.36 url/s]
[A
Dl Completed...: 100%|██████████| 1/1 [00:01<00:00,  1.36 url/s]
Extraction completed...: 100%|██████████| 1/1 [00:01<00:00,  1.05s/ file]
Dl Size...: 100%|██████████| 4/4 [00:01<00:00,  3.78 MiB/s]
Dl Completed...: 100%|██████████| 1/1 [00:01<00:00,  1.07s/ url]


[1mDataset movielens downloaded and prepared to /Users/ferityikar/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m


In [30]:
r = pd.read_csv('./data_book/reviews.csv')

In [31]:
r

Unnamed: 0,user_id,book_id,rating
0,8842281e1d1347389f2ab93d60773d4d,18245960,5
1,8842281e1d1347389f2ab93d60773d4d,16981,3
2,8842281e1d1347389f2ab93d60773d4d,28684704,3
3,8842281e1d1347389f2ab93d60773d4d,27161156,0
4,8842281e1d1347389f2ab93d60773d4d,25884323,4
...,...,...,...
1378028,35cef391b171b4fca45771e508028212,15745950,0
1378029,35cef391b171b4fca45771e508028212,10861195,0
1378030,35cef391b171b4fca45771e508028212,6131164,4
1378031,35cef391b171b4fca45771e508028212,10025305,4


In [32]:
reviews

DataFrame[user_id: string, book_id: int, rating: int]

In [33]:
(train, test) = reviews.randomSplit([0.7, 0.3])

## Building ALS Model

### Cross Validation to get best parameters

In [34]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5,rank=4, regParam=0.01, userCol='user_id', itemCol='book_id', ratingCol='rating',
          coldStartStrategy='drop')

# fit the ALS model to the training set
model = als.fit(train)

IllegalArgumentException: requirement failed: Column user_id must be of type numeric but was actually of type string.

In [23]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('Root-mean-square error = ' + str(rmse))

IllegalArgumentException: requirement failed: Nothing has been added to this summarizer.

In [5]:
# Model
model_als = ALS(
        userCol="user_id", 
        itemCol="book_id",
        ratingCol="rating", 
        nonnegative = True, 
        implicitPrefs = False,
        coldStartStrategy="drop"
)
# Parameters
params = ParamGridBuilder()\
          .addGrid(model_als.regParam, [0.001, 0.01, 0.1])\
          .addGrid(model_als.rank, [4, 10, 50, 100])\
          .build()

# Evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                predictionCol='prediction')

# Cross Validator
cv = CrossValidator(estimator=model_als, estimatorParamMaps=params,\
                 evaluator=evaluator, parallelism=4)

best_model = cv.fit(train)

print("Best Model")
# Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())
# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

IllegalArgumentException: requirement failed: Nothing has been added to this summarizer.

In [65]:
a = books[books['book_id'] == 29056083].url.values
books[books['book_id'] == 29056083]

Unnamed: 0,book_id,title,url,image_url
17418,29056083,Harry Potter and the Cursed Child - Parts One ...,https://www.goodreads.com/book/show/29056083-h...,https://images.gr-assets.com/books/1470082995m...


In [27]:
reviews_tf = tf.data.Dataset.from_tensor_slices(dict(reviews))

In [28]:
reviews_tf

<TensorSliceDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'book_id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'rating': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

## Surprise

In [20]:
reader = Reader(line_format=u'user item rating', sep=',', rating_scale=(1, 6), skip_lines=1)
reviews_data = Dataset.load_from_file("data_book/reviews.csv", reader=reader)


In [21]:
train, test = train_test_split(reviews_data, test_size=.2)

In [22]:
print('Number of users: ', train.n_users, '\n')
print('Number of items: ', train.n_items, '\n')

Number of users:  1102426 

Number of items:  25475 



In [24]:
sim_cos = {'name':'cosine', 'user_based':False} # false because a lot more users than items

In [35]:
basic = KNNBasic(sim_options=sim_cos)
basic.fit(train)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fbd43bbd520>

In [36]:
predictions = basic.test(test)

In [37]:
print(accuracy.rmse(predictions))

RMSE: 1.2516
1.2515671427556392


In [40]:
sim_pearson = {'name':'pearson', 'user_based':False}
basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
basic_pearson.fit(train)
predictions = basic_pearson.test(test)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.2516
1.2515671427556392


In [41]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(train)
predictions = knn_means.test(test)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.2516
1.2515671427556392


In [42]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
knn_baseline.fit(train)
predictions = knn_baseline.test(test)
print(accuracy.rmse(predictions))

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.1904
1.1903534539771883


In [45]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(train)
predictions = svd.test(test)
print(accuracy.rmse(predictions))

RMSE: 1.1950
1.1950101593550553


## Tensorflow

In [50]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [51]:
reviews = pd.read_csv('./data_book/reviews.csv')
books = pd.read_csv('./data_book/books.csv')

In [52]:
tf.data.Dataset.from_tensor_slices(dict(dict(reviews)))



<TensorSliceDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'book_id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'rating': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [57]:
books.url[0]

'https://www.goodreads.com/book/show/22642971-the-body-electric'

In [53]:
tf.data.Dataset.from_tensor_slices(dict(dict(books)))


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).