In [1]:
# Import Dependencies
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.functions import col, explode, when, sum, size
import pandas as pd



# Start the Spark session used by every later cell; getOrCreate() hands back
# an already-running session rather than building a second one.
spark = (
    SparkSession.builder
    .appName("BookRatingPrediction")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/29 00:04:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/29 00:04:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/11/29 00:04:18 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Columns removed right after loading
columns_to_drop = [
    'asin', 'country_code', 'description', 'isbn', 'similar_books',
    'title_without_series', 'is_ebook', 'work_id', 'link', 'image_url',
    'url', 'edition_information', 'kindle_asin', 'language_code',
    'publication_day', 'publication_month', 'publisher', 'title',
]

# Load all JSON chunks from the directory, remove the columns listed above,
# cast the rating/count/page fields to numeric types, and finally discard
# every row that contains a null in any remaining column.
df = (
    spark.read.json("../../Resources/data_chunks")
    .drop(*columns_to_drop)
    .withColumn("average_rating", col("average_rating").cast(FloatType()))
    .withColumn("ratings_count", col("ratings_count").cast(IntegerType()))
    .withColumn("num_pages", col("num_pages").cast(IntegerType()))
    .withColumn("text_reviews_count", col("text_reviews_count").cast(IntegerType()))
    .dropna(how="any")
)

# Preview the first rows of the result
df.show()

                                                                                

+--------------------+--------------+--------+--------------+-------------+---------+--------------------+----------------+-------------+--------+------------------+
|             authors|average_rating| book_id|        format|       isbn13|num_pages|     popular_shelves|publication_year|ratings_count|  series|text_reviews_count|
+--------------------+--------------+--------+--------------+-------------+---------+--------------------+----------------+-------------+--------+------------------+
|[{9759, }, {47445...|           3.8|26228189| Audible Audio|             |       14|[{856, to-read}, ...|            2015|           14|[145513]|                 8|
|[{6588332, }, {65...|          4.12|16171688|     Paperback|9781480227880|      294|[{111, to-read}, ...|            2012|           45|      []|                19|
|       [{9880719, }]|          4.71|24912692|     Paperback|9780615948782|      243|[{254, to-read}, ...|            2014|            7|      []|                 2|
|   

In [3]:
# Print the schema to check the column types after the casts above and to
# see which columns are still strings or nested arrays
df.printSchema()

root
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: float (nullable = true)
 |-- book_id: string (nullable = true)
 |-- format: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- num_pages: integer (nullable = true)
 |-- popular_shelves: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- count: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- publication_year: string (nullable = true)
 |-- ratings_count: integer (nullable = true)
 |-- series: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- text_reviews_count: integer (nullable = true)



In [4]:
# Keep only rows where the string columns named below (country_code, format,
# publication_year, isbn13) are non-null and non-empty, and where num_pages,
# ratings_count, text_reviews_count and average_rating are non-null.
# Columns that are not named in the predicate are not checked.
# NOTE(review): "country_code" is listed in columns_to_drop and does not
# appear in the printed schema, so this predicate refers to a column that df
# no longer exposes. The recorded output shows the cell still ran --
# presumably Spark resolved the name from the plan before the drop; confirm,
# and consider applying this filter before the column is dropped so the cell
# does not depend on that behaviour.
df_cleaned = df.filter(
    (col("country_code").isNotNull() & (col("country_code") != "")) &
    (col("format").isNotNull() & (col("format") != "")) &
    (col("publication_year").isNotNull() & (col("publication_year") != "")) &
    col("num_pages").isNotNull() &
    col("ratings_count").isNotNull() &
    col("text_reviews_count").isNotNull() & col("average_rating").isNotNull()
    & (col("isbn13").isNotNull() & (col("isbn13") != ""))
)

# Preview the filtered rows
df_cleaned.show()

+--------------------+--------------+--------+--------------------+-------------+---------+--------------------+----------------+-------------+--------+------------------+
|             authors|average_rating| book_id|              format|       isbn13|num_pages|     popular_shelves|publication_year|ratings_count|  series|text_reviews_count|
+--------------------+--------------+--------+--------------------+-------------+---------+--------------------+----------------+-------------+--------+------------------+
|[{6588332, }, {65...|          4.12|16171688|           Paperback|9781480227880|      294|[{111, to-read}, ...|            2012|           45|      []|                19|
|       [{9880719, }]|          4.71|24912692|           Paperback|9780615948782|      243|[{254, to-read}, ...|            2014|            7|      []|                 2|
|         [{36714, }]|           3.9|10107215|           Hardcover|9788804410355|      444|[{1043, to-read},...|            1996|           

In [5]:
# Keep a single row per ISBN-13 value; nothing here chooses which of the
# duplicate rows is the one that survives
df = df_cleaned.drop_duplicates(subset=["isbn13"])

In [6]:
# Keep books published strictly after 1900 and strictly before 2024.
# publication_year is a string column (see the printed schema), so it is cast
# to an integer explicitly instead of relying on Spark's implicit
# string-to-number coercion inside the comparison.
df = df.filter(col("publication_year").cast(IntegerType()) > 1900)
df = df.filter(col("publication_year").cast(IntegerType()) < 2024)

# Keep books whose average rating is at least 1
df = df.filter(col("average_rating") >= 1)

# Keep books with at least 10 ratings
df = df.filter(col("ratings_count") >= 10)

# Keep books with at most 1500 pages
df = df.filter(col("num_pages") <= 1500)

# Number of rows that remain after all of the filters above
df.count()

                                                                                

838200

In [7]:
# Location of the gzip-compressed JSON file with per-author statistics.
# NOTE(review): this path is relative to the working directory, whereas the
# other paths in this notebook start with "../../Resources" -- confirm that
# both locations are intended.
file2_path = "Resources/data/goodreads_book_authors.json.gz"

# Load the compressed JSON straight into a Spark DataFrame
author_rating_df = spark.read.json(path=file2_path)

# Preview the first rows
author_rating_df.show()

+---------+--------------+-------------------+-------------+------------------+
|author_id|average_rating|               name|ratings_count|text_reviews_count|
+---------+--------------+-------------------+-------------+------------------+
|   604031|          3.98|   Ronald J. Fields|           49|                 7|
|   626222|          4.08|      Anita Diamant|       546796|             28716|
|    10333|          3.92|     Barbara Hambly|       122118|              5075|
|     9212|          3.68|    Jennifer Weiner|       888522|             36262|
|   149918|          3.82|      Nigel Pennick|         1740|                96|
|  3041852|          3.89|   Alfred J. Church|          947|                85|
|   215594|          4.17| Michael Halberstam|           23|                 6|
|    19158|          4.18|     Rachel Roberts|        13677|               486|
|  5807700|          3.99|         V.L. Locey|         3130|               986|
|  2983296|          3.48|Anton Szandor 

                                                                                

In [8]:
# Remove the author fields that the later join and aggregation do not use
author_rating_df = author_rating_df.drop(
    *["name", "ratings_count", "text_reviews_count"]
)

In [9]:
# Narrow the book DataFrame down to the author list and the ISBN-13 key
df_authors_isbn = df.select(col("authors"), col("isbn13"))

In [10]:
# Produce one row per (book, author) pair: explode the authors array, then
# keep only the ISBN-13 and the author_id field of each author struct.
df_authors_exploded = (
    df_authors_isbn
    .select("isbn13", explode("authors").alias("author"))
    .select("isbn13", col("author.author_id").alias("unique_author_id"))
)

# Preview the exploded rows
df_authors_exploded.show()



+-------------+----------------+
|       isbn13|unique_author_id|
+-------------+----------------+
|0000000067317|            1654|
|0000195118146|          224783|
|0000195118146|          363486|
|0000195397916|         1842587|
|0000262541785|          322903|
|0000340826681|           18174|
|0000415458250|           21233|
|0000785116583|           10297|
|0000785116583|           10294|
|0000814474233|          325296|
|0000815774141|         2020614|
|0000847691225|         1101012|
|0000847691225|          537802|
|0000977316351|         1375008|
|0001442206098|           65398|
|0001442206098|         4623567|
|000155206591x|          871368|
|0001603705341|         2734991|
|0001607061562|         3280622|
|0001843608081|          108007|
+-------------+----------------+
only showing top 20 rows



                                                                                

In [11]:
# Attach each author's rating to the exploded rows. A left join keeps
# book/author pairs even when the author id has no match in author_rating_df.
df_authors_with_ratings = df_authors_exploded.join(
    author_rating_df,
    on=df_authors_exploded["unique_author_id"] == author_rating_df["author_id"],
    how="left",
)
df_authors_with_ratings.show()



+-------------+----------------+---------+--------------+
|       isbn13|unique_author_id|author_id|average_rating|
+-------------+----------------+---------+--------------+
|0000000067317|            1654|     1654|          4.14|
|0000195118146|          224783|   224783|          3.90|
|0000195118146|          363486|   363486|          3.83|
|0000195397916|         1842587|  1842587|          3.37|
|0000262541785|          322903|   322903|          4.06|
|0000340826681|           18174|    18174|          4.10|
|0000415458250|           21233|    21233|          3.51|
|0000785116583|           10297|    10297|          3.81|
|0000785116583|           10294|    10294|          3.83|
|0000814474233|          325296|   325296|          3.62|
|0000815774141|         2020614|  2020614|          3.99|
|0000847691225|         1101012|  1101012|          3.33|
|0000847691225|          537802|   537802|          3.57|
|0000977316351|         1375008|  1375008|          3.60|
|0001442206098

                                                                                

In [12]:
# The author id columns are not needed once the ratings have been attached
df_authors_with_ratings = df_authors_with_ratings.drop(
    "author_id", "unique_author_id"
)

# Average the author ratings per book (one output row per ISBN-13); the
# resulting column is named "avg(average_rating)"
df_authors_agg = (
    df_authors_with_ratings
    .groupBy("isbn13")
    .agg({"average_rating": "avg"})
)
df_authors_agg.show()

                                                                                

+-------------+-------------------+
|       isbn13|avg(average_rating)|
+-------------+-------------------+
|0000000067317|               4.14|
|0000195118146|              3.865|
|0000195397916|               3.37|
|0000262541785|               4.06|
|0000340826681|                4.1|
|0000415458250|               3.51|
|0000785116583| 3.8200000000000003|
|0000814474233|               3.62|
|0000815774141|               3.99|
|0000847691225|               3.45|
|0000977316351|                3.6|
|0001442206098|              3.545|
|000155206591x|               3.77|
|0001603705341|               3.52|
|0001607061562|               4.07|
|0001843608081|               3.93|
|0008520919197|               3.97|
|0008987035145|               4.16|
|0008987086307|               4.35|
|0009770168831|              3.565|
+-------------+-------------------+
only showing top 20 rows



In [13]:
# Bring the per-book author average back onto the main DataFrame, matching
# on ISBN-13; the left join keeps books that have no aggregated author rating
df = df.join(df_authors_agg, on="isbn13", how="left")
df.show()

23/11/29 00:06:01 WARN MemoryStore: Not enough space to cache broadcast_35 in memory! (computed 96.0 MiB so far)
23/11/29 00:06:01 WARN BlockManager: Persisting block broadcast_35 to disk instead.
[Stage 27:>                                                         (0 + 1) / 1]

+-------------+--------------------+--------------+-------+--------------------+---------+--------------------+----------------+-------------+----------------+------------------+-------------------+
|       isbn13|             authors|average_rating|book_id|              format|num_pages|     popular_shelves|publication_year|ratings_count|          series|text_reviews_count|avg(average_rating)|
+-------------+--------------------+--------------+-------+--------------------+---------+--------------------+----------------+-------------+----------------+------------------+-------------------+
|0000814474233|        [{325296, }]|          3.62| 598454|           Hardcover|      220|[{154, to-read}, ...|            2007|           97|              []|                19|               3.62|
|0008520919197|        [{123715, }]|          4.07|3623612|           Paperback|      288|[{2944, mystery},...|            2006|           21|[227086, 356197]|                 2|               3.97|
|0009

                                                                                

In [14]:
# Remove the nested columns that are not part of the exported table
df = df.drop("authors", "popular_shelves", "series")

# Give the aggregated author rating a descriptive column name
df = df.withColumnRenamed("avg(average_rating)", "author_average_rating")

# Preview the final column set
df.show()



+-------------+--------------+--------+---------+---------+----------------+-------------+------------------+---------------------+
|       isbn13|average_rating| book_id|   format|num_pages|publication_year|ratings_count|text_reviews_count|author_average_rating|
+-------------+--------------+--------+---------+---------+----------------+-------------+------------------+---------------------+
|0000000067317|          4.36| 2745937| Hardback|      298|            1994|           27|                 2|                 4.14|
|0000195118146|          3.75|  679038|Paperback|      448|            1997|           44|                 4|                3.865|
|0000195397916|          3.32| 7272274|Hardcover|      208|            2010|           86|                15|                 3.37|
|0000262541785|          4.06|  593335|Paperback|      233|            2004|          138|                11|                 4.06|
|0000340826681|          4.09|  332824|Paperback|      288|            2004|

                                                                                

In [15]:
# Draw a seeded sample (fraction 0.1, without replacement), collect it to the
# driver as a pandas DataFrame, and write it out as CSV without the index.
df_sample = df.sample(withReplacement=False, fraction=0.1, seed=42)
df_sample_pandas = df_sample.toPandas()
df_sample_pandas.to_csv(
    "../../Resources/panda_df/books_sample_trial2.csv",
    index=False,
)

23/11/29 00:08:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/29 00:08:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/29 00:08:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/29 00:08:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/29 00:08:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/29 00:08:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/29 00:08:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/11/29 00:08:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

In [16]:
# Shut down the Spark session now that the sample has been written to disk
spark.stop()